001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.language; 031 032import gnu.trove.map.hash.TIntDoubleHashMap; 033import gnu.trove.map.hash.TIntIntHashMap; 034import gnu.trove.procedure.TIntIntProcedure; 035 036import java.io.IOException; 037import java.io.InputStream; 038import java.io.InputStreamReader; 039import java.io.UnsupportedEncodingException; 040import java.util.HashMap; 041import java.util.Locale; 042import java.util.Map; 043import java.util.zip.GZIPInputStream; 044 045import no.uib.cipr.matrix.DenseMatrix; 046 047import org.openimaj.citation.annotation.Reference; 048import org.openimaj.citation.annotation.ReferenceType; 049import org.openimaj.io.IOUtils; 050 051import com.google.gson.Gson; 052import com.google.gson.GsonBuilder; 053 054/** 055 * Short text language detection ported from langid: 056 * https://github.com/saffsd/langid.py 057 * 058 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 059 * 060 */ 061@Reference( 062 type = ReferenceType.Article, 063 author = { "Lui, Marco", "Baldwin, Timothy" }, 064 title = "Cross-domain Feature Selection for Language Identification", 065 year = "2011", 066 booktitle = "in Proceedings of 5th International Joint Conference on Natural Language Processing") 067public class LanguageDetector { 068 069 private static Gson gson; 070 071 static { 072 gson = new GsonBuilder(). 073 serializeNulls(). 074 create(); 075 } 076 077 /** 078 * default location of the compressed json version language model 079 */ 080 public static final String LANGUAGE_MODEL_JSON = "/org/openimaj/text/language/language.model.json.gz"; 081 /** 082 * default location of the compressed binary version of the language model 083 */ 084 public static final String LANGUAGE_MODEL_BINARY = "/org/openimaj/text/language/language.model.binary.gz"; 085 086 private LanguageModel languageModel; 087 088 /** 089 * Load a language model from {@value #LANGUAGE_MODEL_BINARY} 090 * 091 * @throws IOException 092 */ 093 094 public LanguageDetector() throws IOException { 095 this(false); 096 } 097 098 @SuppressWarnings("unchecked") 099 private void loadFromJSON() throws IOException { 100 Map<String, Object> languageModelRaw; 101 final InputStream is = new GZIPInputStream(LanguageDetector.class.getResourceAsStream(LANGUAGE_MODEL_JSON)); 102 languageModelRaw = gson.fromJson(new InputStreamReader(is), Map.class); 103 languageModel = new LanguageModel(languageModelRaw); 104 } 105 106 private void loadFromBinary() throws IOException { 107 this.languageModel = IOUtils.read( 108 new GZIPInputStream(LanguageDetector.class.getResourceAsStream(LANGUAGE_MODEL_BINARY)), 109 LanguageModel.class 110 ); 111 } 112 113 /** 114 * Create a language detector with a provided language model 115 * 116 * @param model 117 */ 118 public LanguageDetector(LanguageModel model) { 119 this.languageModel = model; 120 } 121 122 LanguageDetector(boolean fromJSON) throws IOException { 123 if (fromJSON) { 124 loadFromJSON(); 125 } 126 else { 127 loadFromBinary(); 128 } 129 } 130 131 /** 132 * 133 * A langauge with an associated confidence 134 * 135 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 136 * 137 */ 138 public static class WeightedLocale { 139 /** 140 * Default constructor 141 * 142 * @param language 143 * @param best 144 */ 145 public WeightedLocale(String language, double best) { 146 this.language = language; 147 this.confidence = best; 148 } 149 150 @Override 151 public String toString() { 152 return String.format("%s: %f", this.language.toString(), this.confidence); 153 } 154 155 /** 156 * @return the locale based on the language 157 */ 158 public Locale getLocale() { 159 return new Locale(language); 160 } 161 162 /** 163 * @return this weighted locale as a map 164 */ 165 public Map<String, Object> asMap() { 166 final Map<String, Object> map = new HashMap<String, Object>(); 167 map.put("language", language); 168 map.put("confidence", confidence); 169 return map; 170 } 171 172 /** 173 * @param map 174 * @return Construct a weighted locale from a map 175 */ 176 public static WeightedLocale fromMap(Map<String, Object> map) { 177 return new WeightedLocale((String) map.get("language"), (Double) map.get("confidence")); 178 } 179 180 /** 181 * Estimated language 182 */ 183 public String language; 184 185 /** 186 * Naive bayesian probability 187 */ 188 public double confidence; 189 } 190 191 /** 192 * Classify the language using a naive-bayes model 193 * 194 * @param text 195 * @return the detected language 196 */ 197 public WeightedLocale classify(String text) { 198 final DenseMatrix fv = tokenize(text); 199 final WeightedLocale locale = naiveBayesClassify(fv); 200 return locale; 201 } 202 203 DenseMatrix nbWorkspace = null; 204 205 private WeightedLocale naiveBayesClassify(DenseMatrix fv) { 206 if (nbWorkspace == null) { 207 nbWorkspace = new DenseMatrix(1, this.languageModel.naiveBayesPTC.numColumns()); 208 } 209 final double logFVSum = sumLogFactorial(fv); 210 fv.mult(this.languageModel.naiveBayesPTC, nbWorkspace);// times(this.languageModel.naiveBayesPTC); 211 final DenseMatrix pdc = nbWorkspace; 212 // multiplied.print(5, 5); 213 // this.languageModel.naiveBayesPTC.print(5, 5); 214 pdc.add(this.languageModel.naiveBayesPC); 215 final double[] pdData = pdc.getData(); 216 int bestIndex = -1; 217 double best = 0; 218 double sum = 0; 219 for (int i = 0; i < pdc.numColumns(); i++) { 220 final double correctedScore = pdData[i] - logFVSum; 221 // System.out.format("%s scores %f \n",this.languageModel.naiveBayesClasses[i],correctedScore); 222 sum += correctedScore; 223 if (bestIndex == -1 || correctedScore > best) 224 { 225 bestIndex = i; 226 best = correctedScore; 227 } 228 } 229 230 return new WeightedLocale(this.languageModel.naiveBayesClasses[bestIndex], best / sum); 231 } 232 233 // an element wise log-factorial 234 TIntDoubleHashMap logFacCache = new TIntDoubleHashMap(); 235 236 private double sumLogFactorial(DenseMatrix fv) { 237 double sum = 0; 238 final double[] data = fv.getData(); 239 for (int i = 0; i < fv.numColumns(); i++) { 240 final int fvi = (int) data[i]; 241 if (logFacCache.contains(fvi)) 242 { 243 sum += logFacCache.get(fvi); 244 } 245 else { 246 for (int j = 1; j < fvi + 1; j++) { 247 sum += Math.log(j); 248 } 249 } 250 } 251 return sum; 252 } 253 254 private DenseMatrix tokenize(String text) { 255 byte[] ords = null; 256 try { 257 ords = text.getBytes("UTF-8"); 258 } catch (final UnsupportedEncodingException e) { 259 } 260 int state = 0; 261 final TIntIntHashMap statecount = new TIntIntHashMap(); 262 for (final byte letter : ords) { 263 state = this.languageModel.tk_nextmove[(state << 8) + (letter & 0xff)]; 264 statecount.adjustOrPutValue(state, 1, 1); 265 } 266 final double[][] fv = new double[1][this.languageModel.naiveBayesNFeats]; 267 statecount.forEachEntry(new TIntIntProcedure() { 268 @Override 269 public boolean execute(int state, final int statecount) { 270 final int[] indexes = LanguageDetector.this.languageModel.tk_output.get(state); 271 if (indexes == null) 272 return true; 273 for (final int i : indexes) { 274 275 fv[0][i] += statecount; 276 } 277 return true; 278 } 279 }); 280 return new DenseMatrix(fv); 281 } 282 283 /** 284 * @return the underlying {@link LanguageModel} 285 */ 286 public LanguageModel getLanguageModel() { 287 return this.languageModel; 288 } 289 290 /** 291 * prints available languages 292 * 293 * @param args 294 * @throws IOException 295 */ 296 public static void main(String[] args) throws IOException { 297 final LanguageDetector lm = new LanguageDetector(); 298 System.out.println("Available languages: "); 299 for (final String string : lm.languageModel.naiveBayesClasses) { 300 System.out.println(string + ": " + new Locale(string).getDisplayLanguage()); 301 } 302 } 303}