001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.language;
031
032import gnu.trove.map.hash.TIntDoubleHashMap;
033import gnu.trove.map.hash.TIntIntHashMap;
034import gnu.trove.procedure.TIntIntProcedure;
035
036import java.io.IOException;
037import java.io.InputStream;
038import java.io.InputStreamReader;
039import java.io.UnsupportedEncodingException;
040import java.util.HashMap;
041import java.util.Locale;
042import java.util.Map;
043import java.util.zip.GZIPInputStream;
044
045import no.uib.cipr.matrix.DenseMatrix;
046
047import org.openimaj.citation.annotation.Reference;
048import org.openimaj.citation.annotation.ReferenceType;
049import org.openimaj.io.IOUtils;
050
051import com.google.gson.Gson;
052import com.google.gson.GsonBuilder;
053
054/**
055 * Short text language detection ported from langid:
056 * https://github.com/saffsd/langid.py
057 * 
058 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
059 * 
060 */
061@Reference(
062                type = ReferenceType.Article,
063                author = { "Lui, Marco", "Baldwin, Timothy" },
064                title = "Cross-domain Feature Selection for Language Identification",
065                year = "2011",
066                booktitle = "in Proceedings of 5th International Joint Conference on Natural Language Processing")
067public class LanguageDetector {
068
069        private static Gson gson;
070
071        static {
072                gson = new GsonBuilder().
073                                serializeNulls().
074                                create();
075        }
076
077        /**
078         * default location of the compressed json version language model
079         */
080        public static final String LANGUAGE_MODEL_JSON = "/org/openimaj/text/language/language.model.json.gz";
081        /**
082         * default location of the compressed binary version of the language model
083         */
084        public static final String LANGUAGE_MODEL_BINARY = "/org/openimaj/text/language/language.model.binary.gz";
085
086        private LanguageModel languageModel;
087
088        /**
089         * Load a language model from {@value #LANGUAGE_MODEL_BINARY}
090         * 
091         * @throws IOException
092         */
093
094        public LanguageDetector() throws IOException {
095                this(false);
096        }
097
098        @SuppressWarnings("unchecked")
099        private void loadFromJSON() throws IOException {
100                Map<String, Object> languageModelRaw;
101                final InputStream is = new GZIPInputStream(LanguageDetector.class.getResourceAsStream(LANGUAGE_MODEL_JSON));
102                languageModelRaw = gson.fromJson(new InputStreamReader(is), Map.class);
103                languageModel = new LanguageModel(languageModelRaw);
104        }
105
106        private void loadFromBinary() throws IOException {
107                this.languageModel = IOUtils.read(
108                                new GZIPInputStream(LanguageDetector.class.getResourceAsStream(LANGUAGE_MODEL_BINARY)),
109                                LanguageModel.class
110                                );
111        }
112
113        /**
114         * Create a language detector with a provided language model
115         * 
116         * @param model
117         */
118        public LanguageDetector(LanguageModel model) {
119                this.languageModel = model;
120        }
121
122        LanguageDetector(boolean fromJSON) throws IOException {
123                if (fromJSON) {
124                        loadFromJSON();
125                }
126                else {
127                        loadFromBinary();
128                }
129        }
130
131        /**
132         * 
133         * A langauge with an associated confidence
134         * 
135         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
136         * 
137         */
138        public static class WeightedLocale {
139                /**
140                 * Default constructor
141                 * 
142                 * @param language
143                 * @param best
144                 */
145                public WeightedLocale(String language, double best) {
146                        this.language = language;
147                        this.confidence = best;
148                }
149
150                @Override
151                public String toString() {
152                        return String.format("%s: %f", this.language.toString(), this.confidence);
153                }
154
155                /**
156                 * @return the locale based on the language
157                 */
158                public Locale getLocale() {
159                        return new Locale(language);
160                }
161
162                /**
163                 * @return this weighted locale as a map
164                 */
165                public Map<String, Object> asMap() {
166                        final Map<String, Object> map = new HashMap<String, Object>();
167                        map.put("language", language);
168                        map.put("confidence", confidence);
169                        return map;
170                }
171
172                /**
173                 * @param map
174                 * @return Construct a weighted locale from a map
175                 */
176                public static WeightedLocale fromMap(Map<String, Object> map) {
177                        return new WeightedLocale((String) map.get("language"), (Double) map.get("confidence"));
178                }
179
180                /**
181                 * Estimated language
182                 */
183                public String language;
184
185                /**
186                 * Naive bayesian probability
187                 */
188                public double confidence;
189        }
190
191        /**
192         * Classify the language using a naive-bayes model
193         * 
194         * @param text
195         * @return the detected language
196         */
197        public WeightedLocale classify(String text) {
198                final DenseMatrix fv = tokenize(text);
199                final WeightedLocale locale = naiveBayesClassify(fv);
200                return locale;
201        }
202
203        DenseMatrix nbWorkspace = null;
204
205        private WeightedLocale naiveBayesClassify(DenseMatrix fv) {
206                if (nbWorkspace == null) {
207                        nbWorkspace = new DenseMatrix(1, this.languageModel.naiveBayesPTC.numColumns());
208                }
209                final double logFVSum = sumLogFactorial(fv);
210                fv.mult(this.languageModel.naiveBayesPTC, nbWorkspace);// times(this.languageModel.naiveBayesPTC);
211                final DenseMatrix pdc = nbWorkspace;
212                // multiplied.print(5, 5);
213                // this.languageModel.naiveBayesPTC.print(5, 5);
214                pdc.add(this.languageModel.naiveBayesPC);
215                final double[] pdData = pdc.getData();
216                int bestIndex = -1;
217                double best = 0;
218                double sum = 0;
219                for (int i = 0; i < pdc.numColumns(); i++) {
220                        final double correctedScore = pdData[i] - logFVSum;
221                        // System.out.format("%s scores %f \n",this.languageModel.naiveBayesClasses[i],correctedScore);
222                        sum += correctedScore;
223                        if (bestIndex == -1 || correctedScore > best)
224                        {
225                                bestIndex = i;
226                                best = correctedScore;
227                        }
228                }
229
230                return new WeightedLocale(this.languageModel.naiveBayesClasses[bestIndex], best / sum);
231        }
232
233        // an element wise log-factorial
234        TIntDoubleHashMap logFacCache = new TIntDoubleHashMap();
235
236        private double sumLogFactorial(DenseMatrix fv) {
237                double sum = 0;
238                final double[] data = fv.getData();
239                for (int i = 0; i < fv.numColumns(); i++) {
240                        final int fvi = (int) data[i];
241                        if (logFacCache.contains(fvi))
242                        {
243                                sum += logFacCache.get(fvi);
244                        }
245                        else {
246                                for (int j = 1; j < fvi + 1; j++) {
247                                        sum += Math.log(j);
248                                }
249                        }
250                }
251                return sum;
252        }
253
254        private DenseMatrix tokenize(String text) {
255                byte[] ords = null;
256                try {
257                        ords = text.getBytes("UTF-8");
258                } catch (final UnsupportedEncodingException e) {
259                }
260                int state = 0;
261                final TIntIntHashMap statecount = new TIntIntHashMap();
262                for (final byte letter : ords) {
263                        state = this.languageModel.tk_nextmove[(state << 8) + (letter & 0xff)];
264                        statecount.adjustOrPutValue(state, 1, 1);
265                }
266                final double[][] fv = new double[1][this.languageModel.naiveBayesNFeats];
267                statecount.forEachEntry(new TIntIntProcedure() {
268                        @Override
269                        public boolean execute(int state, final int statecount) {
270                                final int[] indexes = LanguageDetector.this.languageModel.tk_output.get(state);
271                                if (indexes == null)
272                                        return true;
273                                for (final int i : indexes) {
274
275                                        fv[0][i] += statecount;
276                                }
277                                return true;
278                        }
279                });
280                return new DenseMatrix(fv);
281        }
282
283        /**
284         * @return the underlying {@link LanguageModel}
285         */
286        public LanguageModel getLanguageModel() {
287                return this.languageModel;
288        }
289
290        /**
291         * prints available languages
292         * 
293         * @param args
294         * @throws IOException
295         */
296        public static void main(String[] args) throws IOException {
297                final LanguageDetector lm = new LanguageDetector();
298                System.out.println("Available languages: ");
299                for (final String string : lm.languageModel.naiveBayesClasses) {
300                        System.out.println(string + ": " + new Locale(string).getDisplayLanguage());
301                }
302        }
303}