001/**
002 * Copyright (c) 2012, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.twitter.modes.preprocessing;
031
032import java.io.IOException;
033import java.util.ArrayList;
034import java.util.HashMap;
035import java.util.HashSet;
036import java.util.List;
037import java.util.Map;
038
039import org.openimaj.io.FileUtils;
040import org.openimaj.text.nlp.language.LanguageDetector.WeightedLocale;
041import org.openimaj.twitter.USMFStatus;
042
043/**
044 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
045 *
046 */
047public class StopwordMode extends TwitterPreprocessingMode<List<String>> {
048
049        private static final String STOPWORDS_KEY = "nostopwords";
050        private static final String[][] STOPWORD_FILES = {
051                new String[]{"en","/org/openimaj/text/stopwords/stopwords-list.txt"},
052                new String[]{"en","/org/openimaj/text/stopwords/en_stopwords.txt"},
053                new String[]{"en","/org/openimaj/text/stopwords/en_dokuwiki_stopwords.txt"},
054                new String[]{"bg","/org/openimaj/text/stopwords/bg_dokuwiki_stopwords.txt"},
055                new String[]{"da","/org/openimaj/text/stopwords/da_dokuwiki_stopwords.txt"},
056                new String[]{"de","/org/openimaj/text/stopwords/de_dokuwiki_stopwords.txt"},
057                new String[]{"el","/org/openimaj/text/stopwords/el_dokuwiki_stopwords.txt"},
058                new String[]{"es","/org/openimaj/text/stopwords/es_dokuwiki_stopwords.txt"},
059                new String[]{"fi","/org/openimaj/text/stopwords/fi_dokuwiki_stopwords.txt"},
060                new String[]{"fr","/org/openimaj/text/stopwords/fr_dokuwiki_stopwords.txt"},
061                new String[]{"it","/org/openimaj/text/stopwords/it_dokuwiki_stopwords.txt"},
062                new String[]{"nl","/org/openimaj/text/stopwords/nl_dokuwiki_stopwords.txt"},
063                new String[]{"pt","/org/openimaj/text/stopwords/pt_dokuwiki_stopwords.txt"},
064                new String[]{"sv","/org/openimaj/text/stopwords/sv_dokuwiki_stopwords.txt"},
065        };
066        private LanguageDetectionMode langMode;
067        private TokeniseMode tokMode;
068        private HashMap<String, HashSet<String>> languageStopwords;
069
070        /**
071         * @throws IOException
072         */
073        public StopwordMode() throws IOException {
074                langMode = new LanguageDetectionMode();
075                tokMode = new TokeniseMode();
076                languageStopwords = loadStopwords();
077        }
078
079        private HashMap<String, HashSet<String>> loadStopwords() {
080                HashMap<String,HashSet<String>> retMap = new HashMap<String,HashSet<String>>();
081                for (String[] swLangFile: STOPWORD_FILES) {
082                        try {
083                                HashSet<String> ret = new HashSet<String>();
084                                String[] swLines = FileUtils.readlines(StopwordMode.class.getResourceAsStream(swLangFile[1]),"UTF-8");
085                                for (String sw : swLines) {
086                                        if(sw.startsWith("#")) continue;
087                                        ret.add(sw.toLowerCase().trim());
088                                }
089                                retMap.put(swLangFile[0], ret);
090                        } catch (IOException e) {       }
091                }
092                return retMap;
093        }
094
095        @Override
096        public List<String> process(USMFStatus twitterStatus) {
097                List<String> nonstopwords = new ArrayList<String>();
098                try {
099                        Map<String,Object> localeMap = TwitterPreprocessingMode.results(twitterStatus,langMode);
100                        WeightedLocale locale = WeightedLocale.fromMap(localeMap);
101                        String country = locale.language.toLowerCase();
102                        Map<String,List<String>> tokens = TwitterPreprocessingMode.results(twitterStatus,tokMode);
103
104                        if(!languageStopwords.containsKey(country)){
105                                // We don't know stopwords for this language, all the tokens become the non-stopwords!
106                                nonstopwords.addAll(tokens.get(TokeniseMode.TOKENS_ALL));
107                        }
108                        else{
109                                HashSet<String> protectedToks = new HashSet<String>();
110                                protectedToks.addAll(tokens.get(TokeniseMode.TOKENS_PROTECTED));
111                                HashSet<String> stopwords = languageStopwords.get(country);
112                                for (String token : tokens.get(TokeniseMode.TOKENS_ALL)) {
113                                        if(!protectedToks.contains(token)) {
114                                                if(!stopwords.contains(token.toLowerCase()))
115                                                        nonstopwords.add(token);
116                                        }
117                                        else{
118                                                nonstopwords.add(token);
119                                        }
120                                }
121                        }
122                } catch (Exception e) { }
123                twitterStatus.addAnalysis(STOPWORDS_KEY, nonstopwords);
124                return nonstopwords;
125        }
126
127        @Override
128        public String getAnalysisKey() {
129                return STOPWORDS_KEY;
130        }
131
132}