/**
 * Copyright (c) 2012, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.tools.twitter.modes.preprocessing;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.openimaj.io.FileUtils;
import org.openimaj.text.nlp.language.LanguageDetector.WeightedLocale;
import org.openimaj.twitter.USMFStatus;

/**
 * A preprocessing mode that produces the tokens of a status with the
 * stopwords of the status' detected language removed. The result is stored on
 * the status under the analysis key {@code "nostopwords"}.
 * <p>
 * Stopword lists are read from the classpath resources named in
 * {@link #STOPWORD_FILES}. When several resources are listed for the same
 * language their contents are merged.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class StopwordMode extends TwitterPreprocessingMode<List<String>> {

	private static final String STOPWORDS_KEY = "nostopwords";

	/** Pairs of {language code, classpath resource holding that language's stopwords}. */
	private static final String[][] STOPWORD_FILES = {
		new String[]{"en","/org/openimaj/text/stopwords/stopwords-list.txt"},
		new String[]{"en","/org/openimaj/text/stopwords/en_stopwords.txt"},
		new String[]{"en","/org/openimaj/text/stopwords/en_dokuwiki_stopwords.txt"},
		new String[]{"bg","/org/openimaj/text/stopwords/bg_dokuwiki_stopwords.txt"},
		new String[]{"da","/org/openimaj/text/stopwords/da_dokuwiki_stopwords.txt"},
		new String[]{"de","/org/openimaj/text/stopwords/de_dokuwiki_stopwords.txt"},
		new String[]{"el","/org/openimaj/text/stopwords/el_dokuwiki_stopwords.txt"},
		new String[]{"es","/org/openimaj/text/stopwords/es_dokuwiki_stopwords.txt"},
		new String[]{"fi","/org/openimaj/text/stopwords/fi_dokuwiki_stopwords.txt"},
		new String[]{"fr","/org/openimaj/text/stopwords/fr_dokuwiki_stopwords.txt"},
		new String[]{"it","/org/openimaj/text/stopwords/it_dokuwiki_stopwords.txt"},
		new String[]{"nl","/org/openimaj/text/stopwords/nl_dokuwiki_stopwords.txt"},
		new String[]{"pt","/org/openimaj/text/stopwords/pt_dokuwiki_stopwords.txt"},
		new String[]{"sv","/org/openimaj/text/stopwords/sv_dokuwiki_stopwords.txt"},
	};

	private final LanguageDetectionMode langMode;
	private final TokeniseMode tokMode;
	/** Lower-cased stopwords keyed by lower-case language code. */
	private final HashMap<String, HashSet<String>> languageStopwords;

	/**
	 * Creates the language detection and tokenisation modes this mode relies
	 * on and loads every stopword list.
	 *
	 * @throws IOException
	 *             presumably propagated from constructing the underlying
	 *             modes; stopword loading itself does not throw
	 */
	public StopwordMode() throws IOException {
		langMode = new LanguageDetectionMode();
		tokMode = new TokeniseMode();
		languageStopwords = loadStopwords();
	}

	/**
	 * Reads every resource in {@link #STOPWORD_FILES} and groups the stopwords
	 * by language. Lines are trimmed and lower-cased; blank lines and lines
	 * starting with {@code #} are skipped. Loading is best effort: a resource
	 * that is missing or cannot be read is skipped and contributes nothing.
	 *
	 * @return map from language code to the merged set of its stopwords
	 */
	private HashMap<String, HashSet<String>> loadStopwords() {
		HashMap<String,HashSet<String>> retMap = new HashMap<String,HashSet<String>>();
		for (String[] swLangFile: STOPWORD_FILES) {
			String language = swLangFile[0];
			InputStream in = StopwordMode.class.getResourceAsStream(swLangFile[1]);
			if (in == null) {
				// Resource is not on the classpath; nothing to load for this entry.
				continue;
			}
			try {
				HashSet<String> words = new HashSet<String>();
				for (String line : FileUtils.readlines(in, "UTF-8")) {
					// Trim before the comment test so indented comments are recognised.
					String sw = line.trim();
					if (sw.length() == 0 || sw.startsWith("#")) continue;
					// Locale.ROOT keeps the casing independent of the JVM default locale.
					words.add(sw.toLowerCase(Locale.ROOT));
				}
				// Several resources may share a language: merge rather than overwrite,
				// otherwise only the last list for that language would survive.
				HashSet<String> existing = retMap.get(language);
				if (existing == null) {
					retMap.put(language, words);
				} else {
					existing.addAll(words);
				}
			} catch (IOException ignored) {
				// Best effort: an unreadable list is skipped.
			} finally {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
		}
		return retMap;
	}

	/**
	 * Computes the non-stopword tokens of a status and records them on the
	 * status under {@link #getAnalysisKey()}.
	 * <p>
	 * If no stopword list is known for the detected language, all tokens are
	 * kept. Otherwise a token is kept when it is a protected token or when its
	 * lower-cased form is not a stopword. Any exception raised while computing
	 * the result is swallowed and whatever was collected so far is used.
	 *
	 * @param twitterStatus
	 *            the status to process; the result is added to its analysis
	 * @return the tokens that are not stopwords, in token order
	 */
	@Override
	public List<String> process(USMFStatus twitterStatus) {
		List<String> nonstopwords = new ArrayList<String>();
		try {
			Map<String,Object> localeMap = TwitterPreprocessingMode.results(twitterStatus,langMode);
			WeightedLocale locale = WeightedLocale.fromMap(localeMap);
			String language = locale.language.toLowerCase(Locale.ROOT);
			Map<String,List<String>> tokens = TwitterPreprocessingMode.results(twitterStatus,tokMode);

			HashSet<String> stopwords = languageStopwords.get(language);
			if (stopwords == null) {
				// We don't know stopwords for this language, all the tokens become the non-stopwords!
				nonstopwords.addAll(tokens.get(TokeniseMode.TOKENS_ALL));
			}
			else {
				HashSet<String> protectedToks = new HashSet<String>();
				protectedToks.addAll(tokens.get(TokeniseMode.TOKENS_PROTECTED));
				for (String token : tokens.get(TokeniseMode.TOKENS_ALL)) {
					// Protected tokens are always kept, even if they look like stopwords.
					if (protectedToks.contains(token) || !stopwords.contains(token.toLowerCase(Locale.ROOT))) {
						nonstopwords.add(token);
					}
				}
			}
		} catch (Exception ignored) {
			// Best effort: on failure the tokens collected so far (possibly none) are stored.
		}
		twitterStatus.addAnalysis(STOPWORDS_KEY, nonstopwords);
		return nonstopwords;
	}

	@Override
	public String getAnalysisKey() {
		return STOPWORDS_KEY;
	}

}