001/** 002 * Copyright (c) 2012, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.tools.twitter.modes.preprocessing; 031 032import java.io.IOException; 033import java.util.ArrayList; 034import java.util.HashSet; 035import java.util.List; 036import java.util.Locale; 037import java.util.Map; 038 039import org.openimaj.text.nlp.language.LanguageDetector.WeightedLocale; 040import org.openimaj.twitter.USMFStatus; 041import org.tartarus.snowball.SnowballProgram; 042import org.tartarus.snowball.ext.EnglishStemmer; 043 044/** 045 * A gateway class which loads and uses the #PorterEnglishStemmingFilter 046 * 047 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 048 * 049 */ 050public class StemmingMode extends TwitterPreprocessingMode<List<String>> { 051 052 final static String STEMMED = "stemmed"; 053 private TwitterPreprocessingMode<Map<String,Object>> langMode; 054 private TwitterPreprocessingMode<Map<String,List<String>>> tokMode; 055 private SnowballProgram stemmer; 056 057 /** 058 * Loads the language detector 059 * @throws IOException 060 */ 061 public StemmingMode() throws IOException { 062 try { 063 langMode = new LanguageDetectionMode(); 064 tokMode = new TokeniseMode(); 065 stemmer = new EnglishStemmer(); 066 } catch (Exception e) { 067 throw new IOException("Couldn't create required language detector and tokeniser",e); 068 } 069 } 070 071 @Override 072 public List<String> process(USMFStatus twitterStatus) { 073 List<String> stems = new ArrayList<String>(); 074 try { 075 Map<String,Object> localeMap = TwitterPreprocessingMode.results(twitterStatus,langMode); 076 WeightedLocale locale = WeightedLocale.fromMap(localeMap); 077 if(locale.getLocale().equals(Locale.ENGLISH)){ 078 Map<String,List<String>> tokens = TwitterPreprocessingMode.results(twitterStatus,tokMode); 079 HashSet<String> protectedToks = new HashSet<String>(); 080 protectedToks.addAll(tokens.get(TokeniseMode.TOKENS_PROTECTED)); 081 for (String token : tokens.get(TokeniseMode.TOKENS_ALL)) { 082 if(! protectedToks.contains(token)) { 083 stemmer.setCurrent(token); 084 stemmer.stem(); 085 stems.add(stemmer.getCurrent()); 086 } 087 else{ 088 stems.add(token); 089 } 090 091 } 092 } 093 } catch (Exception e) { } 094 twitterStatus.addAnalysis(STEMMED, stems); 095 return stems; 096 097 } 098 @Override 099 public String getAnalysisKey() { 100 return StemmingMode.STEMMED; 101 } 102 103}