001/**
002 * Copyright (c) 2012, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.twitter.modes.preprocessing;
031
032import java.io.IOException;
033import java.util.ArrayList;
034import java.util.HashSet;
035import java.util.List;
036import java.util.Locale;
037import java.util.Map;
038
039import org.openimaj.text.nlp.language.LanguageDetector.WeightedLocale;
040import org.openimaj.twitter.USMFStatus;
041import org.tartarus.snowball.SnowballProgram;
042import org.tartarus.snowball.ext.EnglishStemmer;
043
044/**
045 * A gateway class which loads and uses the #PorterEnglishStemmingFilter
046 * 
047 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
048 *
049 */
050public class StemmingMode extends TwitterPreprocessingMode<List<String>> {
051        
052        final static String STEMMED = "stemmed";
053        private TwitterPreprocessingMode<Map<String,Object>> langMode;
054        private TwitterPreprocessingMode<Map<String,List<String>>> tokMode;
055        private SnowballProgram stemmer;
056
057        /**
058         * Loads the language detector
059         * @throws IOException 
060         */
061        public StemmingMode() throws IOException {
062                try {
063                        langMode = new LanguageDetectionMode();
064                        tokMode = new TokeniseMode();
065                        stemmer = new EnglishStemmer();
066                } catch (Exception e) {
067                        throw new IOException("Couldn't create required language detector and tokeniser",e);
068                }
069        }
070
071        @Override
072        public List<String> process(USMFStatus twitterStatus) {
073                List<String> stems = new ArrayList<String>();
074                try {
075                        Map<String,Object> localeMap = TwitterPreprocessingMode.results(twitterStatus,langMode);
076                        WeightedLocale locale = WeightedLocale.fromMap(localeMap);
077                        if(locale.getLocale().equals(Locale.ENGLISH)){
078                                Map<String,List<String>> tokens = TwitterPreprocessingMode.results(twitterStatus,tokMode);
079                                HashSet<String> protectedToks = new HashSet<String>();
080                                protectedToks.addAll(tokens.get(TokeniseMode.TOKENS_PROTECTED));
081                                for (String token : tokens.get(TokeniseMode.TOKENS_ALL)) {
082                                        if(! protectedToks.contains(token)) {
083                                                stemmer.setCurrent(token);
084                                                stemmer.stem();
085                                                stems.add(stemmer.getCurrent());
086                                        }
087                                        else{
088                                                stems.add(token);
089                                        }
090                                        
091                                }
092                        }
093                } catch (Exception e) { }
094                twitterStatus.addAnalysis(STEMMED, stems);
095                return stems;   
096                
097        }
098        @Override
099        public String getAnalysisKey() {
100                return StemmingMode.STEMMED;
101        }
102
103}