001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.demos.sandbox.ml.linear.learner.stream.twitter;
031
032import java.util.HashMap;
033import java.util.List;
034import java.util.Map;
035import java.util.Map.Entry;
036
037import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingMode;
038import org.openimaj.twitter.USMFStatus;
039import org.openimaj.util.filter.FilterUtils;
040import org.openimaj.util.function.Function;
041import org.openimaj.util.function.Predicate;
042
043/**
044 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
045 *
046 */
047public class USMFStatusBagOfWords implements Function<List<USMFStatus>,Map<String,Map<String,Double>>> {
048
049        private TwitterPreprocessingMode<List<String>> mode;
050        private Predicate<String> junkWords;
051        private NameStrategy userStrat;
052
053        /**
054         * @param mode the mode from which to grab words
055         */
056        public USMFStatusBagOfWords(TwitterPreprocessingMode<List<String>> mode) {
057                this.mode = mode;
058                this.userStrat = new UserNameStrategy();
059                initJunkWords();
060        }
061
062        /**
063         * @param mode the mode from which to grab words
064         * @param userNameStrategy
065         */
066        public USMFStatusBagOfWords(TwitterPreprocessingMode<List<String>> mode, NameStrategy userNameStrategy) {
067                this.mode = mode;
068                this.userStrat = userNameStrategy;
069                initJunkWords();
070        }
071
072
073
074        private void initJunkWords() {
075                this.junkWords = new Predicate<String>() {
076
077                        @Override
078                        public boolean test(String object) {
079                                String lowerCase = object.toLowerCase();
080                                if( lowerCase.length()<=2 ||
081                                        lowerCase.contains("http") ||
082                                        lowerCase.startsWith("@") ||
083                                        lowerCase.startsWith("#") ||
084                                        lowerCase.startsWith("$")
085                                ) {
086                                        return false;
087                                }
088                                else {
089                                        return true;
090                                }
091                        }
092                };
093        }
094
095        @Override
096        public Map<String,Map<String,Double>> apply(List<USMFStatus> in) {
097                Map<String, Map<String, Double>> ret = new HashMap<String, Map<String,Double>>();
098                Map<String,Double> userTotals = new HashMap<String, Double>();
099                for (USMFStatus usmfStatus : in) {
100                        String userName = this.userStrat.createName(usmfStatus);
101                        Map<String, Double> userWordCounts = userWordCounts(ret, userTotals,userName);
102
103                        try {
104                                List<String> words = TwitterPreprocessingMode.results(usmfStatus, mode);
105                                words = FilterUtils.filter(words, junkWords);
106                                userTotals.put(userName, userTotals.get(userName) + words.size());
107                                for (String word : words) {
108                                        Double currentWordCount = userWordCounts.get(word);
109                                        if(currentWordCount == null) userWordCounts.put(word, 1d);
110                                        else userWordCounts.put(word, currentWordCount + 1d);
111                                }
112
113                        } catch (Exception e) {
114
115                        }
116                }
117                for (Entry<String, Map<String, Double>> entry: ret.entrySet()) {
118                        Map<String, Double> userWords = entry.getValue();
119                        String userName = entry.getKey();
120                        for (String word : userWords.keySet()) {
121                                userWords.put(word, userWords.get(word) / userTotals.get(userName));
122                        }
123                }
124
125                return ret;
126        }
127
128        private Map<String, Double> userWordCounts(Map<String, Map<String, Double>> ret, Map<String,Double> totals, String userName) {
129                Map<String, Double> wordCounts = ret.get(userName);
130                if(wordCounts == null) {
131                        ret.put(userName, wordCounts = new HashMap<String, Double>());
132                        totals.put(userName, 0d);
133                }
134                return wordCounts;
135        }
136
137}