/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.demos.sandbox.ml.linear.learner.stream.twitter;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingMode;
import org.openimaj.twitter.USMFStatus;
import org.openimaj.util.filter.FilterUtils;
import org.openimaj.util.function.Function;
import org.openimaj.util.function.Predicate;

/**
 * Turns a batch of {@link USMFStatus} instances into one normalised bag of
 * words per user.
 * <p>
 * Statuses are grouped under the name produced by a {@link NameStrategy}. For
 * each status the words are obtained through
 * {@link TwitterPreprocessingMode#results}, junk tokens are filtered out, and
 * the remaining words are counted. Finally every count is divided by the total
 * number of (filtered) words seen for that user, so the values of a non-empty
 * user map sum to 1.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class USMFStatusBagOfWords implements Function<List<USMFStatus>,Map<String,Map<String,Double>>> {

	private final TwitterPreprocessingMode<List<String>> mode;
	private final Predicate<String> junkWords;
	private final NameStrategy userStrat;

	/**
	 * Construct using a default {@link UserNameStrategy} to name the user of
	 * each status.
	 *
	 * @param mode the mode from which to grab words
	 */
	public USMFStatusBagOfWords(TwitterPreprocessingMode<List<String>> mode) {
		this(mode, new UserNameStrategy());
	}

	/**
	 * @param mode the mode from which to grab words
	 * @param userNameStrategy
	 *            produces, for each status, the name under which its words are
	 *            accumulated (the key of the outer map returned by
	 *            {@link #apply(List)})
	 */
	public USMFStatusBagOfWords(TwitterPreprocessingMode<List<String>> mode, NameStrategy userNameStrategy) {
		this.mode = mode;
		this.userStrat = userNameStrategy;
		this.junkWords = createJunkWordFilter();
	}

	/**
	 * Build the predicate handed to {@link FilterUtils#filter}. Its
	 * {@code test} returns <code>false</code> for tokens considered junk:
	 * <code>null</code>, two characters or fewer, containing "http", or starting
	 * with '@', '#' or '$'. Every other token yields <code>true</code>.
	 * <p>
	 * NOTE(review): this presumes {@code FilterUtils.filter} keeps the items
	 * for which the predicate returns <code>true</code> -- confirm against
	 * FilterUtils.
	 * <p>
	 * Static so the anonymous class does not retain a reference to the
	 * enclosing instance.
	 *
	 * @return the junk-word predicate
	 */
	private static Predicate<String> createJunkWordFilter() {
		return new Predicate<String>() {
			@Override
			public boolean test(String object) {
				// A null token used to raise a NullPointerException here, which
				// the caller swallowed, silently discarding the whole status.
				// Treat it as junk instead.
				if (object == null) {
					return false;
				}

				final String lowerCase = object.toLowerCase();
				final boolean junk = lowerCase.length() <= 2
						|| lowerCase.contains("http")
						|| lowerCase.startsWith("@")
						|| lowerCase.startsWith("#")
						|| lowerCase.startsWith("$");
				return !junk;
			}
		};
	}

	/**
	 * Compute the normalised word frequencies of every user in the batch.
	 * <p>
	 * A user is registered in the result as soon as one of their statuses is
	 * seen, so a user whose statuses all fail or contain only junk maps to an
	 * empty word map.
	 *
	 * @param in
	 *            the statuses to process
	 * @return user name -> (word -> count of that word divided by the total
	 *         number of non-junk words counted for the user)
	 */
	@Override
	public Map<String,Map<String,Double>> apply(List<USMFStatus> in) {
		final Map<String, Map<String, Double>> ret = new HashMap<String, Map<String, Double>>();
		final Map<String, Double> userTotals = new HashMap<String, Double>();

		for (final USMFStatus usmfStatus : in) {
			final String userName = this.userStrat.createName(usmfStatus);
			final Map<String, Double> userWordCounts = userWordCounts(ret, userTotals, userName);

			try {
				List<String> words = TwitterPreprocessingMode.results(usmfStatus, mode);
				words = FilterUtils.filter(words, junkWords);

				userTotals.put(userName, userTotals.get(userName) + words.size());
				for (final String word : words) {
					final Double currentWordCount = userWordCounts.get(word);
					if (currentWordCount == null) {
						userWordCounts.put(word, 1d);
					} else {
						userWordCounts.put(word, currentWordCount + 1d);
					}
				}
			} catch (final Exception ignored) {
				// Deliberately best-effort: a status whose words cannot be
				// extracted or filtered is skipped so that a single bad status
				// does not abort the whole batch.
			}
		}

		// Convert raw counts into per-user relative frequencies. Entry#setValue
		// updates in place without a second lookup and without touching the
		// map's structure while it is being iterated.
		for (final Entry<String, Map<String, Double>> userEntry : ret.entrySet()) {
			// userWordCounts(...) registers a total whenever it registers a
			// user, so the lookup cannot return null.
			final double total = userTotals.get(userEntry.getKey());
			for (final Entry<String, Double> wordEntry : userEntry.getValue().entrySet()) {
				wordEntry.setValue(wordEntry.getValue() / total);
			}
		}

		return ret;
	}

	/**
	 * Fetch the word-count map of a user, creating it (together with a zeroed
	 * running total) the first time the user is seen.
	 *
	 * @param ret
	 *            per-user word counts accumulated so far
	 * @param totals
	 *            per-user running totals; receives a 0 entry for a new user
	 * @param userName
	 *            the user to look up
	 * @return the (possibly newly created) word-count map held in
	 *         <code>ret</code> for the user
	 */
	private Map<String, Double> userWordCounts(Map<String, Map<String, Double>> ret, Map<String,Double> totals, String userName) {
		Map<String, Double> wordCounts = ret.get(userName);
		if (wordCounts == null) {
			wordCounts = new HashMap<String, Double>();
			ret.put(userName, wordCounts);
			totals.put(userName, 0d);
		}
		return wordCounts;
	}

}