/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.hadoop.tools.twitter.token.outputmode.stats;

import gnu.trove.map.TObjectLongMap;
import gnu.trove.map.hash.TObjectLongHashMap;
import gnu.trove.procedure.TObjectLongProcedure;

import java.util.HashMap;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.openimaj.text.nlp.patterns.EdgePunctuationPatternProvider;
import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
import org.openimaj.text.nlp.patterns.PatternProvider;
import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
import org.openimaj.text.nlp.patterns.TimePatternProvider;
import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
import org.openimaj.text.nlp.patterns.URLPatternProvider;

/**
 * Accumulates counts of words grouped by the kind of token they look like.
 * <p>
 * A fixed set of named regular expressions is registered at construction time.
 * Each call to {@link #updateStats(String, long)} adds the supplied count to
 * every named category whose pattern is found in the word, or to the category
 * <code>"Other"</code> when no pattern is found. {@link #toString()} renders
 * the accumulated totals.
 */
public class StatsWordMatch {
	/** Suffix removed from a provider's class name to form its category name. */
	private static final String PROVIDER_SUFFIX = "PatternProvider";

	/** Category name mapped to the compiled pattern that detects it. */
	private final HashMap<String, Pattern> available;

	/** Running total per category name. */
	private final TObjectLongMap<String> counts;

	/**
	 * Registers the emoticon, URL, time, punctuation, twitter (hashtag,
	 * retweet, username) and edge-punctuation patterns, and starts with no
	 * counts recorded.
	 */
	public StatsWordMatch() {
		this.available = new HashMap<String, Pattern>();
		addAvail(new EmoticonPatternProvider());
		addAvail(new URLPatternProvider());
		addAvail(new TimePatternProvider());
		addAvail(new PunctuationPatternProvider());

		// The twitter provider exposes several distinct patterns; each gets its own category.
		final TwitterStuffPatternProvider tpp = new TwitterStuffPatternProvider();
		addAvail("TwitterStuff.hashtags", tpp.hashtagPatternString());
		addAvail("TwitterStuff.retweets", tpp.retweetPatternString());
		addAvail("TwitterStuff.username", tpp.usernamePatternString());

		addAvail("EdgePunctuation", EdgePunctuationPatternProvider.edgePuncPattern());
		this.counts = new TObjectLongHashMap<String>();
	}

	/**
	 * Registers a provider under a category name derived from its class name:
	 * the simple class name up to (not including) the first occurrence of
	 * <code>PatternProvider</code>, e.g. <code>EmoticonPatternProvider</code>
	 * becomes <code>Emoticon</code>. A class name that does not contain
	 * <code>PatternProvider</code> is used unchanged.
	 *
	 * @param pp
	 *            the provider whose pattern string is to be registered
	 */
	private void addAvail(PatternProvider pp) {
		// getSimpleName() avoids both the regex split on the fully-qualified
		// name and the dependency on Class#getPackage(), which may be null.
		final String simpleName = pp.getClass().getSimpleName();
		final int suffixStart = simpleName.indexOf(PROVIDER_SUFFIX);
		final String name = suffixStart >= 0 ? simpleName.substring(0, suffixStart) : simpleName;
		addAvail(name, pp);
	}

	/**
	 * Registers a provider's pattern string under an explicit category name.
	 *
	 * @param name
	 *            the category name
	 * @param pp
	 *            the provider supplying the pattern string
	 */
	private void addAvail(String name, PatternProvider pp) {
		addAvail(name, pp.patternString());
	}

	/**
	 * Compiles a regular expression (case-insensitively, with Unicode-aware
	 * case folding) and registers it under the given category name. A later
	 * registration with the same name replaces the earlier one.
	 *
	 * @param name
	 *            the category name
	 * @param pattern
	 *            the regular expression source
	 */
	private void addAvail(String name, String pattern) {
		this.available.put(name, Pattern.compile(pattern, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE));
	}

	/**
	 * Adds <code>count</code> to every category whose pattern is found in the
	 * word. A word may therefore contribute to several categories. If no
	 * pattern is found, the count is added to the category
	 * <code>"Other"</code> instead.
	 *
	 * @param word
	 *            the word to classify
	 * @param count
	 *            the amount to add to each matching category
	 */
	public void updateStats(String word, long count) {
		// The word is matched with one space either side of it.
		// NOTE(review): presumably some patterns anchor on surrounding
		// whitespace - confirm against the pattern providers.
		final String formattedWord = " " + word + " ";

		boolean added = false;
		for (final Entry<String, Pattern> entry : this.available.entrySet()) {
			if (entry.getValue().matcher(formattedWord).find()) {
				this.counts.adjustOrPutValue(entry.getKey(), count, count);
				added = true;
			}
		}

		if (!added) {
			this.counts.adjustOrPutValue("Other", count, count);
		}
	}

	/**
	 * Renders the totals as a header line <code>Type Stats:</code> followed by
	 * one <code>name: total</code> line per category that has been counted.
	 *
	 * @return the formatted statistics
	 */
	@Override
	public String toString() {
		// Local, single-threaded accumulation: no need for StringBuffer's synchronisation.
		final StringBuilder buffer = new StringBuilder();
		buffer.append("Type Stats:\n");
		final String format = "%s: %d\n";
		this.counts.forEachEntry(new TObjectLongProcedure<String>() {
			@Override
			public boolean execute(String stat, long count) {
				buffer.append(String.format(format, stat, count));
				// Returning true keeps the iteration going over every entry.
				return true;
			}
		});
		return buffer.toString();
	}
}