001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.twitter.token.outputmode.stats;
031
032import gnu.trove.map.TObjectLongMap;
033import gnu.trove.map.hash.TObjectLongHashMap;
034import gnu.trove.procedure.TObjectLongProcedure;
035
036import java.util.HashMap;
037import java.util.Map.Entry;
038import java.util.regex.Pattern;
039
040import org.openimaj.text.nlp.patterns.EdgePunctuationPatternProvider;
041import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
042import org.openimaj.text.nlp.patterns.PatternProvider;
043import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
044import org.openimaj.text.nlp.patterns.TimePatternProvider;
045import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
046import org.openimaj.text.nlp.patterns.URLPatternProvider;
047
048public class StatsWordMatch {
049        private HashMap<String, Pattern> available;
050        private TObjectLongMap<String> counts;
051
052        public StatsWordMatch() {
053                this.available = new HashMap<String,Pattern>();
054                addAvail(new EmoticonPatternProvider());
055                addAvail(new URLPatternProvider());
056                addAvail(new TimePatternProvider());
057                addAvail(new PunctuationPatternProvider());
058                TwitterStuffPatternProvider tpp = new TwitterStuffPatternProvider();
059                addAvail("TwitterStuff.hashtags", tpp.hashtagPatternString());
060                addAvail("TwitterStuff.retweets", tpp.retweetPatternString());
061                addAvail("TwitterStuff.username", tpp.usernamePatternString());
062                addAvail("EdgePunctuation",EdgePunctuationPatternProvider.edgePuncPattern());
063                this.counts = new TObjectLongHashMap<String>();
064        }
065
066        private void addAvail(PatternProvider pp) {
067                String name = pp.getClass().getName().split("PatternProvider")[0];
068                name = name.substring(pp.getClass().getPackage().getName().length() + 1);
069                addAvail(name,pp);
070                
071        }
072        
073        private void addAvail(String name, PatternProvider pp) {
074                addAvail(name,pp.patternString());
075        }
076        
077        private void addAvail(String name, String pattern) {
078                
079                this.available.put(name, Pattern.compile(pattern,Pattern.UNICODE_CASE|Pattern.CASE_INSENSITIVE));
080        }
081        
082        
083        public void updateStats(String word, long count){
084                boolean added = false;
085                String tokenise = " %s ";
086                String formattedWord = String.format(tokenise,word);
087                for (Entry<String, Pattern> spp: this.available.entrySet()) {
088                        String name = spp.getKey();
089                        Pattern pp = spp.getValue();
090                        if(pp.matcher(formattedWord).find()){
091                                this.counts.adjustOrPutValue(name,count,count);
092                                added=true;
093                        }
094                }
095                if(!added){
096//                      System.out.println("Adding to other: '" + word + "'");
097                        this.counts.adjustOrPutValue("Other", count, count);
098                }
099        }
100        
101        @Override
102        public String toString(){
103                final StringBuffer buffer = new StringBuffer();
104                buffer.append("Type Stats:\n");
105                final String format = "%s: %d\n";
106                this.counts.forEachEntry(new TObjectLongProcedure<String>(){
107                        @Override
108                        public boolean execute(String stat, long count) {
109                                buffer.append(String.format(format,stat,count));
110                                return true;
111                        }                       
112                });
113                return buffer.toString();
114        }
115}