/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.text.nlp.sentiment.lexicon;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.arabidopsis.ahocorasick.AhoCorasick;
import org.arabidopsis.ahocorasick.SearchResult;
import org.openimaj.text.nlp.textpipe.annotations.AnnotationUtils;
import org.openimaj.text.nlp.textpipe.annotations.RawTextAnnotation;
import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;
import org.openimaj.text.nlp.textpipe.annotators.MissingRequiredAnnotationException;
import org.openimaj.text.nlp.textpipe.annotators.OpenNLPTokenAnnotator;

/**
 * An implementation of Hatzivassiloglou and McKeown's semi-supervised approach
 * to building a bipolar sentiment lexicon: starting from seed lists of known
 * positive and negative words, a new word is added to a lexicon when the
 * corpus joins it to a known word with "and" (same polarity) or "but"
 * (opposite polarity). This is a single-pass version, in that the corpus to
 * build from is fixed.
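 * <p>
 * A minimal usage sketch (illustrative only: the seed words and the tiny
 * corpus below are invented, and the tokeniser assumes the usual OpenNLP model
 * configuration is available at runtime):
 *
 * <pre>
 * {@code
 * List<String> posSeeds = Arrays.asList("good", "excellent");
 * List<String> negSeeds = Arrays.asList("bad", "poor");
 *
 * HMLexiconBuilder builder = new HMLexiconBuilder(posSeeds, negSeeds);
 * builder.buildFromCorpus(Arrays.asList(
 *     "The food was good and tasty but expensive.",
 *     "A bad and clumsy design."));
 *
 * // "tasty" should join the positive lexicon (joined to "good" by "and");
 * // "expensive" and "clumsy" should join the negative lexicon.
 * }
 * </pre>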
 */
public class HMLexiconBuilder {

	/** Words currently believed to carry positive sentiment. */
	Set<String> positiveLexicon;
	/** Words currently believed to carry negative sentiment. */
	Set<String> negativeLexicon;
	/** Positive words added since the last pass, still to be expanded. */
	List<String> newPos;
	/** Negative words added since the last pass, still to be expanded. */
	List<String> newNeg;
	/** The fixed corpus of documents to mine for new lexicon entries. */
	List<String> corpus;
	OpenNLPTokenAnnotator tokA;

	/**
	 * Construct with seed lists of known positive and negative words used to
	 * bootstrap the two lexicons.
	 *
	 * @param posBootStrap
	 *            seed positive words
	 * @param negBootStrap
	 *            seed negative words
	 */
	public HMLexiconBuilder(List<String> posBootStrap, List<String> negBootStrap) {
		this.positiveLexicon = new HashSet<String>();
		this.negativeLexicon = new HashSet<String>();
		this.newPos = new LinkedList<String>();
		this.newNeg = new LinkedList<String>();
		this.tokA = new OpenNLPTokenAnnotator();
		for (String s : posBootStrap) {
			addToLexicon(positiveLexicon, newPos, s);
		}
		for (String s : negBootStrap) {
			addToLexicon(negativeLexicon, newNeg, s);
		}
	}

	/**
	 * Add a token to a lexicon; if it was not already present, queue it for
	 * expansion in the next pass.
	 */
	private void addToLexicon(Set<String> compSet, List<String> q, String token) {
		if (compSet.add(token))
			q.add(token);
	}

	/**
	 * Grow the lexicons from the given corpus, repeatedly expanding newly
	 * discovered words until no more can be found.
	 *
	 * @param corpus
	 *            the documents to build from
	 */
	public void buildFromCorpus(List<String> corpus) {
		this.corpus = corpus;
		process();
	}

	private void process() {
		while (!newPos.isEmpty())
			processNewLexTokens(positiveLexicon, newPos, negativeLexicon, newNeg);
		while (!newNeg.isEmpty())
			processNewLexTokens(negativeLexicon, newNeg, positiveLexicon, newPos);
		// Expanding one lexicon may have added new words to the other, so keep
		// processing until both queues are exhausted.
		if (!newPos.isEmpty() || !newNeg.isEmpty())
			process();
	}

	/**
	 * Expand the queued words of one lexicon by searching the corpus for each
	 * queued word followed by "and" or "but". The word that follows the
	 * conjunction is assumed to share the polarity of the known word when
	 * joined by "and", and to have the opposite polarity when joined by "but";
	 * an intervening "not" flips the polarity again.
	 */
	private void processNewLexTokens(Set<String> lexicon, List<String> q,
			Set<String> anti_lexicon, List<String> anti_q)
	{
		// Build an Aho-Corasick trie of "<word> and" / "<word> but" patterns
		// for every queued word, so the corpus can be scanned in a single pass.
		AhoCorasick<String> tri = new AhoCorasick<String>();
		for (String string : q) {
			String syno = string + " and";
			String anti = string + " but";
			tri.add(syno.getBytes(), syno);
			tri.add(anti.getBytes(), anti);
		}
		tri.prepare();
		q.clear();
		for (String doc : corpus) {
			String lcdoc = doc.toLowerCase();
			Iterator<SearchResult<String>> result = tri.search(lcdoc.getBytes());
			List<String> hits = new ArrayList<String>();
			while (result.hasNext()) {
				SearchResult<String> sr = result.next();
				for (String s : sr.getOutputs()) {
					hits.add(s);
				}
			}
			for (String hit : hits) {
				// Tokenise the text that follows the matched conjunction; the
				// first token (or the one after "not") is the candidate entry.
				int tokeniseFrom = lcdoc.indexOf(hit) + hit.length();
				List<String> tokens = tokenise(lcdoc.substring(tokeniseFrom));
				Iterator<String> it = tokens.iterator();
				String newLex = null;
				boolean anti = false;
				if (it.hasNext()) {
					String first = it.next();
					if (first.equals("not")) {
						anti = true;
						if (it.hasNext()) {
							newLex = it.next();
						}
					} else {
						newLex = first;
					}
				}

				// "but" reverses the polarity relative to the known word.
				if (hit.endsWith("but"))
					anti = !anti;
				if (newLex != null) {
					if (!anti)
						addToLexicon(lexicon, q, newLex);
					else
						addToLexicon(anti_lexicon, anti_q, newLex);
				}
			}
		}
	}

	private List<String> tokenise(String text) {
		RawTextAnnotation rta = new RawTextAnnotation(text);
		try {
			tokA.annotate(rta);
			return AnnotationUtils.getStringTokensFromTokenAnnotationList(rta
					.getAnnotationsFor(TokenAnnotation.class));
		} catch (MissingRequiredAnnotationException e) {
			e.printStackTrace();
		}
		// Return an empty list rather than null so callers need no null check.
		return new ArrayList<String>();
	}

}