/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.text.nlp.sentiment.lexicon;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.openimaj.ml.clustering.DoubleCentroidsResult;
import org.openimaj.ml.clustering.assignment.HardAssigner;
import org.openimaj.ml.clustering.kmeans.DoubleKMeans;
import org.openimaj.text.nlp.textpipe.annotations.POSAnnotation;
import org.openimaj.text.nlp.textpipe.annotations.RawTextAnnotation;
import org.openimaj.text.nlp.textpipe.annotations.SentenceAnnotation;
import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;
import org.openimaj.text.nlp.textpipe.annotators.MissingRequiredAnnotationException;
import org.openimaj.text.nlp.textpipe.annotators.OpenNLPPOSAnnotator;
import org.openimaj.text.nlp.textpipe.annotators.OpenNLPSentenceAnnotator;
import org.openimaj.text.nlp.textpipe.annotators.OpenNLPTokenAnnotator;
import org.openimaj.util.pair.IntDoublePair;

/**
 * An implementation of Hatzivassiloglou and McKeown's approach to a
 * semisupervised method of building a bipolar sentiment lexicon. This is a one
 * pass version, in that the corpus to build from is fixed.
 * <p>
 * Adjective pairs joined by " and " or " but " are collected from the corpus,
 * each adjective is turned into a co-occurrence vector, the vectors are split
 * into two clusters, and the confirmation (seed) adjectives given to the
 * constructor decide which of the two clusters is reported as positive.
 * <p>
 * Conjunction counts are accumulated in the instance and are not reset between
 * calls to {@link #build(List)}.
 *
 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
 *
 */
public class TotalLexBuilder {
    /** Index of the "and" co-occurrence map inside each entry of {@link #counts}. */
    private static final int AND = 0;
    /** Index of the "but" co-occurrence map inside each entry of {@link #counts}. */
    private static final int BUT = 1;
    /** Number of clusters requested from k-means (one per polarity). */
    private static final int NUM_CLUSTERS = 2;

    /** Adjective -> co-occurrence vector ("and" counts followed by "but" counts). */
    private HashMap<String, double[]> vectors;
    /** Adjective -> index of the cluster it was assigned to. */
    private HashMap<String, Integer> assignments;
    private List<String> posConfirmation;
    private List<String> negConfirmation;
    private OpenNLPTokenAnnotator tokA;
    private OpenNLPPOSAnnotator posA;
    private OpenNLPSentenceAnnotator sentA;
    /**
     * Adjective -> two-element list (indexed by {@link #AND} / {@link #BUT}) of
     * maps from co-occurring adjective to the number of times it was seen.
     */
    private HashMap<String, List<HashMap<String, Counter>>> counts;

    /**
     * Constructor.
     *
     * @param posConfirmation
     *            = list of positive adjectives used to orient the
     *            classification.
     * @param negConfirmation
     *            = list of negative adjectives used to orient the
     *            classification.
     */
    public TotalLexBuilder(List<String> posConfirmation,
            List<String> negConfirmation)
    {
        this.posConfirmation = posConfirmation;
        this.negConfirmation = negConfirmation;
        tokA = new OpenNLPTokenAnnotator();
        posA = new OpenNLPPOSAnnotator();
        sentA = new OpenNLPSentenceAnnotator();
        this.counts = new HashMap<String, List<HashMap<String, Counter>>>();
    }

    /**
     * Builds a Scored Sentiment mapping of adjectives from the corpus.
     *
     * @param corpus
     *            the documents to mine for adjective conjunctions;
     *            <code>null</code> entries are skipped.
     * @return Scored Sentiment map of adjectives: <code>+1.0</code> for every
     *         adjective in the cluster oriented as positive by the
     *         confirmation lists and <code>-1.0</code> for every other
     *         adjective. Never <code>null</code>; empty when no adjective
     *         conjunction was found.
     */
    public Map<String, Double> build(List<String> corpus) {
        // Find all the adjective conjunctions.
        for (final String doc : corpus) {
            if (doc == null) {
                continue;
            }
            getAdjectiveConjunctions(doc, " and ");
            getAdjectiveConjunctions(doc, " but ");
        }
        // Build the vectors for each adjective
        buildVectors();
        normaliseVectors();
        cluster();
        return scoreAssignments();
    }

    /**
     * Converts the cluster assignments into polarity scores, using the
     * confirmation lists to decide which cluster is the positive one.
     * <p>
     * Every positive seed votes for its own cluster and every negative seed
     * votes against its own cluster; seeds that were not seen in the corpus do
     * not vote. When the votes are tied (including the case where no seed was
     * seen) cluster 0 is treated as positive, so the orientation is arbitrary
     * in that situation.
     *
     * @return adjective to score map, never <code>null</code>.
     */
    private Map<String, Double> scoreAssignments() {
        final Map<String, Double> scores = new HashMap<String, Double>();
        if (assignments == null || assignments.isEmpty()) {
            return scores;
        }
        // Net evidence that cluster 0 is the positive cluster.
        final int vote = seedVoteForClusterZero(posConfirmation)
                - seedVoteForClusterZero(negConfirmation);
        final int positiveCluster = (vote >= 0) ? 0 : 1;
        for (final Map.Entry<String, Integer> entry : assignments.entrySet()) {
            final boolean positive = entry.getValue().intValue() == positiveCluster;
            scores.put(entry.getKey(), Double.valueOf(positive ? 1.0 : -1.0));
        }
        return scores;
    }

    /**
     * Counts how strongly the given seed adjectives sit in cluster 0.
     *
     * @param seeds
     *            seed adjectives, may be <code>null</code>.
     * @return (number of seeds assigned to cluster 0) minus (number of seeds
     *         assigned to any other cluster); seeds without an assignment are
     *         ignored.
     */
    private int seedVoteForClusterZero(List<String> seeds) {
        int vote = 0;
        if (seeds == null) {
            return vote;
        }
        for (final String seed : seeds) {
            final Integer assigned = assignments.get(seed);
            if (assigned != null) {
                vote += (assigned.intValue() == 0) ? 1 : -1;
            }
        }
        return vote;
    }

    /**
     * Clusters the adjective vectors into {@link #NUM_CLUSTERS} groups with
     * exact k-means, stores the cluster index of every adjective in
     * {@link #assignments} and prints each assignment to standard output.
     * Does nothing beyond resetting the assignments when there are no vectors.
     */
    private void cluster() {
        assignments = new HashMap<String, Integer>();
        if (vectors == null || vectors.isEmpty()) {
            // Nothing to cluster: no adjective conjunction was found.
            return;
        }
        // NOTE(review): the first argument is the number of adjectives, while
        // each vector has twice that many entries (see buildVectors) - confirm
        // against the DoubleKMeans.createExact contract.
        final DoubleKMeans fkm = DoubleKMeans.createExact(counts.keySet().size(), NUM_CLUSTERS);
        final double[][] data = new double[vectors.size()][];
        int row = 0;
        for (final double[] vector : vectors.values()) {
            data[row++] = vector;
        }
        final DoubleCentroidsResult centroids = fkm.cluster(data);
        final HardAssigner<double[], double[], IntDoublePair> assigner = centroids.defaultHardAssigner();
        for (final Map.Entry<String, double[]> entry : vectors.entrySet()) {
            assignments.put(entry.getKey(), assigner.assign(entry.getValue()));
        }
        for (final Map.Entry<String, Integer> entry : assignments.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }

    /**
     * Placeholder for vector normalisation; currently leaves the vectors
     * untouched.
     */
    private void normaliseVectors() {

    }

    /**
     * Builds one vector per collected adjective. For <code>n</code> collected
     * adjectives each vector has <code>2n</code> entries: entry <code>i</code>
     * is the "and" count with the <code>i</code>-th adjective and entry
     * <code>i + n</code> is the "but" count with it. Adjectives are indexed in
     * the iteration order of {@link #counts}.
     */
    private void buildVectors() {
        vectors = new HashMap<String, double[]>();
        final int adjectiveCount = counts.size();
        for (final Map.Entry<String, List<HashMap<String, Counter>>> entry : counts.entrySet()) {
            // Freshly allocated, so every entry already starts at 0.
            final double[] vector = new double[adjectiveCount * 2];
            final Map<String, Counter> andCount = entry.getValue().get(AND);
            final Map<String, Counter> butCount = entry.getValue().get(BUT);
            int i = 0;
            for (final String adjInc : counts.keySet()) {
                final Counter withAnd = andCount.get(adjInc);
                if (withAnd != null) {
                    vector[i] = withAnd.count;
                }
                final Counter withBut = butCount.get(adjInc);
                if (withBut != null) {
                    vector[i + adjectiveCount] = withBut.count;
                }
                i++;
            }
            vectors.put(entry.getKey(), vector);
        }
    }

    /**
     * Finds every occurrence of the conjunction in the document and records
     * the adjective pair around it, if there is one.
     * <p>
     * The document is only annotated (sentences, tokens, part of speech) when
     * it contains the conjunction at least once. If annotation fails the stack
     * trace is printed and the document is skipped.
     *
     * @param toSearch
     *            the document text.
     * @param conjunction
     *            the conjunction to look for, expected to carry exactly one
     *            leading character before the word itself (e.g. " and ").
     *            Anything that does not trim to "and" is counted as "but".
     */
    private void getAdjectiveConjunctions(String toSearch, String conjunction) {
        int matchStart = toSearch.indexOf(conjunction);
        if (matchStart < 0) {
            return;
        }
        final RawTextAnnotation rta = new RawTextAnnotation(toSearch);
        try {
            sentA.annotate(rta);
            tokA.annotate(rta);
            posA.annotate(rta);
        } catch (final MissingRequiredAnnotationException e) {
            e.printStackTrace();
            // Without the annotations there is nothing to look up below.
            return;
        }
        final List<SentenceAnnotation> sentences = rta
                .getAnnotationsFor(SentenceAnnotation.class);
        if (sentences == null) {
            return;
        }
        final int type = conjunction.trim().equals("and") ? AND : BUT;
        // Matches are visited left to right, so the sentence search can resume
        // from the sentence that held the previous match.
        int sentI = 0;
        while (matchStart >= 0) {
            // Skip the leading character so loc points at the conjunction word.
            final int loc = matchStart + 1;
            for (int s = sentI; s < sentences.size(); s++) {
                final SentenceAnnotation sentence = sentences.get(s);
                if (sentence.start < loc && sentence.stop > loc) {
                    // Only advance the cursor when the sentence was really
                    // found, so one failed lookup cannot hide later matches.
                    sentI = s;
                    final List<TokenAnnotation> tokens = sentence
                            .getAnnotationsFor(TokenAnnotation.class);
                    // Token offsets are compared relative to the sentence start.
                    final int t = indexOfTokenAt(tokens, loc - sentence.start);
                    if (t >= 0) {
                        checkForConjunctionGroup(tokens, t, type);
                    }
                    break;
                }
            }
            matchStart = toSearch.indexOf(conjunction, matchStart + conjunction.length());
        }
    }

    /**
     * Finds the token that starts at the given offset.
     *
     * @param tokens
     *            the tokens of one sentence, may be <code>null</code>.
     * @param offset
     *            the wanted value of the token's <code>start</code> field.
     * @return index of the first matching token, or <code>-1</code> if there
     *         is none.
     */
    private static int indexOfTokenAt(List<TokenAnnotation> tokens, int offset) {
        if (tokens == null) {
            return -1;
        }
        for (int t = 0; t < tokens.size(); t++) {
            if (tokens.get(t).start == offset) {
                return t;
            }
        }
        return -1;
    }

    /**
     * Tests whether the first part-of-speech annotation of the token has a tag
     * whose string form contains "JJ".
     *
     * @param token
     *            the token to test.
     * @return <code>true</code> if such a tag is present; <code>false</code>
     *         otherwise, including when the token carries no part-of-speech
     *         annotation.
     */
    private static boolean isAdjective(TokenAnnotation token) {
        final List<POSAnnotation> tags = token.getAnnotationsFor(POSAnnotation.class);
        if (tags == null || tags.isEmpty() || tags.get(0).pos == null) {
            return false;
        }
        return tags.get(0).pos.toString().contains("JJ");
    }

    /**
     * Records an adjective pair if the tokens directly before and after the
     * conjunction are both adjectives. The pair is printed to standard output
     * and the counts of both adjectives are updated for the given conjunction
     * type.
     * <p>
     * Note that each adjective is also counted against itself, not only
     * against its partner.
     *
     * @param tokens
     *            the tokens of the sentence holding the conjunction.
     * @param conjunctionIndex
     *            index of the conjunction token within <code>tokens</code>.
     * @param conjunction
     *            {@link #AND} or {@link #BUT}.
     */
    private void checkForConjunctionGroup(List<TokenAnnotation> tokens,
            int conjunctionIndex, int conjunction)
    {
        // The conjunction needs a neighbour on both sides.
        if (conjunctionIndex < 1 || conjunctionIndex + 1 >= tokens.size()) {
            return;
        }
        final TokenAnnotation left = tokens.get(conjunctionIndex - 1);
        final TokenAnnotation right = tokens.get(conjunctionIndex + 1);
        if (!isAdjective(left) || !isAdjective(right)) {
            return;
        }

        System.out.println(left.getStringToken()
                + " "
                + tokens.get(conjunctionIndex).getStringToken()
                + " "
                + right.getStringToken());

        final List<String> adjectives = new ArrayList<String>();
        adjectives.add(left.getStringToken());
        adjectives.add(right.getStringToken());
        for (final String vecAdj : adjectives) {
            List<HashMap<String, Counter>> cons = counts.get(vecAdj);
            if (cons == null) {
                // First sighting: create the empty "and" and "but" maps.
                cons = new ArrayList<HashMap<String, Counter>>();
                cons.add(new HashMap<String, Counter>());
                cons.add(new HashMap<String, Counter>());
                counts.put(vecAdj, cons);
            }
            final Map<String, Counter> conVector = cons.get(conjunction);
            for (final String incAdj : adjectives) {
                final Counter counter = conVector.get(incAdj);
                if (counter == null) {
                    // A new Counter already starts at 1.
                    conVector.put(incAdj, new Counter());
                } else {
                    counter.inc();
                }
            }
        }
    }

    /**
     * Easily incremented object for counting.
     *
     * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
     *
     */
    public class Counter {
        /** The current count. */
        public double count;

        /**
         * Creates a counter that starts at 1.
         */
        public Counter() {
            count = 1.0;
        }

        /**
         * Adds one to the count.
         */
        public void inc() {
            count += 1;
        }
    }

    /**
     * Quick tester for class
     *
     * @param args
     *            ignored.
     */
    public static void main(String[] args) {
        final ArrayList<String> pos = new ArrayList<String>();
        final ArrayList<String> neg = new ArrayList<String>();
        final ArrayList<String> corpus = new ArrayList<String>();
        pos.add("dandy");
        neg.add("horrible");

        corpus.add("Hello, this day is just fine and dandy, I wonder if it is going to turn horrible and sad?."
                + " Hopefully not. "
                + "Then again, if you are fine and warm inside, it would not make a difference. "
                + "Unless a dandy but horrible wolf came along. "
                + "Then we would be be warm but sad inside. "
                + "Our only option would be to offer the sad and horrible wolf the opportunity to be warm and dandy." +
                "warm and fine. dandy and warm. fine but horrible. dandy but sad. sad and horrible." +
                "fine and warm, fine and dandy, fine and warm, fine and dandy");

        // corpus.add("fine and warm");
        final TotalLexBuilder b = new TotalLexBuilder(pos, neg);
        b.build(corpus);
    }

}