/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.text.nlp.sentiment.model.wordlist.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.io.ReadableASCII;

/**
 * The TFF data format is the word clue format used by OpinionFinder. Details of
 * MPQA and this format can be found: http://www.cs.pitt.edu/mpqa/
 * <p>
 * The way to think about TFF entries are clues that a given word (or set of
 * words) give to the sentiment and subjectivity of a given phrase. There are
 * many clever ways to use this information highlighted in the paper cited by
 * the {@link Reference} annotation on this class.
 * <p>
 * Each clue is read from a single line of space separated
 * <code>name=value</code> fields (see {@link #readASCII(Scanner)}).
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
@Reference(
		author = { "Janyce Wiebe", "Theresa Wilson", "Claire Cardie" },
		title = "Annotating expressions of opinions and emotions in language. ",
		type = ReferenceType.Article,
		year = "2005")
public class TFF implements ReadableASCII {
	/**
	 * The subjectivity level
	 *
	 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
	 *
	 */
	public static enum Type {
		/**
		 * very subjective
		 */
		strongsubj,
		/**
		 * weakly subjective
		 */
		weaksubj
	}

	/**
	 * The Part of Speech of this clue. i.e. the clue applies when the word is
	 * at this POS
	 *
	 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
	 *
	 */
	public static enum Pos {
		/**
		 * Adjective
		 */
		adj,
		/**
		 * Adverb
		 */
		adverb,
		/**
		 * wherever seen
		 */
		anypos,
		/**
		 * seen as a noun
		 */
		noun,
		/**
		 * seen as a verb
		 */
		verb
	}

	/**
	 * The polarity of this word in this POS
	 *
	 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
	 *
	 */
	public static enum Polarity {
		/**
		 * both positive and negative
		 */
		both,
		/**
		 * neutral
		 */
		neutral,
		/**
		 * negative
		 */
		negative,
		/**
		 * weakly negative
		 */
		weakneg,
		/**
		 * strongly negative
		 */
		strongneg,
		/**
		 * positive
		 */
		positive,
		/**
		 * strongly positive
		 */
		strongpos,
		/**
		 * weakly positive
		 */
		weakpos
	}

	/**
	 * A particular clue
	 *
	 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
	 *
	 */
	public static class Clue {
		/**
		 * The clue subjectivity
		 */
		public Type type;
		/**
		 * The words involved in the clue
		 */
		public String[] words;
		/**
		 * The parts of speech of each word
		 */
		public Pos[] poses;
		/**
		 * Whether the words are stemmed
		 */
		public boolean[] stemmed;
		/**
		 * The source of this polarity
		 */
		public String polannsrc;
		/**
		 * the polarity of the clue
		 */
		public Polarity polarity;

		/**
		 * Copy this clue. The three arrays are duplicated so that the copy
		 * can be modified without touching this instance; the remaining
		 * fields are immutable values and are simply assigned.
		 *
		 * @return a new clue holding the same values as this one
		 * @throws NullPointerException
		 *             if {@link #words}, {@link #poses} or {@link #stemmed}
		 *             is null
		 */
		@Override
		public Clue clone() {
			final Clue entry = new Clue();
			entry.type = type;
			entry.polannsrc = polannsrc;
			entry.polarity = polarity;
			entry.words = Arrays.copyOf(words, words.length);
			entry.poses = Arrays.copyOf(poses, poses.length);
			entry.stemmed = Arrays.copyOf(stemmed, stemmed.length);
			return entry;
		}
	}

	/**
	 * Every clue in this TFF
	 */
	public ArrayList<Clue> entriesList;
	/**
	 * Every word mapped to each clue in this TFF
	 */
	public Map<String, List<Clue>> entriesMap;

	/**
	 * instantiate the clue map and the clue list
	 */
	public TFF() {
		entriesMap = new HashMap<String, List<Clue>>();
		entriesList = new ArrayList<Clue>();
	}

	/**
	 * Read clues, one per line, appending each to {@link #entriesList} and
	 * registering it in {@link #entriesMap} under every one of its words.
	 * <p>
	 * Lines that are blank, or whose first non-whitespace character is
	 * <code>#</code>, are skipped. Within a clue line, fields that are not of
	 * the form <code>name=value</code>, or whose name is not recognised, are
	 * ignored.
	 *
	 * @param in
	 *            the source of clue lines
	 * @throws IOException
	 *             if a clue line has no <code>len</code> field, or uses an
	 *             indexed field (<code>wordN</code>, <code>posN</code>,
	 *             <code>stemmedN</code>) before its <code>len</code> field
	 */
	@Override
	public void readASCII(Scanner in) throws IOException {
		while (in.hasNextLine()) {
			final String line = in.nextLine();
			final String trimmed = line.trim();
			// Blank lines and comments describe no clue; without this guard
			// a blank line produced a clue with null arrays and an NPE below.
			if (trimmed.length() == 0 || trimmed.startsWith("#"))
				continue;
			final Clue entry = parseClue(line);
			this.entriesList.add(entry);
			indexClue(this.entriesMap, entry);
		}
	}

	/**
	 * Parse a single non-blank, non-comment line into a clue.
	 *
	 * @param line
	 *            the raw line
	 * @return the parsed clue, whose arrays are always allocated
	 * @throws IOException
	 *             if the line is missing <code>len</code> or uses an indexed
	 *             field before it
	 */
	private static Clue parseClue(String line) throws IOException {
		final Clue entry = new Clue();
		for (final String part : line.split(" ")) {
			final String[] namevalue = part.split("=");
			// Anything that is not exactly name=value is noise in the line
			if (namevalue.length != 2)
				continue;
			final String name = namevalue[0];
			final String value = namevalue[1];
			if (name.equals("type")) {
				entry.type = Enum.valueOf(Type.class, value);
			}
			else if (name.equals("len")) {
				// len sizes the per-word arrays filled by the indexed fields
				final int len = Integer.parseInt(value);
				entry.words = new String[len];
				entry.poses = new Pos[len];
				entry.stemmed = new boolean[len];
			}
			else if (name.startsWith("word")) {
				entry.words[fieldIndex(entry, name, 4, line)] = value;
			}
			else if (name.startsWith("pos")) {
				entry.poses[fieldIndex(entry, name, 3, line)] = Enum.valueOf(Pos.class, value);
			}
			else if (name.startsWith("stemmed")) {
				entry.stemmed[fieldIndex(entry, name, 7, line)] = value.equals("y");
			}
			else if (name.equals("polannsrc")) {
				entry.polannsrc = value;
			}
			else if (name.equals("mpqapolarity")) {
				entry.polarity = Enum.valueOf(Polarity.class, value);
			}
			// any other field name is deliberately ignored
		}
		if (entry.words == null)
			throw new IOException("Malformed TFF clue (no len field): " + line);
		return entry;
	}

	/**
	 * Convert the 1-based numeric suffix of an indexed field name (e.g. the
	 * <code>2</code> of <code>word2</code>) to a 0-based array index.
	 *
	 * @param entry
	 *            the clue being filled
	 * @param name
	 *            the full field name
	 * @param prefixLength
	 *            number of leading characters of the name to drop
	 * @param line
	 *            the line being parsed, used for the error message
	 * @return the 0-based index
	 * @throws IOException
	 *             if the clue's arrays have not been allocated yet
	 */
	private static int fieldIndex(Clue entry, String name, int prefixLength, String line) throws IOException {
		final int index = Integer.parseInt(name.substring(prefixLength)) - 1;
		if (entry.words == null)
			throw new IOException("Malformed TFF clue (" + name + " appears before len): " + line);
		return index;
	}

	/**
	 * Register a clue in a word map under each of its words.
	 *
	 * @param map
	 *            the map to update
	 * @param entry
	 *            the clue to register; its words must not be null
	 */
	private static void indexClue(Map<String, List<Clue>> map, Clue entry) {
		for (final String word : entry.words) {
			List<Clue> wordEntries = map.get(word);
			if (wordEntries == null) {
				wordEntries = new ArrayList<Clue>();
				map.put(word, wordEntries);
			}
			wordEntries.add(entry);
		}
	}

	@Override
	public String asciiHeader() {
		return "";
	}

	/**
	 * Copy this TFF. Every clue of {@link #entriesList} is itself copied, so
	 * changes made to a clue of the returned instance do not affect this one.
	 * The returned map is rebuilt from the copied clues, so its values are the
	 * same objects as those in the returned list.
	 *
	 * @return an independent copy of this TFF
	 */
	@Override
	public TFF clone() {
		final TFF tff = new TFF();
		for (final Clue entry : this.entriesList) {
			// one copy per clue, shared between the new list and the new map
			final Clue copy = entry.clone();
			tff.entriesList.add(copy);
			indexClue(tff.entriesMap, copy);
		}
		return tff;
	}

}