001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.sentiment.model.wordlist.util;
031
032import java.io.IOException;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.HashMap;
036import java.util.List;
037import java.util.Map;
038import java.util.Scanner;
039
040import org.openimaj.citation.annotation.Reference;
041import org.openimaj.citation.annotation.ReferenceType;
042import org.openimaj.io.ReadableASCII;
043
044/**
045 * The TFF data format is the word clue format used by OpinionFinder. Details of
046 * MPQA and this format can be found: http://www.cs.pitt.edu/mpqa/
047 * <p>
048 * The way to think about TFF entries are clues that a given word (or set of
049 * words) give to the sentiment and subjectivity of a given phrase. There are
050 * many clever ways to use this information highlighted in this paper:
051 * 
052 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
053 * 
054 */
055@Reference(
056                author = { "Janyce Wiebe", "Theresa Wilson", "Claire Cardie" },
057                title = "Annotating expressions of opinions and emotions in language. ",
058                type = ReferenceType.Article,
059                year = "2005")
060public class TFF implements ReadableASCII {
061        /**
062         * The subjectivity leve
063         * 
064         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
065         * 
066         */
067        public static enum Type {
068                /**
069                 * very subjective
070                 */
071                strongsubj,
072                /**
073                 * weakly subjective
074                 */
075                weaksubj
076        }
077
078        /**
079         * The Part of Speech of this clue. i.e. the clue applies when the word is
080         * at this POS
081         * 
082         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
083         * 
084         */
085        public static enum Pos {
086                /**
087                 * Adjective
088                 */
089                adj,
090                /**
091                 * Adverb
092                 */
093                adverb,
094                /**
095                 * wherever seen
096                 */
097                anypos,
098                /**
099                 * seen as a noun
100                 */
101                noun,
102                /**
103                 * seen as a verb
104                 */
105                verb
106        }
107
108        /**
109         * The polarity of this word in this POS
110         * 
111         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
112         * 
113         */
114        public static enum Polarity {
115                /**
116                 * both positive and negative
117                 */
118                both,
119                /**
120                 * neutral
121                 */
122                neutral,
123                /**
124                 * negative
125                 */
126                negative,
127                /**
128                 * 
129                 */
130                weakneg,
131                /**
132                 * 
133                 */
134                strongneg,
135                /**
136                 * 
137                 */
138                positive,
139                /**
140                 * 
141                 */
142                strongpos,
143                /**
144                 * 
145                 */
146                weakpos
147        }
148
149        /**
150         * A particular clue
151         * 
152         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
153         * 
154         */
155        public static class Clue {
156                /**
157                 * The clue subjectivity
158                 */
159                public Type type;
160                /**
161                 * The words involved in the clue
162                 */
163                public String[] words;
164                /**
165                 * The parts of speech of each word
166                 */
167                public Pos[] poses;
168                /**
169                 * Whether the words are stemmed
170                 */
171                public boolean[] stemmed;
172                /**
173                 * The source of this polarity
174                 */
175                public String polannsrc;
176                /**
177                 * the polarity of the clue
178                 */
179                public Polarity polarity;
180
181                @Override
182                public Clue clone() {
183                        final Clue entry = new Clue();
184                        entry.type = type;
185                        entry.polannsrc = polannsrc;
186                        entry.polarity = polarity;
187                        entry.words = Arrays.copyOf(words, words.length);
188                        entry.poses = Arrays.copyOf(poses, poses.length);
189                        entry.stemmed = Arrays.copyOf(stemmed, stemmed.length);
190                        return entry;
191                }
192        }
193
194        /**
195         * Every clue in this TFF
196         */
197        public ArrayList<Clue> entriesList;
198        /**
199         * Every word mapped to each clue in this TFF
200         */
201        public Map<String, List<Clue>> entriesMap;
202
203        /**
204         * instatiate the clue map and the clue list
205         */
206        public TFF() {
207                entriesMap = new HashMap<String, List<Clue>>();
208                entriesList = new ArrayList<Clue>();
209        }
210
211        @Override
212        public void readASCII(Scanner in) throws IOException {
213                while (in.hasNextLine()) {
214                        final String line = in.nextLine();
215                        if (line.startsWith("#"))
216                                continue;
217                        final String[] parts = line.split(" ");
218                        final Clue entry = new Clue();
219                        for (final String part : parts) {
220                                final String[] namevalue = part.split("=");
221                                if (namevalue.length != 2)
222                                        continue;
223                                final String name = namevalue[0];
224                                final String value = namevalue[1];
225                                if (name.equals("type"))
226                                        entry.type = Enum.valueOf(Type.class, value);
227                                else if (name.equals("len")) {
228                                        final int len = Integer.parseInt(value);
229                                        entry.words = new String[len];
230                                        entry.poses = new Pos[len];
231                                        entry.stemmed = new boolean[len];
232                                }
233                                else if (name.startsWith("word")) {
234                                        final int wordN = Integer.parseInt(name.substring(4)) - 1;
235                                        entry.words[wordN] = value;
236                                }
237                                else if (name.startsWith("pos")) {
238                                        final int posN = Integer.parseInt(name.substring(3)) - 1;
239                                        entry.poses[posN] = Enum.valueOf(Pos.class, value);
240                                }
241                                else if (name.startsWith("stemmed")) {
242                                        final int stemN = Integer.parseInt(name.substring(7)) - 1;
243                                        entry.stemmed[stemN] = value.equals("y");
244                                }
245                                else if (name.equals("polannsrc")) {
246                                        entry.polannsrc = value;
247                                }
248                                else if (name.equals("mpqapolarity")) {
249                                        entry.polarity = Enum.valueOf(Polarity.class, value);
250                                }
251                                else {
252                                        // casually ignore this one!
253                                }
254                        }
255                        this.entriesList.add(entry);
256                        for (final String string : entry.words) {
257                                List<Clue> wordEntries = this.entriesMap.get(string);
258                                if (wordEntries == null)
259                                        this.entriesMap.put(string, wordEntries = new ArrayList<Clue>());
260                                wordEntries.add(entry);
261                        }
262                }
263
264        }
265
266        @Override
267        public String asciiHeader() {
268                return "";
269        }
270
271        @Override
272        public TFF clone() {
273                final TFF tff = new TFF();
274                for (final Clue entry : this.entriesList) {
275                        tff.entriesList.add(entry);
276                        for (final String string : entry.words) {
277                                List<Clue> wordEntries = tff.entriesMap.get(string);
278                                if (wordEntries == null)
279                                        tff.entriesMap.put(string, wordEntries = new ArrayList<Clue>());
280                                wordEntries.add(entry);
281                        }
282                }
283                return tff;
284        }
285
286}