001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.sentiment.lexicon;
031
032import java.util.ArrayList;
033import java.util.HashMap;
034import java.util.List;
035import java.util.Map;
036
037import org.openimaj.ml.clustering.DoubleCentroidsResult;
038import org.openimaj.ml.clustering.assignment.HardAssigner;
039import org.openimaj.ml.clustering.kmeans.DoubleKMeans;
040import org.openimaj.text.nlp.textpipe.annotations.POSAnnotation;
041import org.openimaj.text.nlp.textpipe.annotations.RawTextAnnotation;
042import org.openimaj.text.nlp.textpipe.annotations.SentenceAnnotation;
043import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;
044import org.openimaj.text.nlp.textpipe.annotators.MissingRequiredAnnotationException;
045import org.openimaj.text.nlp.textpipe.annotators.OpenNLPPOSAnnotator;
046import org.openimaj.text.nlp.textpipe.annotators.OpenNLPSentenceAnnotator;
047import org.openimaj.text.nlp.textpipe.annotators.OpenNLPTokenAnnotator;
048import org.openimaj.util.pair.IntDoublePair;
049
050/**
051 * An implementation of Hatzivassiloglou and McKeown's approach to a
052 * semisupervised method of building a bipolar sentiment lexicon. This is a one
053 * pass version, in that the corpus to build from is fixed.
054 * 
055 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
056 * 
057 */
058public class TotalLexBuilder {
059        private HashMap<String, double[]> vectors;
060        private HashMap<String, Integer> assignments;
061        private List<String> posConfirmation;
062        private List<String> negConfirmation;
063        private OpenNLPTokenAnnotator tokA;
064        private OpenNLPPOSAnnotator posA;
065        private OpenNLPSentenceAnnotator sentA;
066        private HashMap<String, List<HashMap<String, Counter>>> counts;
067        private int AND = 0, BUT = 1;
068
069        /**
070         * Constructor.
071         * 
072         * @param posConfirmation
073         *            = list of positive adjectives used to orient the
074         *            classification.
075         * @param negConfirmation
076         *            = list of negative adjectives used to orient the
077         *            classification.
078         */
079        public TotalLexBuilder(List<String> posConfirmation,
080                        List<String> negConfirmation)
081        {
082                this.posConfirmation = posConfirmation;
083                this.negConfirmation = negConfirmation;
084                tokA = new OpenNLPTokenAnnotator();
085                posA = new OpenNLPPOSAnnotator();
086                sentA = new OpenNLPSentenceAnnotator();
087                this.counts = new HashMap<String, List<HashMap<String, Counter>>>();
088        }
089
090        /**
091         * Builds a Scored Sentiment mapping of adjectives from the corpus.
092         * 
093         * @param corpus
094         * @return Scored Sentiment map of adjectives.
095         */
096        public Map<String, Double> build(List<String> corpus) {
097                // Find all the adjective conjunctions.
098                for (final String doc : corpus) {
099                        getAdjectiveConjunctions(doc, " and ");
100                        getAdjectiveConjunctions(doc, " but ");
101                }
102                // Build the vectors for each adjective
103                buildVectors();
104                normaliseVectors();
105                cluster();
106                return null;
107        }
108
109        private void cluster() {
110                final DoubleKMeans fkm = DoubleKMeans.createExact(counts.keySet().size(), 2);
111                final double[][] data = new double[counts.keySet().size()][];
112                int i = 0;
113                for (final double[] ds : vectors.values()) {
114                        data[i] = ds;
115                        i++;
116                }
117                final DoubleCentroidsResult cluster = fkm.cluster(data);
118                final HardAssigner<double[], double[], IntDoublePair> assigner = cluster.defaultHardAssigner();
119                assignments = new HashMap<String, Integer>();
120                for (final String adj : vectors.keySet()) {
121                        assignments.put(adj, assigner.assign(vectors.get(adj)));
122                }
123                for (final String adj : assignments.keySet()) {
124                        System.out.println(adj + " " + assignments.get(adj));
125                }
126        }
127
128        private void normaliseVectors() {
129
130        }
131
132        private void buildVectors() {
133                vectors = new HashMap<String, double[]>();
134                for (final String adj : counts.keySet()) {
135                        vectors.put(adj, new double[counts.keySet().size() * 2]);
136                        final HashMap<String, Counter> andCount = counts.get(adj).get(AND);
137                        final HashMap<String, Counter> butCount = counts.get(adj).get(BUT);
138                        int i = 0;
139                        for (final String adjInc : counts.keySet()) {
140                                if (andCount.containsKey(adjInc)) {
141                                        vectors.get(adj)[i] = andCount.get(adjInc).count;
142                                } else {
143                                        vectors.get(adj)[i] = 0;
144                                }
145                                if (butCount.containsKey(adjInc)) {
146                                        vectors.get(adj)[(i + counts.keySet().size())] = butCount
147                                                        .get(adjInc).count;
148                                } else {
149                                        vectors.get(adj)[(i + counts.keySet().size())] = 0;
150                                }
151                                i++;
152                        }
153                }
154        }
155
156        private void getAdjectiveConjunctions(String toSearch, String conjunction) {
157                String leftToSearch = toSearch;
158                if (leftToSearch.contains(conjunction)) {
159                        final RawTextAnnotation rta = new RawTextAnnotation(toSearch);
160                        try {
161                                sentA.annotate(rta);
162                                tokA.annotate(rta);
163                                posA.annotate(rta);
164                        } catch (final MissingRequiredAnnotationException e) {
165                                e.printStackTrace();
166                        }
167                        final List<SentenceAnnotation> sentences = rta
168                                        .getAnnotationsFor(SentenceAnnotation.class);
169                        int searchedIndex = 0;
170                        int sentI = 0;
171                        while (leftToSearch.contains(conjunction)) {
172                                final int loc = leftToSearch.indexOf(conjunction) + 1 + searchedIndex;
173                                List<TokenAnnotation> tokens = null;
174                                int t = 0;
175                                searchloop: for (int s = sentI; s < sentences.size(); s++) {
176                                        final SentenceAnnotation sentence = sentences.get(s);
177                                        if (sentence.start < loc && sentence.stop > loc) {
178                                                tokens = sentence
179                                                                .getAnnotationsFor(TokenAnnotation.class);
180                                                for (t = 0; t < tokens.size(); t++) {
181                                                        if (tokens.get(t).start + sentence.start == loc) {
182                                                                break searchloop;
183                                                        }
184                                                }
185                                        }
186                                        sentI++;
187                                }
188                                if (tokens != null) {
189                                        final int c = (conjunction.trim().equals("and")) ? AND : BUT;
190                                        checkForConjunctionGroup(tokens, t, c);
191                                }
192                                searchedIndex = loc + conjunction.length() - 1;
193                                leftToSearch = toSearch.substring(searchedIndex);
194                        }
195                }
196        }
197
198        private void checkForConjunctionGroup(List<TokenAnnotation> tokens,
199                        int conjunctionIndex, int conjunction)
200        {
201                if (tokens.size() > conjunctionIndex + 1
202                                && tokens.get(conjunctionIndex - 1)
203                                                .getAnnotationsFor(POSAnnotation.class).get(0).pos
204                                                .toString().contains("JJ")
205                                && tokens.get(conjunctionIndex + 1)
206                                                .getAnnotationsFor(POSAnnotation.class).get(0).pos
207                                                .toString().contains("JJ"))
208                {
209
210                        System.out.println(tokens.get(conjunctionIndex - 1)
211                                        .getStringToken()
212                                        + " "
213                                        + tokens.get(conjunctionIndex).getStringToken()
214                                        + " "
215                                        + tokens.get(conjunctionIndex + 1).getStringToken());
216
217                        final List<String> adjectives = new ArrayList<String>();
218                        adjectives.add(tokens.get(conjunctionIndex - 1).getStringToken());
219                        adjectives.add(tokens.get(conjunctionIndex + 1).getStringToken());
220                        for (int i = 0; i < adjectives.size(); i++) {
221                                final String vecAdj = adjectives.get(i);
222                                if (!counts.keySet().contains(vecAdj)) {
223                                        final ArrayList<HashMap<String, Counter>> cons = new ArrayList<HashMap<String, Counter>>();
224                                        cons.add(new HashMap<String, Counter>());
225                                        cons.add(new HashMap<String, Counter>());
226                                        counts.put(vecAdj, cons);
227                                }
228                                final HashMap<String, Counter> conVector = counts.get(vecAdj).get(
229                                                conjunction);
230                                for (int j = 0; j < adjectives.size(); j++) {
231                                        final String incAdj = adjectives.get(j);
232                                        if (!conVector.containsKey(incAdj)) {
233                                                conVector.put(incAdj, new Counter());
234                                        } else
235                                                conVector.get(incAdj).inc();
236                                }
237                        }
238                }
239        }
240
241        /**
242         * Easily incremented object for counting.
243         * 
244         * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
245         * 
246         */
247        public class Counter {
248                public double count;
249
250                public Counter() {
251                        count = 1.0;
252                }
253
254                public void inc() {
255                        count += 1;
256                }
257        }
258
259        /**
260         * Quick tester for class
261         * 
262         * @param args
263         */
264        public static void main(String[] args) {
265                final ArrayList<String> pos = new ArrayList<String>();
266                final ArrayList<String> neg = new ArrayList<String>();
267                final ArrayList<String> corpus = new ArrayList<String>();
268                pos.add("dandy");
269                neg.add("horrible");
270
271                corpus.add("Hello, this day is just fine and dandy, I wonder if it is going to turn horrible and sad?."
272                                + " Hopefully not. "
273                                + "Then again, if you are fine and warm inside, it would not make a difference. "
274                                + "Unless a dandy but horrible wolf came along. "
275                                + "Then we would be  be warm but sad inside. "
276                                + "Our only option would be to offer the sad and horrible wolf the opportunity to be warm and dandy." +
277                                "warm and fine. dandy and warm. fine but horrible. dandy but sad. sad and horrible." +
278                                "fine and warm, fine and dandy, fine and warm, fine and dandy");
279
280                // corpus.add("fine and warm");
281                final TotalLexBuilder b = new TotalLexBuilder(pos, neg);
282                b.build(corpus);
283        }
284
285}