Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.sentiment.model.classifier;
031
032import java.util.ArrayList;
033import java.util.HashMap;
034import java.util.List;
035import java.util.Map.Entry;
036
037import org.openimaj.feature.DoubleFV;
038import org.openimaj.feature.FeatureExtractor;
039import org.openimaj.ml.annotation.AbstractAnnotator;
040import org.openimaj.ml.annotation.bayes.NaiveBayesAnnotator;
041
042/**
043 * {@link FeatureExtractor} that is suitable for {@link NaiveBayesAnnotator}.
044 * Should be initialized with training corpus of the machine learning
045 * {@link AbstractAnnotator} you are using.
046 *
047 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
048 *
049 */
050public class GeneralSentimentFeatureExtractor implements
051FeatureExtractor<DoubleFV, List<String>>
052{
053
054        private ArrayList<String> vocabList;
055        private int wordOccuranceThresh = 50;
056
057        /**
058         * Construct with the training set. This is required to build a vocabulary.
059         *
060         * @param domainVocabularyCorpus
061         *            list of tokenised corpus documents.
062         */
063        public GeneralSentimentFeatureExtractor(
064                        List<List<String>> domainVocabularyCorpus)
065        {
066                initialize(domainVocabularyCorpus);
067        }
068
069        /**
070         * Blank constructor. Will require initialize to be called at a later stage.
071         */
072        public GeneralSentimentFeatureExtractor() {
073
074        }
075
076        /**
077         * Allows a new vocabulary to be constructed from a new corpus.
078         *
079         * @param domainVocabularyCorpus
080         *            list of tokenised corpus documents.
081         */
082        public void initialize(List<List<String>> domainVocabularyCorpus) {
083                final HashMap<String, Integer> vocab = new HashMap<String, Integer>();
084                for (final List<String> doc : domainVocabularyCorpus) {
085                        for (final String s : doc) {
086                                Integer current = vocab.get(s);
087                                if (current == null)
088                                        current = 0;
089                                vocab.put(s, current + 1);
090
091                        }
092                }
093                this.vocabList = new ArrayList<String>();
094                for (final Entry<String, Integer> entry : vocab.entrySet()) {
095                        if (entry.getValue() > wordOccuranceThresh) {
096                                vocabList.add(entry.getKey());
097                        }
098                }
099        }
100
101        @Override
102        public DoubleFV extractFeature(List<String> tokens) {
103                final double[] vect = new double[vocabList.size()];
104                for (int i = 0; i < vect.length; i++) {
105                        vect[i] += 0.00001;
106                }
107                for (final String s : tokens) {
108                        final int ind = vocabList.indexOf(s);
109                        if (ind >= 0)
110                                vect[ind] += 1;
111                }
112                final double[] vectNorm = new double[vocabList.size()];
113                for (int i = 0; i < vect.length; i++) {
114                        vectNorm[i] = vect[i] / tokens.size();
115                }
116                return new DoubleFV(vect);
117        }
118
119}