001/**
002 * Copyright (c) 2012, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.twitter.modes.preprocessing;
031
032import java.util.ArrayList;
033import java.util.Arrays;
034import java.util.HashMap;
035import java.util.List;
036import java.util.Map;
037
038import org.kohsuke.args4j.Option;
039import org.openimaj.ml.annotation.ScoredAnnotation;
040import org.openimaj.text.nlp.namedentity.EntityExtractionResourceBuilder;
041import org.openimaj.text.nlp.namedentity.YagoEntityCandidateAnnotator;
042import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory;
043import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder;
044import org.openimaj.text.nlp.namedentity.YagoEntityCompleteAnnotator;
045import org.openimaj.text.nlp.namedentity.YagoEntityContextAnnotator;
046import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory;
047import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer;
048import org.openimaj.twitter.USMFStatus;
049
050/**
051 * -m NER
052 * 
053 * Named Entity Recognition Mode. This mode makes three types of annotation
054 * under the heading of Named_Entities. These can be specified with the -sea
055 * option. CANDIDATES - returns lists of possible Named Entities based on
056 * character matches of aliases. CONTEXT - returns the Named Entities with the
057 * highest contextual scores. DISAMBIG - Returns non overlapping unique Named
058 * Entities that have been disambiguated based on context.
059 * 
060 * NB! - Requires the YagoEntityExtraction resource folder. See
061 * {@link EntityExtractionResourceBuilder} for how to construct this folder.
062 * 
063 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
064 * 
065 */
066public class NERMode extends
067                TwitterPreprocessingMode<Map<String, List<String>>> {
068        private static final String NAMED_ENT_REC = "Named_Entities";
069        private static final String ALIAS_LOOKUP = "Entity_Candidates";
070        private static String CONTEXT_SCORES = "Entity_Context_Scores";
071        private static String DISAMBIGUATED = "Entity_Disambiguated";
072        private YagoEntityCandidateAnnotator ylca;
073        private YagoEntityContextAnnotator ywca;
074        private YagoEntityCompleteAnnotator ycca;
075
076        enum NERModeMode {
077                ALL, CANDIDATES, CONTEXT, DISAMBIG
078        }
079
080        @Option(name = "--set-entity-annotations", aliases = "-sea", required = false, usage = "The named entity annotations to be performed. Default is ALL", multiValued = true)
081        private List<NERModeMode> twitterExtras = new ArrayList<NERModeMode>(
082                        Arrays.asList(new NERModeMode[] { NERModeMode.ALL }));
083
084        @Option(name = "--set-resource-path", aliases = "-srp", required = false, usage = "The path to the resource folder. Default used if not specified.")
085        private String resourcePath = null;
086
087        /**
088         * Default Constructor
089         */
090        public NERMode() {
091                if (resourcePath == null) {
092                        YagoEntityCandidateFinder canF = YagoEntityCandidateFinderFactory
093                                        .createFromAliasFile(EntityExtractionResourceBuilder
094                                                        .getDefaultAliasFilePath());
095                        ylca = new YagoEntityCandidateAnnotator(canF);
096                        YagoEntityContextScorer conS = YagoEntityContextScorerFactory
097                                        .createFromIndexFile(EntityExtractionResourceBuilder
098                                                        .getDefaultIndexDirectoryPath());
099                        ywca = new YagoEntityContextAnnotator(conS);
100                        ycca = new YagoEntityCompleteAnnotator(conS, canF);
101                }
102        }
103
104        @Override
105        public Map<String, List<String>> process(USMFStatus twitterStatus) {
106                HashMap<String, ArrayList<HashMap<String, Object>>> result = new HashMap<String, ArrayList<HashMap<String, Object>>>();
107                // Add Alias Lookup annotations
108                result.put(ALIAS_LOOKUP, new ArrayList<HashMap<String, Object>>());
109                // Add context scoring annotations
110                result.put(CONTEXT_SCORES, new ArrayList<HashMap<String, Object>>());
111                // Add disambiguated annotations
112                result.put(DISAMBIGUATED, new ArrayList<HashMap<String, Object>>());
113
114                // Check that the twitterStatus has been tokenised.
115                if (twitterStatus.getAnalysis(TokeniseMode.TOKENS) == null) {
116                        TokeniseMode tm = new TokeniseMode();
117                        tm.process(twitterStatus);
118                }
119                @SuppressWarnings("unchecked")
120                List<String> allTokens = ((Map<String, List<String>>) twitterStatus
121                                .getAnalysis(TokeniseMode.TOKENS)).get(TokeniseMode.TOKENS_ALL);
122
123                if (twitterExtras.contains(NERModeMode.ALL)
124                                || twitterExtras.contains(NERModeMode.CANDIDATES)) {
125                        // Alias Lookup
126                        for (ScoredAnnotation<HashMap<String, Object>> anno : ylca
127                                        .annotate(allTokens)) {
128                                result.get(ALIAS_LOOKUP).add(anno.annotation);
129                        }
130                }
131                if (twitterExtras.contains(NERModeMode.ALL)
132                                || twitterExtras.contains(NERModeMode.CONTEXT)) {
133                        // Context
134                        for (ScoredAnnotation<HashMap<String, Object>> anno : ywca
135                                        .annotate(allTokens)) {
136                                result.get(CONTEXT_SCORES).add(anno.annotation);
137                        }
138                }
139                if (twitterExtras.contains(NERModeMode.ALL)
140                                || twitterExtras.contains(NERModeMode.DISAMBIG)) {
141                        // Disambiguated
142                        for (ScoredAnnotation<HashMap<String, Object>> anno : ycca
143                                        .annotate(allTokens)) {
144                                result.get(DISAMBIGUATED).add(anno.annotation);
145                        }
146                }
147                twitterStatus.addAnalysis(NAMED_ENT_REC, result);
148                return null;
149        }
150
151        @Override
152        public String getAnalysisKey() {
153                return NAMED_ENT_REC;
154        }
155
156        /**
157         * Tester for mode.
158         * 
159         * @param args
160         */
161        public static void main(String[] args) {
162                NERMode m = null;
163                try {
164                        m = new NERMode();
165                } catch (Exception e) {
166                        e.printStackTrace();
167                }
168                USMFStatus u = new USMFStatus();
169                String query = "British Airways and Lufthansa are airlines";
170                System.out.println(query);
171                u.fillFromString(query);
172                m.process(u);
173                HashMap<String, ArrayList<HashMap<String, Object>>> analysis = u
174                                .getAnalysis(NAMED_ENT_REC);
175                System.out.println("ALIAS LOOKUP");
176                for (HashMap<String, Object> anno : analysis.get(ALIAS_LOOKUP)) {
177                        System.out.println(anno.toString());
178                }
179                System.out.println("CONTEXT");
180                for (HashMap<String, Object> anno : analysis.get(CONTEXT_SCORES)) {
181                        System.out.println(anno.toString());
182                }
183                System.out.println("DISAMBIGUATION");
184                for (HashMap<String, Object> anno : analysis.get(DISAMBIGUATED)) {
185                        System.out.println(anno.toString());
186                }
187                System.out.println("Done");
188        }
189
190}