/**
 * Copyright (c) 2012, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.tools.twitter.modes.preprocessing;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.kohsuke.args4j.Option;
import org.openimaj.ml.annotation.ScoredAnnotation;
import org.openimaj.text.nlp.namedentity.EntityExtractionResourceBuilder;
import org.openimaj.text.nlp.namedentity.YagoEntityCandidateAnnotator;
import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory;
import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder;
import org.openimaj.text.nlp.namedentity.YagoEntityCompleteAnnotator;
import org.openimaj.text.nlp.namedentity.YagoEntityContextAnnotator;
import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory;
import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer;
import org.openimaj.twitter.USMFStatus;

/**
 * -m NER
 *
 * Named Entity Recognition Mode. This mode can produce three kinds of
 * annotation, all stored under the heading of Named_Entities. Which ones are
 * produced is selected with the -sea option:
 * <ul>
 * <li>CANDIDATES - returns lists of possible Named Entities based on character
 * matches of aliases.</li>
 * <li>CONTEXT - returns the Named Entities with the highest contextual scores.</li>
 * <li>DISAMBIG - returns non overlapping unique Named Entities that have been
 * disambiguated based on context.</li>
 * <li>ALL - all of the above; also used when -sea is not given.</li>
 * </ul>
 *
 * NB! - Requires the YagoEntityExtraction resource folder. See
 * {@link EntityExtractionResourceBuilder} for how to construct this folder.
 *
 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
 *
 */
public class NERMode extends
		TwitterPreprocessingMode<Map<String, List<String>>> {
	/** Analysis key under which the whole result map is stored on the status. */
	private static final String NAMED_ENT_REC = "Named_Entities";
	/** Result-map key for the alias lookup (candidate) annotations. */
	private static final String ALIAS_LOOKUP = "Entity_Candidates";
	/** Result-map key for the context scored annotations. */
	private static final String CONTEXT_SCORES = "Entity_Context_Scores";
	/** Result-map key for the disambiguated annotations. */
	private static final String DISAMBIGUATED = "Entity_Disambiguated";

	private YagoEntityCandidateAnnotator ylca;
	private YagoEntityContextAnnotator ywca;
	private YagoEntityCompleteAnnotator ycca;

	/** The annotation kinds selectable through the -sea option. */
	enum NERModeMode {
		ALL, CANDIDATES, CONTEXT, DISAMBIG
	}

	// Deliberately starts empty. A multi-valued args4j option adds every parsed
	// value to the list already held by the field, so a list pre-seeded with
	// ALL would keep ALL next to any explicit selection and the selection would
	// never take effect. An empty list is interpreted as ALL (see isEnabled).
	@Option(name = "--set-entity-annotations", aliases = "-sea", required = false, usage = "The named entity annotations to be performed. Default is ALL", multiValued = true)
	private List<NERModeMode> twitterExtras = new ArrayList<NERModeMode>();

	@Option(name = "--set-resource-path", aliases = "-srp", required = false, usage = "The path to the resource folder. Default used if not specified.")
	private String resourcePath = null;

	/**
	 * Default Constructor. Builds the candidate finder and context scorer from
	 * the default alias file and index directory reported by
	 * {@link EntityExtractionResourceBuilder} and wires the three annotators
	 * from them.
	 */
	public NERMode() {
		// NOTE(review): resourcePath is initialised to null and nothing assigns
		// it before this point, so this branch is always taken during
		// construction. A value supplied through -srp after construction is not
		// read anywhere in this class.
		if (resourcePath == null) {
			YagoEntityCandidateFinder canF = YagoEntityCandidateFinderFactory
					.createFromAliasFile(EntityExtractionResourceBuilder
							.getDefaultAliasFilePath());
			ylca = new YagoEntityCandidateAnnotator(canF);
			YagoEntityContextScorer conS = YagoEntityContextScorerFactory
					.createFromIndexFile(EntityExtractionResourceBuilder
							.getDefaultIndexDirectoryPath());
			ywca = new YagoEntityContextAnnotator(conS);
			ycca = new YagoEntityCompleteAnnotator(conS, canF);
		}
	}

	/**
	 * Decides whether a given kind of annotation should be produced.
	 *
	 * @param mode
	 *            the annotation kind being asked about
	 * @return true if no selection was made at all, if ALL was selected, or if
	 *         the given kind was selected explicitly
	 */
	private boolean isEnabled(NERModeMode mode) {
		return twitterExtras == null
				|| twitterExtras.isEmpty()
				|| twitterExtras.contains(NERModeMode.ALL)
				|| twitterExtras.contains(mode);
	}

	/**
	 * Runs the selected named entity annotators over the tokens of the status
	 * and attaches the outcome to the status under the Named_Entities analysis
	 * key. The attached map always contains the three keys Entity_Candidates,
	 * Entity_Context_Scores and Entity_Disambiguated; the list of an
	 * annotation kind that was not selected stays empty.
	 *
	 * If the status carries no token analysis yet, a {@link TokeniseMode} is
	 * run on it first.
	 *
	 * @param twitterStatus
	 *            the status to annotate; it is modified by this call
	 * @return always null - the annotations are only available through
	 *         {@link USMFStatus#getAnalysis(String)} with
	 *         {@link #getAnalysisKey()}
	 */
	@Override
	public Map<String, List<String>> process(USMFStatus twitterStatus) {
		HashMap<String, ArrayList<HashMap<String, Object>>> result = new HashMap<String, ArrayList<HashMap<String, Object>>>();
		// Add Alias Lookup annotations
		result.put(ALIAS_LOOKUP, new ArrayList<HashMap<String, Object>>());
		// Add context scoring annotations
		result.put(CONTEXT_SCORES, new ArrayList<HashMap<String, Object>>());
		// Add disambiguated annotations
		result.put(DISAMBIGUATED, new ArrayList<HashMap<String, Object>>());

		// Check that the twitterStatus has been tokenised.
		if (twitterStatus.getAnalysis(TokeniseMode.TOKENS) == null) {
			TokeniseMode tm = new TokeniseMode();
			tm.process(twitterStatus);
		}
		@SuppressWarnings("unchecked")
		List<String> allTokens = ((Map<String, List<String>>) twitterStatus
				.getAnalysis(TokeniseMode.TOKENS)).get(TokeniseMode.TOKENS_ALL);

		if (isEnabled(NERModeMode.CANDIDATES)) {
			// Alias Lookup
			for (ScoredAnnotation<HashMap<String, Object>> anno : ylca
					.annotate(allTokens)) {
				result.get(ALIAS_LOOKUP).add(anno.annotation);
			}
		}
		if (isEnabled(NERModeMode.CONTEXT)) {
			// Context
			for (ScoredAnnotation<HashMap<String, Object>> anno : ywca
					.annotate(allTokens)) {
				result.get(CONTEXT_SCORES).add(anno.annotation);
			}
		}
		if (isEnabled(NERModeMode.DISAMBIG)) {
			// Disambiguated
			for (ScoredAnnotation<HashMap<String, Object>> anno : ycca
					.annotate(allTokens)) {
				result.get(DISAMBIGUATED).add(anno.annotation);
			}
		}
		twitterStatus.addAnalysis(NAMED_ENT_REC, result);
		// The declared return type cannot carry the nested annotation
		// structure, so the result is published on the status only.
		return null;
	}

	/**
	 * @return the key under which {@link #process(USMFStatus)} stores its
	 *         result on a status
	 */
	@Override
	public String getAnalysisKey() {
		return NAMED_ENT_REC;
	}

	/**
	 * Tester for mode. Annotates a fixed example sentence and prints every
	 * annotation of each kind to standard output.
	 *
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		NERMode m;
		try {
			m = new NERMode();
		} catch (Exception e) {
			// Without a mode instance there is nothing to test; stop here
			// rather than failing later with a NullPointerException.
			System.err.println("Unable to construct NERMode");
			e.printStackTrace();
			return;
		}
		USMFStatus u = new USMFStatus();
		String query = "British Airways and Lufthansa are airlines";
		System.out.println(query);
		u.fillFromString(query);
		m.process(u);
		HashMap<String, ArrayList<HashMap<String, Object>>> analysis = u
				.getAnalysis(NAMED_ENT_REC);
		System.out.println("ALIAS LOOKUP");
		for (HashMap<String, Object> anno : analysis.get(ALIAS_LOOKUP)) {
			System.out.println(anno.toString());
		}
		System.out.println("CONTEXT");
		for (HashMap<String, Object> anno : analysis.get(CONTEXT_SCORES)) {
			System.out.println(anno.toString());
		}
		System.out.println("DISAMBIGUATION");
		for (HashMap<String, Object> anno : analysis.get(DISAMBIGUATED)) {
			System.out.println(anno.toString());
		}
		System.out.println("Done");
	}

}