001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.namedentity; 031 032import java.io.BufferedWriter; 033import java.io.File; 034import java.io.FileWriter; 035import java.io.IOException; 036import java.io.UnsupportedEncodingException; 037import java.util.ArrayList; 038import java.util.HashMap; 039import java.util.HashSet; 040import java.util.List; 041import java.util.Map; 042import java.util.Set; 043 044import javax.xml.parsers.DocumentBuilder; 045import javax.xml.parsers.DocumentBuilderFactory; 046import javax.xml.parsers.ParserConfigurationException; 047 048import org.apache.commons.lang.StringUtils; 049import org.openimaj.experiment.evaluation.classification.BasicClassificationResult; 050import org.openimaj.experiment.evaluation.classification.ClassificationEvaluator; 051import org.openimaj.experiment.evaluation.classification.ClassificationResult; 052import org.openimaj.experiment.evaluation.classification.analysers.roc.ROCAnalyser; 053import org.openimaj.experiment.evaluation.classification.analysers.roc.ROCResult; 054import org.openimaj.ml.annotation.ScoredAnnotation; 055import org.openimaj.text.nlp.EntityTweetTokeniser; 056import org.openimaj.text.nlp.TweetTokeniserException; 057import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder; 058import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer; 059import org.w3c.dom.Document; 060import org.w3c.dom.NamedNodeMap; 061import org.w3c.dom.Node; 062import org.w3c.dom.NodeList; 063import org.xml.sax.SAXException; 064 065/** 066 * Experiment for examining the ability of a 067 * Yago based organisation extractor. 068 * 069 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 070 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 071 * 072 */ 073public class YagoCompanyAnnotatorEvaluator { 074 private static final String CLASSIFICATION = "Organistaion"; 075 private static BufferedWriter logOut; 076 private static boolean logging; 077 private DocumentBuilderFactory docBuilderFactory; 078 private DocumentBuilder docBuilder; 079 private Map<FileEntityLocation, Set<String>> actual; 080 private Map<FileEntityLocation, ClassificationResult<String>> results; 081 private final YagoEntityCompleteAnnotator ycca; 082 private EntityTweetTokeniser tt; 083 private ClassificationEvaluator<ROCResult<String>, String, FileEntityLocation> ce; 084 private ROCAnalyser<FileEntityLocation, String> ra; 085 private boolean verbose=false; 086 087 /** 088 * @param args 089 * the first argument must be the alias list/index directory 090 */ 091 public static void main(String[] args) { 092 if (args.length < 1) { 093 System.out.println("You have not given me a directory for the Test data."); 094 System.exit(1); 095 } 096 if(args.length==2){ 097 createLogging(args[1]); 098 logging = true; 099 } 100 else{ 101 System.out.println("No logging file specified."); 102 logging=false; 103 } 104 final YagoCompanyAnnotatorEvaluator ya = new YagoCompanyAnnotatorEvaluator(); 105 ya.run(args[0]); 106 } 107 108 private static void createLogging(String logFilePath) { 109 File f = new File(logFilePath); 110 if(!f.isFile()){ 111 try { 112 f.createNewFile(); 113 } catch (IOException e) { 114 e.printStackTrace(); 115 } 116 } 117 else{ 118 } 119 FileWriter fstream = null; 120 try { 121 fstream = new FileWriter(logFilePath); 122 logOut = new BufferedWriter(fstream); 123 logOut.write(""); 124 } catch (IOException e) { 125 e.printStackTrace(); 126 } 127 128 } 129 130 /** 131 * instantiates the annotator 132 */ 133 public YagoCompanyAnnotatorEvaluator() { 134 YagoEntityCandidateFinder ycf = null; 135 ycf = YagoEntityCandidateFinderFactory.createFromAliasFile(EntityExtractionResourceBuilder.getDefaultAliasFilePath()); 136 YagoEntityContextScorer ycs = null; 137 ycs = YagoEntityContextScorerFactory.createFromIndexFile(EntityExtractionResourceBuilder.getDefaultIndexDirectoryPath()); 138 ycca = new YagoEntityCompleteAnnotator(ycs,ycf); 139 } 140 141 /** 142 * @param testDirectory 143 * given a directory, run the evaluation 144 */ 145 public void run(String testDirectory) { 146 System.out.println("Started...."); 147 buildTruthAndClassifications(testDirectory); 148 ra = new ROCAnalyser<YagoCompanyAnnotatorEvaluator.FileEntityLocation, String>(); 149 ce = new ClassificationEvaluator<ROCResult<String>, String, FileEntityLocation>(results, actual, ra); 150 final ROCResult<String> analysisResult = ce.analyse(ce.evaluate()); 151 System.out.println(analysisResult.getDetailReport()); 152 doMyCalcs(); 153 if(logging) 154 try { 155 logOut.flush(); 156 logOut.close(); 157 } catch (IOException e) { 158 e.printStackTrace(); 159 } 160 } 161 162 private void doMyCalcs() { 163 double fp=0; 164 double tp=0; 165 double fn=0; 166 for(FileEntityLocation fe:results.keySet()){ 167 if(actual.keySet().contains(fe))tp++; 168 else fp++; 169 } 170 for(FileEntityLocation fe:actual.keySet()){ 171 if(!results.keySet().contains(fe))fn++; 172 } 173 System.out.println("Precision : "+(tp/(tp+fp))); 174 System.out.println("Recall : "+(tp/(tp+fn))); 175 } 176 177 /** 178 * @param testDirectory 179 */ 180 private void buildTruthAndClassifications(String testDirectory) { 181 final File f = new File(testDirectory); 182 actual = new HashMap<FileEntityLocation, Set<String>>(); 183 results = new HashMap<FileEntityLocation, ClassificationResult<String>>(); 184 if (f.isDirectory()) { 185 // Initialize XML parsing objects 186 docBuilderFactory = DocumentBuilderFactory.newInstance(); 187 docBuilder = null; 188 try { 189 docBuilder = docBuilderFactory.newDocumentBuilder(); 190 } catch (final ParserConfigurationException e) { 191 e.printStackTrace(); 192 } 193 194 for (final File s : f.listFiles()) { 195 final String name = s.getName(); 196 print("#################Processing " + name); 197 if (name.substring(name.lastIndexOf(".") + 1).equals("xml")) { 198 Document doc = null; 199 try { 200 doc = docBuilder.parse(s); 201 } catch (final SAXException e) { 202 203 e.printStackTrace(); 204 } catch (final IOException e) { 205 e.printStackTrace(); 206 } 207 doc.getDocumentElement().normalize(); 208 final HashMap<Integer, String> res = getResultsFrom(doc.getElementsByTagName("TextWithNodes").item(0) 209 .getTextContent(), s.getAbsolutePath()); 210 final HashMap<Integer, String> act = getActualFrom(doc.getElementsByTagName("TextWithNodes").item(0) 211 .getTextContent(), doc.getElementsByTagName("AnnotationSet"), s.getAbsolutePath()); 212 print("---------MY MISSES----------"); 213 for (final int key : act.keySet()) { 214 if (!res.keySet().contains(key)) { 215 print(act.get(key)); 216 } 217 } 218 print("---------THEIR MISSES----------"); 219 for (final int key : res.keySet()) { 220 if (!act.keySet().contains(key)) { 221 print(res.get(key)); 222 } 223 } 224 } 225 } 226 } 227 } 228 229 private HashMap<Integer, String> getResultsFrom(String textContent, String filePath) { 230 print("---------RESULTS----------"); 231 try { 232 tt = new EntityTweetTokeniser(textContent); 233 } catch (final UnsupportedEncodingException e) { 234 e.printStackTrace(); 235 } catch (final TweetTokeniserException e) { 236 e.printStackTrace(); 237 } 238 final ArrayList<String> tokens = (ArrayList<String>) tt.getStringTokens(); 239 final List<ScoredAnnotation<HashMap<String, Object>>> annos = ycca.annotate(tokens); 240 final HashMap<Integer, String> r = new HashMap<Integer, String>(); 241 for (final ScoredAnnotation<HashMap<String, Object>> anno : annos) { 242 if (anno.annotation.get(EntityAnnotator.TYPE)==NamedEntity.Type.Organisation.toString()){ 243 final FileEntityLocation fe = getFE(anno, textContent, tokens); 244 final BasicClassificationResult<String> c = new BasicClassificationResult<String>(); 245 c.put(CLASSIFICATION, 1); 246 fe.file = filePath; 247 results.put(fe, c); 248 if (fe.start >= 0 && fe.start < textContent.length() 249 && fe.stop >= 0 && fe.stop < textContent.length() 250 && fe.stop > fe.start) { 251 final String s = textContent.substring(fe.start, fe.stop) 252 + " " + fe.start + ", " + fe.stop; 253 r.put(fe.start + fe.stop, s); 254 print(s); 255 } else 256 System.err.println("Substring out of range for :" 257 + anno.annotation.get(EntityAnnotator.URI)); 258 } 259 //else System.out.println("Skipped person : "+anno.annotation.get(EntityAnnotator.URI)); 260 } 261 return r; 262 } 263 264 private FileEntityLocation getFE(ScoredAnnotation<HashMap<String, Object>> anno, String textContent, 265 ArrayList<String> tokens) 266 { 267 // calculate the start char index 268 final int sInd = (Integer) anno.annotation.get(EntityAnnotator.START_TOKEN); 269 final String sToken = tokens.get(sInd); 270 // join all previous tokens with empty and get length 271 int minStartChar = StringUtils.join(tokens.subList(0, sInd), "").length(); 272 // get the index of the first occurrence of the token after the minimum 273 int startCharOff = textContent.substring(minStartChar).indexOf(sToken); 274 final int startChar = minStartChar + startCharOff; 275 // calculate the end char index 276 final int eInd = (Integer) anno.annotation.get(EntityAnnotator.END_TOKEN); 277 final String eToken = tokens.get(eInd); 278 minStartChar = StringUtils.join(tokens.subList(0, eInd), "").length(); 279 startCharOff = textContent.substring(minStartChar).indexOf(eToken); 280 final int endChar = minStartChar + startCharOff + eToken.length(); 281 final FileEntityLocation fe = new FileEntityLocation(); 282 fe.start = startChar; 283 fe.stop = endChar; 284 return fe; 285 } 286 287 private HashMap<Integer, String> getActualFrom(String textContent, NodeList anoSets, String filePath) { 288 print("---------Actual----------"); 289 final HashSet<String> c = new HashSet<String>(); 290 c.add(CLASSIFICATION); 291 final HashMap<Integer, String> r = new HashMap<Integer, String>(); 292 for (int i = 0; i < anoSets.getLength(); i++) { 293 final Node n = anoSets.item(i); 294 final NamedNodeMap m = n.getAttributes(); 295 if (m.getNamedItem("Name") != null && m.getNamedItem("Name").getNodeValue().equals("Key")) { 296 final NodeList anoChildren = n.getChildNodes(); 297 for (int j = 0; j < anoChildren.getLength(); j++) { 298 final Node child = anoChildren.item(j); 299 if (child.hasAttributes() && child.getAttributes().getNamedItem("Type") != null 300 && child.getAttributes().getNamedItem("Type").getNodeValue().equals("Organization")) 301 { 302 final int startchar = Integer.parseInt(child.getAttributes().getNamedItem("StartNode") 303 .getNodeValue()); 304 final int endchar = Integer 305 .parseInt(child.getAttributes().getNamedItem("EndNode").getNodeValue()); 306 final FileEntityLocation fe = new FileEntityLocation(); 307 fe.file = filePath; 308 fe.start = startchar; 309 fe.stop = endchar; 310 actual.put(fe, c); 311 final String s = textContent.substring(fe.start, fe.stop) + " " + fe.start + ", " + fe.stop; 312 r.put(fe.start + fe.stop, s); 313 print(s); 314 } 315 } 316 } 317 } 318 return r; 319 } 320 321 /** 322 * @param path to the Gate document 323 * @return plain text of document. 324 */ 325 public static String getRawStringFromTest(String path) { 326 final File f = new File(path); 327 Document doc = null; 328 DocumentBuilderFactory factory = null; 329 DocumentBuilder docBuilder = null; 330 // Initialize XML parsing objects 331 factory = DocumentBuilderFactory.newInstance(); 332 docBuilder = null; 333 try { 334 docBuilder = factory.newDocumentBuilder(); 335 } catch (final ParserConfigurationException e) { 336 e.printStackTrace(); 337 } 338 try { 339 doc = docBuilder.parse(f); 340 } catch (final SAXException e) { 341 e.printStackTrace(); 342 } catch (final IOException e) { 343 e.printStackTrace(); 344 } 345 doc.getDocumentElement().normalize(); 346 return doc.getElementsByTagName("TextWithNodes").item(0).getTextContent(); 347 } 348 349 private void print(String message){ 350 if(verbose)System.out.println(message); 351 if(logging) 352 try { 353 logOut.append(message+"\n"); 354 } catch (IOException e) { 355 e.printStackTrace(); 356 } 357 } 358 359 /** 360 * An object which uniquely identifies and equates a start/stop in a 361 * specific file. 362 * 363 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 364 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 365 * 366 */ 367 public static class FileEntityLocation { 368 String file; 369 int start; 370 int stop; 371 372 @Override 373 public int hashCode() { 374 final int prime = 31; 375 int result = 1; 376 result = prime * result + ((file == null) ? 0 : file.hashCode()); 377 result = prime * result + start; 378 result = prime * result + stop; 379 return result; 380 } 381 382 @Override 383 public boolean equals(Object obj) { 384 if (!(obj instanceof FileEntityLocation)) 385 return false; 386 final FileEntityLocation comp = (FileEntityLocation) obj; 387 if (!comp.file.equals(this.file)) 388 return false; 389 if (comp.start != this.start) 390 return false; 391 if (comp.stop != this.stop) 392 return false; 393 return true; 394 } 395 396 } 397 398}