001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.namedentity; 031 032import info.bliki.wiki.filter.PlainTextConverter; 033import info.bliki.wiki.model.WikiModel; 034 035import java.io.BufferedReader; 036import java.io.BufferedWriter; 037import java.io.File; 038import java.io.FileNotFoundException; 039import java.io.FileReader; 040import java.io.FileWriter; 041import java.io.IOException; 042import java.util.HashMap; 043 044import javax.xml.parsers.DocumentBuilder; 045import javax.xml.parsers.DocumentBuilderFactory; 046import javax.xml.parsers.ParserConfigurationException; 047 048import org.apache.commons.lang.StringEscapeUtils; 049import org.apache.lucene.document.FieldType; 050import org.apache.lucene.store.SimpleFSDirectory; 051import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder; 052import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer; 053import org.openimaj.text.nlp.namedentity.YagoEntityExactMatcherFactory.YagoEntityExactMatcher; 054import org.w3c.dom.Document; 055import org.w3c.dom.NodeList; 056import org.xml.sax.SAXException; 057 058/** 059 * This class has various methods that can be used to build the resources 060 * required by {@link YagoEntityCandidateFinder}, 061 * {@link YagoEntityContextScorer} and {@link YagoEntityExactMatcher}. These 062 * resources are a text File of entity aliases, and a lucene index of contextual 063 * data. 064 * 065 * The directory of the stripped down Yago tsv files is required. This directory 066 * can be built with {@link SeedBuilder}. 067 * 068 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 069 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 070 * 071 */ 072public class EntityExtractionResourceBuilder { 073 074 /** 075 * Default file name for the alias text file. 076 */ 077 public static String DEFAULT_ALIAS_NAME = "AliasMapFile.txt"; 078 /** 079 * Default directory name for the lucene index. 080 */ 081 public static String DEFAULT_CONTEXT_NAME = "YagoLucene"; 082 private static String DEFAULT_ROOT_NAME = ".YagoEntityExtraction"; 083 private static String wikiApiPrefix = "http://en.wikipedia.org/w/api.php?format=xml&action=query&titles="; 084 private static String wikiApiSuffix = "&prop=revisions&rvprop=content"; 085 private boolean verbose = true; 086 // This will build for location entities. There are too many for memory. 087 // Leave false. 088 private boolean locations = false; 089 private static BufferedWriter logOut; 090 091 /** 092 * Builds the alias text file in the default location. 093 * 094 * @param seedDirectoryPath 095 * = path location of the stripped down Yago .tsv files. 096 */ 097 public void buildCandidateAliasFile(String seedDirectoryPath) { 098 buildCandidateAliasFile(seedDirectoryPath, getDefaultRootPath() 099 + File.separator + DEFAULT_ALIAS_NAME); 100 } 101 102 /** 103 * Builds the alias text file in the specified location. 104 * 105 * @param seedDirectoryPath 106 * = path location of the stripped down Yago .tsv files. 107 * @param destinationPath 108 * = path to build the alias text file. 109 */ 110 public void buildCandidateAliasFile(String seedDirectoryPath, 111 String destinationPath) 112 { 113 writeAliasFile(getEntities(seedDirectoryPath), destinationPath, 114 seedDirectoryPath); 115 } 116 117 /** 118 * Builds the lucene index in the default path. 119 * 120 * @param seedDirectoryPath 121 * = path location of the stripped down Yago .tsv files. 122 */ 123 public void buildContextLuceneIndex(String seedDirectoryPath) { 124 buildContextLuceneIndex(seedDirectoryPath, getDefaultRootPath() 125 + File.separator + DEFAULT_CONTEXT_NAME); 126 } 127 128 /** 129 * Builds the lucene index at the specified path. 130 * 131 * @param seedDirectoryPath 132 * @param destinationPath 133 */ 134 public void buildContextLuceneIndex(String seedDirectoryPath, 135 String destinationPath) 136 { 137 try { 138 buildIndex(getEntities(seedDirectoryPath), destinationPath, 139 seedDirectoryPath); 140 } catch (final IOException e) { 141 e.printStackTrace(); 142 } 143 } 144 145 /** 146 * Builds the alias text file and the lucene index in the default root 147 * directory. 148 * 149 * @param seedDirectoryPath 150 */ 151 public void buildAll(String seedDirectoryPath) { 152 validateFileStructure(); 153 createLogging(getDefaultRootPath() + File.separator + "log.txt"); 154 buildAll(seedDirectoryPath, getDefaultRootPath()); 155 try { 156 logOut.flush(); 157 logOut.close(); 158 } catch (final IOException e) { 159 e.printStackTrace(); 160 } 161 } 162 163 /** 164 * Builds the alias text file and the lucene index in the specified root 165 * directory. 166 * 167 * @param seedDirectoryPath 168 * @param destinationPath 169 */ 170 public void buildAll(String seedDirectoryPath, String destinationPath) { 171 // Get the entities as people and organisations 172 print("Building All..."); 173 final HashMap<String, YagoNamedEntity> entities = getEntities(seedDirectoryPath); 174 writeAliasFile(entities, destinationPath + File.separator 175 + DEFAULT_ALIAS_NAME, seedDirectoryPath); 176 try { 177 buildIndex(entities, destinationPath + File.separator 178 + DEFAULT_CONTEXT_NAME, seedDirectoryPath); 179 } catch (final IOException e) { 180 e.printStackTrace(); 181 } 182 print("Done"); 183 } 184 185 /** 186 * @return default root directory path for all YagoEntity resources. 187 */ 188 public static String getDefaultRootPath() { 189 return System.getProperty("user.home") + File.separator 190 + DEFAULT_ROOT_NAME; 191 } 192 193 /** 194 * @return default alias text file path. 195 */ 196 public static String getDefaultAliasFilePath() { 197 return getDefaultRootPath() + File.separator + DEFAULT_ALIAS_NAME; 198 } 199 200 /** 201 * @return defualt lucene directory path. 202 */ 203 public static String getDefaultIndexDirectoryPath() { 204 return getDefaultRootPath() + File.separator + DEFAULT_CONTEXT_NAME; 205 } 206 207 public static String getAliasFrom(String rootName) { 208 String result; 209 String noGeo = null; 210 if (rootName.startsWith("geoent_")) { 211 noGeo = rootName.substring(rootName.indexOf('_') + 1, 212 rootName.lastIndexOf('_')); 213 } else 214 noGeo = rootName; 215 final String spaces = noGeo.replaceAll("_", " "); 216 String noParen; 217 if (spaces.contains("(")) 218 noParen = spaces.substring(0, spaces.indexOf("(")); 219 else 220 noParen = spaces; 221 String dropComma; 222 if (noParen.contains(",")) 223 dropComma = noParen.substring(0, spaces.indexOf(",")); 224 else 225 dropComma = noParen; 226 result = dropComma; 227 return result; 228 } 229 230 private void validateFileStructure() { 231 final File rootDir = new File(getDefaultRootPath()); 232 if (!rootDir.isDirectory()) { 233 rootDir.mkdir(); 234 } 235 final File indexDir = new File(getDefaultRootPath() + File.separator 236 + DEFAULT_CONTEXT_NAME); 237 if (!indexDir.isDirectory()) { 238 indexDir.mkdir(); 239 } else { 240 for (final File f : indexDir.listFiles()) 241 f.delete(); 242 } 243 } 244 245 private static void createLogging(String logFilePath) { 246 final File f = new File(logFilePath); 247 if (!f.isFile()) { 248 try { 249 f.createNewFile(); 250 } catch (final IOException e) { 251 e.printStackTrace(); 252 } 253 } else { 254 } 255 FileWriter fstream = null; 256 try { 257 fstream = new FileWriter(logFilePath); 258 logOut = new BufferedWriter(fstream); 259 logOut.write(""); 260 } catch (final IOException e) { 261 // TODO Auto-generated catch block 262 e.printStackTrace(); 263 } 264 265 } 266 267 private void buildIndex(HashMap<String, YagoNamedEntity> entities, 268 String destinationPath, String seedDirectoryPath) 269 throws IOException 270 { 271 print("Building Index..."); 272 setEntityContextValues(entities, seedDirectoryPath); 273 print("Initializing Lucene objects..."); 274 275 // initialize lucene objects 276 final String[] names = { "uri", "context", "type" }; 277 FieldType[] types; 278 final FieldType ti = new FieldType(); 279 ti.setIndexed(true); 280 ti.setTokenized(true); 281 ti.setStored(true); 282 final FieldType n = new FieldType(); 283 n.setStored(true); 284 n.setIndexed(true); 285 types = new FieldType[3]; 286 types[0] = n; 287 types[1] = ti; 288 types[2] = n; 289 final File f = new File(destinationPath); 290 final QuickIndexer qi = new QuickIndexer(new SimpleFSDirectory(f)); 291 292 // Initialize wiki objects 293 final DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory 294 .newInstance(); 295 DocumentBuilder docBuilder = null; 296 Document doc; 297 try { 298 docBuilder = docBuilderFactory.newDocumentBuilder(); 299 } catch (final ParserConfigurationException e) { 300 e.printStackTrace(); 301 } 302 doc = null; 303 final WikiModel wikiModel = new WikiModel( 304 "http://www.mywiki.com/wiki/${image}", 305 "http://www.mywiki.com/wiki/${title}"); 306 int count = 0; 307 print("Building Lucene Index..."); 308 for (final YagoNamedEntity entity : entities.values()) { 309 count++; 310 if (count % 5000 == 0) 311 print("Processed " + count); 312 // if wikiURL, add wiki to context 313 if (entity.wikiURL != null) { 314 final String title = entity.wikiURL.substring(entity.wikiURL 315 .lastIndexOf("/") + 1); 316 try { 317 doc = docBuilder.parse(wikiApiPrefix + title 318 + wikiApiSuffix); 319 } catch (final SAXException e) { 320 e.printStackTrace(); 321 } catch (final IOException e) { 322 e.printStackTrace(); 323 } 324 doc.getDocumentElement().normalize(); 325 final NodeList revisions = doc.getElementsByTagName("rev"); 326 if (revisions.getLength() > 0) { 327 final String markup = revisions.item(0).getTextContent(); 328 329 // convert markup dump to plaintext. 330 final String plainStr = wikiModel.render( 331 new PlainTextConverter(), markup); 332 // add it to the context. 333 entity.addContext(plainStr); 334 } 335 } 336 final String[] values = { entity.rootName, entity.getContext(), 337 entity.type.toString() }; 338 qi.addDocumentFromFields(names, values, types); 339 } 340 qi.finalise(); 341 } 342 343 private void setEntityContextValues( 344 final HashMap<String, YagoNamedEntity> entities, 345 String seedDirectoryPath) 346 { 347 print("Setting Context Values..."); 348 BufferedReader in = null; 349 // Created 350 try { 351 in = openFileAsReadStream(seedDirectoryPath + File.separator 352 + "created_stripped.tsv"); 353 } catch (final FileNotFoundException e) { 354 e.printStackTrace(); 355 } 356 StreamLooper sl = new StreamLooper(in) { 357 @Override 358 protected void doWork(String s) { 359 final String[] values = s.split("\\s+"); 360 final String rootName = values[1]; 361 final String context = convertResource(values[2]); 362 if (entities.keySet().contains(rootName)) { 363 entities.get(rootName).addContext(context); 364 } 365 } 366 }; 367 sl.loop(); 368 369 // wikiAnchorText 370 try { 371 in = openFileAsReadStream(seedDirectoryPath + File.separator 372 + "hasWikipediaAnchorText_stripped.tsv"); 373 } catch (final FileNotFoundException e) { 374 e.printStackTrace(); 375 } 376 sl = new StreamLooper(in) { 377 @Override 378 protected void doWork(String s) { 379 final String[] values = s.split("\\s+"); 380 final String rootName = values[1]; 381 final String context = convertLiteral(values[2]); 382 if (entities.keySet().contains(rootName)) { 383 entities.get(rootName).addContext(context); 384 } 385 } 386 }; 387 sl.loop(); 388 389 // wikiUrl 390 391 try { 392 in = openFileAsReadStream(seedDirectoryPath + File.separator 393 + "hasWikipediaUrl_stripped.tsv"); 394 } catch (final FileNotFoundException e) { 395 e.printStackTrace(); 396 } 397 sl = new StreamLooper(in) { 398 @Override 399 protected void doWork(String s) { 400 final String[] values = s.split("\\s+"); 401 final String rootName = values[1]; 402 if (entities.keySet().contains(rootName)) { 403 entities.get(rootName).wikiURL = values[2].replaceAll("\"", 404 ""); 405 } 406 } 407 }; 408 sl.loop(); 409 // validate 410 print("Validating Context..."); 411 int noContext = 0; 412 for (final YagoNamedEntity ne : entities.values()) { 413 for (final String alias : ne.aliasList) { 414 ne.addContext(alias); 415 } 416 if ((ne.getContext() == null || ne.getContext().equals("")) 417 && ne.wikiURL == null) 418 { 419 noContext++; 420 } 421 } 422 print("No Context: " + noContext); 423 } 424 425 private void setEntityAliasValues( 426 final HashMap<String, YagoNamedEntity> entities, 427 String seedDirectoryPath) 428 { 429 print("Setting Alias Values..."); 430 // Populate 'isCalled' 431 BufferedReader in = null; 432 try { 433 in = openFileAsReadStream(seedDirectoryPath + File.separator 434 + "isCalled_stripped.tsv"); 435 } catch (final FileNotFoundException e) { 436 e.printStackTrace(); 437 } 438 StreamLooper sl = new StreamLooper(in) { 439 @Override 440 protected void doWork(String s) { 441 final String[] values = s.split("\\s+"); 442 final String rootName = values[1]; 443 final String alias = convertLiteral(values[2]); 444 if (entities.keySet().contains(rootName)) { 445 entities.get(rootName).addAlias(alias); 446 } 447 } 448 }; 449 sl.loop(); 450 451 // populate 'means' 452 453 try { 454 in = openFileAsReadStream(seedDirectoryPath + File.separator 455 + "means_stripped.tsv"); 456 } catch (final FileNotFoundException e) { 457 e.printStackTrace(); 458 } 459 sl = new StreamLooper(in) { 460 @Override 461 protected void doWork(String s) { 462 final String[] values = s.split("\\s+"); 463 final String rootName = values[2]; 464 final String alias = convertLiteral(values[1]); 465 // System.out.println(alias); 466 if (entities.keySet().contains(rootName)) { 467 entities.get(rootName).addAlias(alias); 468 } 469 } 470 }; 471 sl.loop(); 472 print("Validating Aliases..."); 473 for (final YagoNamedEntity ne : entities.values()) { 474 final String alias = getAliasFrom(ne.rootName); 475 ne.addAlias(alias); 476 } 477 } 478 479 private void writeAliasFile(HashMap<String, YagoNamedEntity> entities, 480 String destinationPath, String seedDirectoryPath) 481 { 482 setEntityAliasValues(entities, seedDirectoryPath); 483 484 BufferedWriter w; 485 try { 486 w = openFileAsWriteStream(destinationPath); 487 w.write(""); 488 for (final YagoNamedEntity ne : entities.values()) { 489 if (ne.aliasList.size() > 0) { 490 w.append("+" + ne.rootName + "\n"); 491 for (final String alias : ne.aliasList) { 492 w.append("." + alias + "\n"); 493 } 494 } 495 } 496 } catch (final IOException e) { 497 e.printStackTrace(); 498 } 499 } 500 501 private HashMap<String, YagoNamedEntity> getEntities( 502 String seedDirectoryPath) 503 { 504 print("Getting Entities..."); 505 final HashMap<String, YagoNamedEntity> result = new HashMap<String, YagoNamedEntity>(); 506 BufferedReader in = null; 507 try { 508 in = openFileAsReadStream(seedDirectoryPath + File.separator 509 + "wordnet_person_100007846.txt"); 510 } catch (final FileNotFoundException e2) { 511 e2.printStackTrace(); 512 } 513 // get People 514 StreamLooper sl = new StreamLooper(in) { 515 @Override 516 protected void doWork(String s) { 517 final String[] values = s.split("\\s+"); 518 final String rootName = convertLiteral(values[1]); 519 if (!rootName.startsWith("Category:")) { 520 final YagoNamedEntity ne = new YagoNamedEntity(rootName, 521 NamedEntity.Type.Person); 522 result.put(rootName, ne); 523 } 524 } 525 }; 526 sl.loop(); 527 528 // get Organisations 529 try { 530 in = openFileAsReadStream(seedDirectoryPath + File.separator 531 + "wordnet_organization_108008335.txt"); 532 } catch (final FileNotFoundException e1) { 533 e1.printStackTrace(); 534 } 535 sl = new StreamLooper(in) { 536 @Override 537 protected void doWork(String s) { 538 final String[] values = s.split("\\s+"); 539 final String rootName = convertLiteral(values[1]); 540 if (!(rootName.startsWith("Category:") || rootName 541 .startsWith("geoent_"))) 542 { 543 final YagoNamedEntity ne = new YagoNamedEntity(rootName, 544 NamedEntity.Type.Organisation); 545 result.put(rootName, ne); 546 } 547 } 548 }; 549 sl.loop(); 550 551 if (locations) { 552 // get Locations 553 try { 554 in = openFileAsReadStream(seedDirectoryPath + File.separator 555 + "wordnet_location_100027167.txt"); 556 } catch (final FileNotFoundException e1) { 557 e1.printStackTrace(); 558 } 559 sl = new StreamLooper(in) { 560 @Override 561 protected void doWork(String s) { 562 final String[] values = s.split("\\s+"); 563 final String rootName = convertLiteral(values[1]); 564 if (!rootName.startsWith("Category:")) { 565 final YagoNamedEntity ne = new YagoNamedEntity(rootName, 566 NamedEntity.Type.Location); 567 result.put(rootName, ne); 568 } 569 } 570 }; 571 sl.loop(); 572 } 573 print("Total Entities: " + result.size()); 574 return result; 575 } 576 577 public static BufferedReader openFileAsReadStream(String path) 578 throws FileNotFoundException 579 { 580 FileReader fr = null; 581 fr = new FileReader(path); 582 final BufferedReader br = new BufferedReader(fr); 583 return br; 584 } 585 586 public static BufferedWriter openFileAsWriteStream(String path) 587 throws IOException 588 { 589 FileWriter fw = null; 590 fw = new FileWriter(path); 591 final BufferedWriter bw = new BufferedWriter(fw); 592 return bw; 593 } 594 595 private static String convertLiteral(String literal) { 596 final String escaped = StringEscapeUtils.unescapeJava(literal); 597 String first = null; 598 if (escaped.startsWith("\"")) 599 first = escaped.substring(1); 600 else 601 first = escaped; 602 if (first.endsWith("\"")) 603 return first.substring(0, first.length() - 1); 604 else 605 return first; 606 } 607 608 private static String convertResource(String literal) { 609 final String escaped = StringEscapeUtils.unescapeJava(literal); 610 return escaped.replaceAll("_", " "); 611 } 612 613 private void print(String message) { 614 if (verbose) 615 System.out.println(message); 616 if (logOut != null) { 617 log(message); 618 } 619 } 620 621 private void log(String message) { 622 try { 623 logOut.append(message + "\n"); 624 } catch (final IOException e) { 625 e.printStackTrace(); 626 } 627 } 628 629 /** 630 * Defualt main. 631 * 632 * @param args 633 * = path to the seed directory. 634 */ 635 public static void main(String[] args) { 636 new EntityExtractionResourceBuilder().buildCandidateAliasFile(args[0]); 637 } 638 639 /** 640 * Helper class to iterate through the lines of a Reader to do a bit of work 641 * on each. 642 * 643 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 644 * 645 */ 646 public static abstract class StreamLooper { 647 BufferedReader reader; 648 649 public StreamLooper(BufferedReader reader) { 650 this.reader = reader; 651 } 652 653 /** 654 * Iterates through each line to do the work. 655 */ 656 public void loop() { 657 String s = null; 658 try { 659 while ((s = reader.readLine()) != null) { 660 doWork(s); 661 } 662 reader.close(); 663 } catch (final IOException e) { 664 e.printStackTrace(); 665 } 666 } 667 668 /** 669 * Do what you want to each line here. 670 * 671 * @param s 672 */ 673 protected abstract void doWork(String s); 674 } 675 676}