001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.namedentity; 031 032import java.io.BufferedReader; 033import java.io.DataInputStream; 034import java.io.FileInputStream; 035import java.io.FileNotFoundException; 036import java.io.IOException; 037import java.io.InputStream; 038import java.io.InputStreamReader; 039import java.util.ArrayList; 040import java.util.Arrays; 041import java.util.Collections; 042import java.util.HashMap; 043import java.util.HashSet; 044import java.util.List; 045import java.util.Map; 046 047import org.apache.commons.lang.StringUtils; 048import org.openimaj.text.nlp.namedentity.NGramGenerator.StringNGramGenerator; 049import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation; 050 051/** 052 * Factory object for : -creating {@link YagoEntityCandidateFinder} in various 053 * ways. -creating Yago Entity Alias text files in various ways. 054 * 055 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 056 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 057 */ 058public class YagoEntityCandidateFinderFactory { 059 060 061 /** 062 * Returns a {@link YagoEntityCandidateFinder} given a path Yago Entity 063 * Alias textfile 064 * 065 * @param pathToAliasFile 066 * @return {@link YagoEntityCandidateFinder} 067 */ 068 public static YagoEntityCandidateFinder createFromAliasFile(String pathToAliasFile){ 069 HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>(); 070 071 InputStream fstream = null; 072 try { 073 fstream = new FileInputStream(pathToAliasFile); 074 } catch (FileNotFoundException e) { 075 e.printStackTrace(); 076 } 077 // Get the object of DataInputStream 078 DataInputStream in = new DataInputStream(fstream); 079 BufferedReader br = new BufferedReader(new InputStreamReader(in)); 080 String strLine = null; 081 try { 082 strLine = br.readLine(); 083 } catch (IOException e) { 084 e.printStackTrace(); 085 } 086 // Read File Line By Line 087 String entity_Uri = null; 088 while (strLine != null) { 089 if (!strLine.startsWith("+") && !strLine.startsWith(".")) { 090 try { 091 strLine = br.readLine(); 092 } catch (IOException e) { 093 e.printStackTrace(); 094 } 095 continue; 096 } 097 if (strLine.startsWith("+")) { 098 entity_Uri = strLine.substring(1); 099 try { 100 strLine = br.readLine(); 101 } catch (IOException e) { 102 e.printStackTrace(); 103 } 104 } 105 while (strLine != null && strLine.startsWith(".")) { 106 String alias = strLine.substring(1).trim(); 107 if (result.containsKey(alias)) { 108 if (!result.get(alias).contains(entity_Uri)) { 109 result.get(alias).add(entity_Uri); 110 } 111 } else { 112 ArrayList<String> comps = new ArrayList<String>(); 113 comps.add(entity_Uri); 114 result.put(alias, comps); 115 } 116 try { 117 strLine = br.readLine(); 118 } catch (IOException e) { 119 e.printStackTrace(); 120 } 121 } 122 } 123 // Close the input stream 124 try { 125 in.close(); 126 } catch (IOException e) { 127 e.printStackTrace(); 128 } 129 return new YagoEntityCandidateFinder(result); 130 } 131 132 /** 133 * Class that uses an Alias HashMap to find candidate Entities for a list of 134 * tokens. 135 */ 136 public static class YagoEntityCandidateFinder { 137 138 private HashMap<String, ArrayList<String>> aliasMap; 139 private IgnoreTokenStripper ss; 140 private ArrayList<Integer> ngrams; 141 142 private YagoEntityCandidateFinder( 143 HashMap<String, ArrayList<String>> aliasMap) { 144 ss = new IgnoreTokenStripper(IgnoreTokenStripper.Language.English); 145 this.aliasMap = aliasMap; 146 this.setNgrams( 1, 2, 3, 4 , 5); 147 }; 148 149 /** 150 * Set the ngram sizes that the CandidateFinder will search with. 151 * @param ngrams 152 */ 153 public void setNgrams(Integer... ngrams) { 154 HashSet<Integer> ngUnique = new HashSet<Integer>( 155 Arrays.asList(ngrams)); 156 ArrayList<Integer> n = new ArrayList<Integer>(ngUnique); 157 Collections.sort(n); 158 this.ngrams = n; 159 } 160 161 /** 162 * Gets candidate entities. 163 * 164 * @param tokens 165 * @return A list of a list of {@link NamedEntity}s that are matched to 166 * the same tokens. ( A token ngram can match to multiple 167 * entities ) 168 */ 169 public List<List<NamedEntity>> getCandidates(List<String> tokens) { 170 // get Ngram entities 171 HashMap<Integer, Map<Integer, List<NamedEntity>>> ngramEntities = new HashMap<Integer, Map<Integer, List<NamedEntity>>>(); 172 for (int i = 0; i < ngrams.size(); i++) { 173 ngramEntities.put(ngrams.get(i), 174 getNgramEntities(ngrams.get(i), tokens)); 175 } 176 // Resolve Collisions 177 Map<Integer, List<NamedEntity>> top = ngramEntities.get(ngrams 178 .get(ngrams.size() - 1)); 179 // For each map of ngram Entities starting from smallest ngram... 180 for (int i = 0; i < ngrams.size(); i++) { 181 int lowSize = ngrams.get(i); 182 Map<Integer, List<NamedEntity>> lowEnts = ngramEntities 183 .get(lowSize); 184 // ...for each ngram Entity in the map... 185 for (int startTokenLow : lowEnts.keySet()) { 186 int endTokenLow = startTokenLow + lowSize - 1; 187 boolean collision = false; 188 // ...check that it does not collide with a larger ngram 189 // entity... 190 breakLoop: for (int j = i + 1; j < ngrams.size(); j++) { 191 192 int highSize = ngrams.get(j); 193 for (int startTokenHigh : ngramEntities.get(highSize) 194 .keySet()) { 195 int endTokenHigh = startTokenHigh + highSize - 1; 196 if ((startTokenLow <= endTokenHigh && startTokenLow >= startTokenHigh) 197 || (endTokenLow >= startTokenHigh && endTokenLow <= endTokenHigh)) { 198 collision = true; 199 break breakLoop; 200 } 201 } 202 } 203 if (!collision) { 204 top.put(startTokenLow, lowEnts.get(startTokenLow)); 205 } 206 } 207 } 208 ArrayList<List<NamedEntity>> rr = new ArrayList<List<NamedEntity>>(); 209 for (List<NamedEntity> entList : top.values()) { 210 rr.add(entList); 211 } 212 return rr; 213 } 214 215 /** 216 * Gets candidate entities. 217 * 218 * @param tokens 219 * @return A list of a list of {@link NamedEntity}s that are matched to 220 * the same tokens. ( A token ngram can match to multiple 221 * entities ) 222 */ 223 public List<List<NamedEntity>> getCandidatesFromReversableTokenList(List<TokenAnnotation> tokens) { 224 // get Ngram entities 225 HashMap<Integer, Map<Integer, List<NamedEntity>>> ngramEntities = new HashMap<Integer, Map<Integer, List<NamedEntity>>>(); 226 for (int i = 0; i < ngrams.size(); i++) { 227 ngramEntities.put(ngrams.get(i), 228 getNgramEntitiesFromRTL(ngrams.get(i), tokens)); 229 } 230 // Resolve Collisions 231 Map<Integer, List<NamedEntity>> top = ngramEntities.get(ngrams 232 .get(ngrams.size() - 1)); 233 // For each map of ngram Entities starting from smallest ngram... 234 for (int i = 0; i < ngrams.size(); i++) { 235 int lowSize = ngrams.get(i); 236 Map<Integer, List<NamedEntity>> lowEnts = ngramEntities 237 .get(lowSize); 238 // ...for each ngram Entity in the map... 239 for (int startTokenLow : lowEnts.keySet()) { 240 int endTokenLow = startTokenLow + lowSize - 1; 241 boolean collision = false; 242 // ...check that it does not collide with a larger ngram 243 // entity... 244 breakLoop: for (int j = i + 1; j < ngrams.size(); j++) { 245 246 int highSize = ngrams.get(j); 247 for (int startTokenHigh : ngramEntities.get(highSize) 248 .keySet()) { 249 int endTokenHigh = startTokenHigh + highSize - 1; 250 if ((startTokenLow <= endTokenHigh && startTokenLow >= startTokenHigh) 251 || (endTokenLow >= startTokenHigh && endTokenLow <= endTokenHigh)) { 252 collision = true; 253 break breakLoop; 254 } 255 } 256 } 257 if (!collision) { 258 top.put(startTokenLow, lowEnts.get(startTokenLow)); 259 } 260 } 261 } 262 ArrayList<List<NamedEntity>> rr = new ArrayList<List<NamedEntity>>(); 263 for (List<NamedEntity> entList : top.values()) { 264 rr.add(entList); 265 } 266 return rr; 267 } 268 269 private Map<Integer, List<NamedEntity>> getNgramEntitiesFromRTL( 270 Integer n, List<TokenAnnotation> baseTokens) { 271 NGramGenerator<TokenAnnotation> ngen = new NGramGenerator<TokenAnnotation>(TokenAnnotation.class); 272 List<TokenAnnotation[]> ngrams = ngen.getNGrams( 273 baseTokens, n); 274 List<String> tokens = new ArrayList<String>(); 275 for (int i = 0; i < ngrams.size(); i++) { 276 tokens.add(ngrams.get(0)[0].reverse(Arrays.asList(ngrams.get(i))).trim()); 277 } 278 HashMap<Integer, List<NamedEntity>> result = new HashMap<Integer, List<NamedEntity>>(); 279 // Try and match ngrams 280 for (int i = 0; i < tokens.size(); i++) { 281 String token = tokens.get(i); 282 if (!ss.isIgnoreToken(token)) { 283 ArrayList<String> matches = getYagoCandidates(token); 284 if (matches != null) { 285 ArrayList<NamedEntity> subRes = new ArrayList<NamedEntity>(); 286 for (String match : matches) { 287 NamedEntity ne = new NamedEntity(); 288 TokenAnnotation[] ngram = ngrams.get(i); 289 ne.rootName=match; 290 ne.startChar=ngram[0].start; 291 ne.stopChar=ngram[ngram.length-1].stop; 292 ne.type = NamedEntity.Type.Organisation; 293 subRes.add(ne); 294 } 295 result.put(i, subRes); 296 } 297 } 298 } 299 return result; 300 } 301 302 private HashMap<Integer, List<NamedEntity>> getNgramEntities(int n, 303 List<String> baseTokens) { 304 List<String[]> ngrams = new StringNGramGenerator().getNGrams( 305 baseTokens, n); 306 List<String> tokens = new ArrayList<String>(); 307 for (int i = 0; i < ngrams.size(); i++) { 308 tokens.add(StringUtils.join(ngrams.get(i), " ")); 309 } 310 HashMap<Integer, List<NamedEntity>> result = new HashMap<Integer, List<NamedEntity>>(); 311 // Try and match ngrams 312 for (int i = 0; i < tokens.size(); i++) { 313 String token = tokens.get(i); 314 if (!ss.isIgnoreToken(token)) { 315 ArrayList<String> matches = getYagoCandidates(token); 316 if (matches != null) { 317 ArrayList<NamedEntity> subRes = new ArrayList<NamedEntity>(); 318 for (String match : matches) { 319 NamedEntity ne = new NamedEntity(); 320 ne.rootName = match; 321 ne.startToken = i; 322 ne.stopToken = i - 1 + n; 323 ne.type = NamedEntity.Type.Organisation; 324 subRes.add(ne); 325 } 326 result.put(i, subRes); 327 } 328 } 329 } 330 return result; 331 } 332 333 private ArrayList<String> getYagoCandidates(String token) { 334 if (aliasMap.containsKey(token)) { 335 //System.out.println(aliasMap.get(token)); 336 ArrayList<String> result = aliasMap.get(token); 337 return result; 338 } else 339 return null; 340 } 341 342 } 343 344}