/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.text.nlp.namedentity;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder;
import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer;
import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;

/**
 * Constructs a {@link YagoEntityExactMatcher} from a provided resource folder
 * or from the default one.
 *
 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class YagoEntityExactMatcherFactory {

    /**
     * Build a {@link YagoEntityExactMatcher} from the default YagoEntity folder
     * path. See {@link EntityExtractionResourceBuilder} for details on
     * constructing this folder.
     *
     * @return {@link YagoEntityExactMatcher}
     */
    public static YagoEntityExactMatcher getMatcher() {
        return getMatcher(EntityExtractionResourceBuilder.getDefaultRootPath());
    }

    /**
     * Build a {@link YagoEntityExactMatcher} from the provided resource path.
     * See {@link EntityExtractionResourceBuilder} for details on constructing
     * this folder.
     *
     * @param yagoEntityFolderPath
     *            path of the resource folder; the alias file
     *            ({@link EntityExtractionResourceBuilder#DEFAULT_ALIAS_NAME})
     *            and the context index
     *            ({@link EntityExtractionResourceBuilder#DEFAULT_CONTEXT_NAME})
     *            are resolved directly beneath it
     * @return {@link YagoEntityExactMatcher}
     */
    public static YagoEntityExactMatcher getMatcher(String yagoEntityFolderPath) {
        final String aliasFilePath = yagoEntityFolderPath
                + File.separator
                + EntityExtractionResourceBuilder.DEFAULT_ALIAS_NAME;
        final YagoEntityCandidateFinder candidateFinder = YagoEntityCandidateFinderFactory
                .createFromAliasFile(aliasFilePath);

        final String contextIndexPath = yagoEntityFolderPath
                + File.separator
                + EntityExtractionResourceBuilder.DEFAULT_CONTEXT_NAME;
        final YagoEntityContextScorer contextScorer = YagoEntityContextScorerFactory
                .createFromIndexFile(contextIndexPath);

        return new YagoEntityExactMatcher(contextScorer, candidateFinder);
    }

    /**
     * The class that will extract unique Entities from a given list of tokens.
     *
     * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
     * @author Sina Samangooei (ss@ecs.soton.ac.uk)
     *
     */
    public static class YagoEntityExactMatcher {

        private final YagoEntityContextScorer contextScorer;
        /**
         * made public so that you have access to the candidateFinder to
         * setNGrams.
         */
        public YagoEntityCandidateFinder candidateFinder;

        /**
         * Default constructor.
         *
         * @param contextScorer
         *            scorer used to rank the candidates of each group against
         *            the context
         * @param candidateFinder
         *            finder used to produce the candidate groups from tokens
         */
        public YagoEntityExactMatcher(YagoEntityContextScorer contextScorer,
                YagoEntityCandidateFinder candidateFinder) {
            this.contextScorer = contextScorer;
            this.candidateFinder = candidateFinder;
        }

        /**
         * Returns a list of most likely unique Named Entities. These will not
         * overlap in the tokens that they have matched.
         * <p>
         * For every candidate group returned by the candidate finder, the
         * candidate with the highest context score strictly greater than zero
         * is kept; groups without such a candidate contribute nothing.
         *
         * @param possibleEntityTokens
         *            tokens in which to look for entity candidates
         * @param contextTokens
         *            tokens handed to the context scorer for disambiguation
         * @return list {@link NamedEntity}; empty when no candidates are found
         */
        public List<NamedEntity> matchExact(List<String> possibleEntityTokens,
                List<String> contextTokens) {
            final List<NamedEntity> result = new ArrayList<NamedEntity>();
            final List<List<NamedEntity>> candidates = candidateFinder
                    .getCandidates(possibleEntityTokens);
            // Use context scoring to disambiguate each group of candidates.
            // An empty candidate list simply yields an empty result.
            for (final List<NamedEntity> group : candidates) {
                final Map<NamedEntity, Float> contextScores = contextScorer
                        .getScoresForEntityList(rootNamesOf(group), contextTokens);
                final NamedEntity best = selectBest(group, contextScores);
                if (best != null) {
                    result.add(best);
                }
            }
            return result;
        }

        /**
         * Same selection as {@link #matchExact(List, List)}, but the candidates
         * are found from {@link TokenAnnotation}s and the context is supplied
         * as a single string.
         *
         * @see #matchExact(List, List)
         * @param possibleEntityTokens
         *            token annotations in which to look for entity candidates
         * @param context
         *            context text handed to the context scorer
         * @return list of {@link NamedEntity}; empty when no candidates are
         *         found
         */
        public List<NamedEntity> matchExact(
                List<TokenAnnotation> possibleEntityTokens,
                String context) {
            final List<NamedEntity> result = new ArrayList<NamedEntity>();
            final List<List<NamedEntity>> candidates = candidateFinder
                    .getCandidatesFromReversableTokenList(possibleEntityTokens);
            // Use context scoring to disambiguate each group of candidates.
            // An empty candidate list simply yields an empty result.
            for (final List<NamedEntity> group : candidates) {
                final Map<NamedEntity, Float> contextScores = contextScorer
                        .getScoresForEntityList(rootNamesOf(group), context);
                final NamedEntity best = selectBest(group, contextScores);
                if (best != null) {
                    result.add(best);
                }
            }
            return result;
        }

        /**
         * Collects the {@code rootName} of every entity in a candidate group,
         * in iteration order.
         *
         * @param group
         *            candidate entities competing for the same match
         * @return the root names, one per candidate
         */
        private static ArrayList<String> rootNamesOf(List<NamedEntity> group) {
            final ArrayList<String> rootNames = new ArrayList<String>();
            for (final NamedEntity entity : group) {
                rootNames.add(entity.rootName);
            }
            return rootNames;
        }

        /**
         * Picks the candidate with the highest context score from a group.
         * <p>
         * Only scores strictly greater than zero qualify, and on a tie the
         * candidate encountered first wins. Candidates that are absent from
         * the score map (or mapped to {@code null}) are skipped.
         *
         * @param group
         *            candidate entities competing for the same match
         * @param contextScores
         *            scores keyed by entity, as returned by the context scorer
         * @return the best scoring candidate, or {@code null} if none scored
         *         above zero
         */
        private static NamedEntity selectBest(List<NamedEntity> group,
                Map<NamedEntity, Float> contextScores) {
            float topScore = 0;
            NamedEntity best = null;
            for (final NamedEntity entity : group) {
                // Single lookup: a missing key and a null value are both
                // treated as "no score".
                final Float score = contextScores.get(entity);
                if (score != null && score > topScore) {
                    best = entity;
                    // Copy the type from the scorer's equal key onto the
                    // candidate; presumably the scorer's entities carry the
                    // resolved type - confirm against the scorer.
                    for (final NamedEntity scored : contextScores.keySet()) {
                        if (best.equals(scored)) {
                            best.type = scored.type;
                        }
                    }
                    topScore = score;
                }
            }
            return best;
        }
    }

}