001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import java.io.File;
033import java.util.ArrayList;
034import java.util.List;
035import java.util.Map;
036
037import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder;
038import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer;
039import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;
040
041/**
042 * Constructs a {@link YagoEntityExactMatcher} from provided resource folder or default.
043 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
044 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
045 * 
046 */
047public class YagoEntityExactMatcherFactory {
048
049        /**
050         * Build a {@link YagoEntityExactMatcher} from the default YagoEntity folder
051         * path. See {@link EntityExtractionResourceBuilder} for details on
052         * constructing this folder.
053         * 
054         * @return {@link YagoEntityExactMatcher}
055         */
056        public static YagoEntityExactMatcher getMatcher() {
057                return getMatcher(EntityExtractionResourceBuilder.getDefaultRootPath());
058        }
059
060        /**
061         * Build a {@link YagoEntityExactMatcher} from the provided resource path.
062         * See {@link EntityExtractionResourceBuilder} for details on constructing
063         * this folder.
064         * 
065         * @param yagoEntityFolderPath
066         * @return {@link YagoEntityExactMatcher}
067         */
068        public static YagoEntityExactMatcher getMatcher(String yagoEntityFolderPath) {
069                YagoEntityCandidateFinder ycf = null;
070                ycf = YagoEntityCandidateFinderFactory
071                                .createFromAliasFile(yagoEntityFolderPath
072                                                + File.separator
073                                                + EntityExtractionResourceBuilder.DEFAULT_ALIAS_NAME);
074                YagoEntityContextScorer ycs = null;
075                ycs = YagoEntityContextScorerFactory
076                                .createFromIndexFile(yagoEntityFolderPath
077                                                + File.separator
078                                                + EntityExtractionResourceBuilder.DEFAULT_CONTEXT_NAME);
079                return new YagoEntityExactMatcher(ycs, ycf);
080        }
081
082        /**
083         * The class that will extract unique Entities from a given list of tokens.
084         * 
085         * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
086         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
087         * 
088         */
089        public static class YagoEntityExactMatcher {
090
091                private YagoEntityContextScorer contextScorer;
092                /**
093                 * made public so that you have access to the candidateFinder to setNGrams.
094                 */
095                public YagoEntityCandidateFinder candidateFinder;
096
097                /**
098                 * Default constructor.
099                 * 
100                 * @param contextScorer
101                 * @param candidateFinder
102                 */
103                public YagoEntityExactMatcher(YagoEntityContextScorer contextScorer,
104                                YagoEntityCandidateFinder candidateFinder) {
105                        this.contextScorer = contextScorer;
106                        this.candidateFinder = candidateFinder;
107                }
108
109                /**
110                 * Returns a list of most likely unique Named Entities. These will not
111                 * overlap in the tokens that they have matched.
112                 * 
113                 * @param possibleEntityTokens
114                 * @param contextTokens
115                 * @return list {@link NamedEntity}
116                 */
117                public List<NamedEntity> matchExact(List<String> possibleEntityTokens,
118                                List<String> contextTokens) {
119                        List<NamedEntity> result = new ArrayList<NamedEntity>();
120                        // Check if any candidates are found
121                        List<List<NamedEntity>> candidates = candidateFinder
122                                        .getCandidates(possibleEntityTokens);
123                        // If none found, return an empty.
124                        if (candidates.size() == 0) {
125                                return result;
126                        }
127                        // Use Context Scoring to disambiguate candidates
128                        for (List<NamedEntity> can : candidates) {
129                                ArrayList<String> companies = new ArrayList<String>();
130                                for (NamedEntity ent : can) {
131                                        companies.add(ent.rootName);
132                                }
133                                // get the localised context for each list of named Entities
134                                Map<NamedEntity, Float> contextScores = contextScorer
135                                                .getScoresForEntityList(companies, contextTokens);
136                                float topScore = 0;
137                                NamedEntity resEntity = null;
138                                for (NamedEntity entity : can) {
139                                        if (contextScores.keySet().contains(entity)
140                                                        && contextScores.get(entity) > topScore) {
141                                                resEntity = entity;
142                                                for (NamedEntity te : contextScores.keySet()) {
143                                                        if (resEntity.equals(te)) {
144                                                                resEntity.type = te.type;
145                                                        }
146                                                }
147                                                topScore = contextScores.get(entity);
148                                        }
149                                }
150                                if (resEntity != null)
151                                        result.add(resEntity);
152                        }
153                        return result;
154                }
155
156                /**
157                 * @see #matchExact(List, List)
158                 * @param possibleEntityTokens
159                 * @param context
160                 * @return list of {@link NamedEntity}
161                 */
162                public List<NamedEntity> matchExact(
163                                List<TokenAnnotation> possibleEntityTokens,
164                                String context) {
165                        List<NamedEntity> result = new ArrayList<NamedEntity>();
166                        // Check if any candidates are found
167                        List<List<NamedEntity>> candidates = candidateFinder
168                                        .getCandidatesFromReversableTokenList(possibleEntityTokens);
169                        // If none found, return an empty.
170                        if (candidates.size() == 0) {
171                                return result;
172                        }
173                        // Use Context Scoring to disambiguate candidates
174                        for (List<NamedEntity> can : candidates) {
175                                ArrayList<String> companies = new ArrayList<String>();
176                                for (NamedEntity ent : can) {
177                                        companies.add(ent.rootName);
178                                }
179                                // get the localised context for each list of named Entities
180                                Map<NamedEntity, Float> contextScores = contextScorer
181                                                .getScoresForEntityList(companies, context);
182                                float topScore = 0;
183                                NamedEntity resEntity = null;
184                                for (NamedEntity entity : can) {
185                                        if (contextScores.keySet().contains(entity)
186                                                        && contextScores.get(entity) > topScore) {
187                                                resEntity = entity;
188                                                for (NamedEntity te : contextScores.keySet()) {
189                                                        if (resEntity.equals(te)) {
190                                                                resEntity.type = te.type;                                                               
191                                                        }
192                                                }
193                                                topScore = contextScores.get(entity);
194                                        }
195                                }
196                                if (resEntity != null)
197                                        result.add(resEntity);
198                        }
199                        return result;
200                }
201        }
202
203}