001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import java.io.BufferedReader;
033import java.io.DataInputStream;
034import java.io.FileInputStream;
035import java.io.FileNotFoundException;
036import java.io.IOException;
037import java.io.InputStream;
038import java.io.InputStreamReader;
039import java.util.ArrayList;
040import java.util.Arrays;
041import java.util.Collections;
042import java.util.HashMap;
043import java.util.HashSet;
044import java.util.List;
045import java.util.Map;
046
047import org.apache.commons.lang.StringUtils;
048import org.openimaj.text.nlp.namedentity.NGramGenerator.StringNGramGenerator;
049import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;
050
051/**
052 * Factory object for : -creating {@link YagoEntityCandidateFinder} in various
053 * ways. -creating Yago Entity Alias text files in various ways.
054 * 
055 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
056 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
057 */
058public class YagoEntityCandidateFinderFactory {
059        
060        
061        /**
062         * Returns a {@link YagoEntityCandidateFinder} given a path Yago Entity
063         * Alias textfile
064         * 
065         * @param pathToAliasFile
066         * @return {@link YagoEntityCandidateFinder}
067         */
068        public static YagoEntityCandidateFinder createFromAliasFile(String pathToAliasFile){
069                HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
070                
071                InputStream fstream = null;
072                try {
073                        fstream = new FileInputStream(pathToAliasFile);
074                } catch (FileNotFoundException e) {                     
075                        e.printStackTrace();
076                }
077                // Get the object of DataInputStream
078                DataInputStream in = new DataInputStream(fstream);
079                BufferedReader br = new BufferedReader(new InputStreamReader(in));
080                String strLine = null;
081                try {
082                        strLine = br.readLine();
083                } catch (IOException e) {                       
084                        e.printStackTrace();
085                }
086                // Read File Line By Line
087                String entity_Uri = null;
088                while (strLine != null) {
089                        if (!strLine.startsWith("+") && !strLine.startsWith(".")) {
090                                try {
091                                        strLine = br.readLine();
092                                } catch (IOException e) {                                       
093                                        e.printStackTrace();
094                                }
095                                continue;
096                        }
097                        if (strLine.startsWith("+")) {
098                                entity_Uri = strLine.substring(1);
099                                try {
100                                        strLine = br.readLine();
101                                } catch (IOException e) {
102                                        e.printStackTrace();
103                                }                               
104                        }
105                        while (strLine != null && strLine.startsWith(".")) {
106                                String alias = strLine.substring(1).trim();
107                                if (result.containsKey(alias)) {
108                                        if (!result.get(alias).contains(entity_Uri)) {
109                                                result.get(alias).add(entity_Uri);                                              
110                                        }
111                                } else {
112                                        ArrayList<String> comps = new ArrayList<String>();
113                                        comps.add(entity_Uri);
114                                        result.put(alias, comps);                                       
115                                }
116                                try {
117                                        strLine = br.readLine();
118                                } catch (IOException e) {                                       
119                                        e.printStackTrace();
120                                }
121                        }
122                }
123                // Close the input stream
124                try {
125                        in.close();
126                } catch (IOException e) {                       
127                        e.printStackTrace();
128                }               
129                return new YagoEntityCandidateFinder(result);
130        }
131
132        /**
133         * Class that uses an Alias HashMap to find candidate Entities for a list of
134         * tokens.
135         */
136        public static class YagoEntityCandidateFinder {
137
138                private HashMap<String, ArrayList<String>> aliasMap;
139                private IgnoreTokenStripper ss;
140                private ArrayList<Integer> ngrams;
141
142                private YagoEntityCandidateFinder(
143                                HashMap<String, ArrayList<String>> aliasMap) {
144                        ss = new IgnoreTokenStripper(IgnoreTokenStripper.Language.English);
145                        this.aliasMap = aliasMap;
146                        this.setNgrams( 1, 2, 3, 4 , 5);
147                };
148
149                /**
150                 * Set the ngram sizes that the CandidateFinder will search with.
151                 * @param ngrams
152                 */
153                public void setNgrams(Integer... ngrams) {
154                        HashSet<Integer> ngUnique = new HashSet<Integer>(
155                                        Arrays.asList(ngrams));
156                        ArrayList<Integer> n = new ArrayList<Integer>(ngUnique);
157                        Collections.sort(n);
158                        this.ngrams = n;
159                }
160
161                /**
162                 * Gets candidate entities.
163                 * 
164                 * @param tokens
165                 * @return A list of a list of {@link NamedEntity}s that are matched to
166                 *         the same tokens. ( A token ngram can match to multiple
167                 *         entities )
168                 */
169                public List<List<NamedEntity>> getCandidates(List<String> tokens) {
170                        // get Ngram entities
171                        HashMap<Integer, Map<Integer, List<NamedEntity>>> ngramEntities = new HashMap<Integer, Map<Integer, List<NamedEntity>>>();
172                        for (int i = 0; i < ngrams.size(); i++) {
173                                ngramEntities.put(ngrams.get(i),
174                                                getNgramEntities(ngrams.get(i), tokens));
175                        }
176                        // Resolve Collisions
177                        Map<Integer, List<NamedEntity>> top = ngramEntities.get(ngrams
178                                        .get(ngrams.size() - 1));
179                        // For each map of ngram Entities starting from smallest ngram...
180                        for (int i = 0; i < ngrams.size(); i++) {
181                                int lowSize = ngrams.get(i);
182                                Map<Integer, List<NamedEntity>> lowEnts = ngramEntities
183                                                .get(lowSize);
184                                // ...for each ngram Entity in the map...
185                                for (int startTokenLow : lowEnts.keySet()) {
186                                        int endTokenLow = startTokenLow + lowSize - 1;
187                                        boolean collision = false;
188                                        // ...check that it does not collide with a larger ngram
189                                        // entity...
190                                        breakLoop: for (int j = i + 1; j < ngrams.size(); j++) {
191
192                                                int highSize = ngrams.get(j);
193                                                for (int startTokenHigh : ngramEntities.get(highSize)
194                                                                .keySet()) {
195                                                        int endTokenHigh = startTokenHigh + highSize - 1;
196                                                        if ((startTokenLow <= endTokenHigh && startTokenLow >= startTokenHigh)
197                                                                        || (endTokenLow >= startTokenHigh && endTokenLow <= endTokenHigh)) {
198                                                                collision = true;
199                                                                break breakLoop;
200                                                        }
201                                                }
202                                        }
203                                        if (!collision) {
204                                                top.put(startTokenLow, lowEnts.get(startTokenLow));
205                                        }
206                                }
207                        }
208                        ArrayList<List<NamedEntity>> rr = new ArrayList<List<NamedEntity>>();
209                        for (List<NamedEntity> entList : top.values()) {
210                                rr.add(entList);
211                        }
212                        return rr;
213                }
214                
215                /**
216                 * Gets candidate entities.
217                 * 
218                 * @param tokens
219                 * @return A list of a list of {@link NamedEntity}s that are matched to
220                 *         the same tokens. ( A token ngram can match to multiple
221                 *         entities )
222                 */
223                public List<List<NamedEntity>> getCandidatesFromReversableTokenList(List<TokenAnnotation> tokens) {
224                        // get Ngram entities
225                        HashMap<Integer, Map<Integer, List<NamedEntity>>> ngramEntities = new HashMap<Integer, Map<Integer, List<NamedEntity>>>();
226                        for (int i = 0; i < ngrams.size(); i++) {
227                                ngramEntities.put(ngrams.get(i),
228                                                getNgramEntitiesFromRTL(ngrams.get(i), tokens));
229                        }
230                        // Resolve Collisions
231                        Map<Integer, List<NamedEntity>> top = ngramEntities.get(ngrams
232                                        .get(ngrams.size() - 1));
233                        // For each map of ngram Entities starting from smallest ngram...
234                        for (int i = 0; i < ngrams.size(); i++) {
235                                int lowSize = ngrams.get(i);
236                                Map<Integer, List<NamedEntity>> lowEnts = ngramEntities
237                                                .get(lowSize);
238                                // ...for each ngram Entity in the map...
239                                for (int startTokenLow : lowEnts.keySet()) {
240                                        int endTokenLow = startTokenLow + lowSize - 1;
241                                        boolean collision = false;
242                                        // ...check that it does not collide with a larger ngram
243                                        // entity...
244                                        breakLoop: for (int j = i + 1; j < ngrams.size(); j++) {
245
246                                                int highSize = ngrams.get(j);
247                                                for (int startTokenHigh : ngramEntities.get(highSize)
248                                                                .keySet()) {
249                                                        int endTokenHigh = startTokenHigh + highSize - 1;
250                                                        if ((startTokenLow <= endTokenHigh && startTokenLow >= startTokenHigh)
251                                                                        || (endTokenLow >= startTokenHigh && endTokenLow <= endTokenHigh)) {
252                                                                collision = true;
253                                                                break breakLoop;
254                                                        }
255                                                }
256                                        }
257                                        if (!collision) {
258                                                top.put(startTokenLow, lowEnts.get(startTokenLow));
259                                        }
260                                }
261                        }
262                        ArrayList<List<NamedEntity>> rr = new ArrayList<List<NamedEntity>>();
263                        for (List<NamedEntity> entList : top.values()) {
264                                rr.add(entList);
265                        }
266                        return rr;
267                }
268
269                private Map<Integer, List<NamedEntity>> getNgramEntitiesFromRTL(
270                                Integer n, List<TokenAnnotation> baseTokens) {
271                        NGramGenerator<TokenAnnotation> ngen = new NGramGenerator<TokenAnnotation>(TokenAnnotation.class);
272                        List<TokenAnnotation[]> ngrams = ngen.getNGrams(
273                                        baseTokens, n);
274                        List<String> tokens = new ArrayList<String>();
275                        for (int i = 0; i < ngrams.size(); i++) {
276                                tokens.add(ngrams.get(0)[0].reverse(Arrays.asList(ngrams.get(i))).trim());
277                        }
278                        HashMap<Integer, List<NamedEntity>> result = new HashMap<Integer, List<NamedEntity>>();
279                        // Try and match ngrams
280                        for (int i = 0; i < tokens.size(); i++) {
281                                String token = tokens.get(i);
282                                if (!ss.isIgnoreToken(token)) {
283                                        ArrayList<String> matches = getYagoCandidates(token);
284                                        if (matches != null) {
285                                                ArrayList<NamedEntity> subRes = new ArrayList<NamedEntity>();
286                                                for (String match : matches) {
287                                                        NamedEntity ne = new NamedEntity();
288                                                        TokenAnnotation[] ngram = ngrams.get(i);
289                                                        ne.rootName=match;
290                                                        ne.startChar=ngram[0].start;
291                                                        ne.stopChar=ngram[ngram.length-1].stop;
292                                                        ne.type = NamedEntity.Type.Organisation;
293                                                        subRes.add(ne);
294                                                }
295                                                result.put(i, subRes);
296                                        }
297                                }
298                        }
299                        return result;
300                }
301
302                private HashMap<Integer, List<NamedEntity>> getNgramEntities(int n,
303                                List<String> baseTokens) {
304                        List<String[]> ngrams = new StringNGramGenerator().getNGrams(
305                                        baseTokens, n);
306                        List<String> tokens = new ArrayList<String>();
307                        for (int i = 0; i < ngrams.size(); i++) {
308                                tokens.add(StringUtils.join(ngrams.get(i), " "));
309                        }
310                        HashMap<Integer, List<NamedEntity>> result = new HashMap<Integer, List<NamedEntity>>();
311                        // Try and match ngrams
312                        for (int i = 0; i < tokens.size(); i++) {
313                                String token = tokens.get(i);
314                                if (!ss.isIgnoreToken(token)) {
315                                        ArrayList<String> matches = getYagoCandidates(token);
316                                        if (matches != null) {
317                                                ArrayList<NamedEntity> subRes = new ArrayList<NamedEntity>();
318                                                for (String match : matches) {
319                                                        NamedEntity ne = new NamedEntity();
320                                                        ne.rootName = match;
321                                                        ne.startToken = i;
322                                                        ne.stopToken = i - 1 + n;
323                                                        ne.type = NamedEntity.Type.Organisation;
324                                                        subRes.add(ne);
325                                                }
326                                                result.put(i, subRes);
327                                        }
328                                }
329                        }
330                        return result;
331                }
332
333                private ArrayList<String> getYagoCandidates(String token) {
334                        if (aliasMap.containsKey(token)) {
335                                //System.out.println(aliasMap.get(token));
336                                ArrayList<String> result = aliasMap.get(token);
337                                return result;
338                        } else
339                                return null;
340                }
341
342        }
343
344}