001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import info.bliki.wiki.filter.PlainTextConverter;
033import info.bliki.wiki.model.WikiModel;
034
035import java.io.BufferedReader;
036import java.io.BufferedWriter;
037import java.io.File;
038import java.io.FileNotFoundException;
039import java.io.FileReader;
040import java.io.FileWriter;
041import java.io.IOException;
042import java.util.HashMap;
043
044import javax.xml.parsers.DocumentBuilder;
045import javax.xml.parsers.DocumentBuilderFactory;
046import javax.xml.parsers.ParserConfigurationException;
047
048import org.apache.commons.lang.StringEscapeUtils;
049import org.apache.lucene.document.FieldType;
050import org.apache.lucene.store.SimpleFSDirectory;
051import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder;
052import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer;
053import org.openimaj.text.nlp.namedentity.YagoEntityExactMatcherFactory.YagoEntityExactMatcher;
054import org.w3c.dom.Document;
055import org.w3c.dom.NodeList;
056import org.xml.sax.SAXException;
057
058/**
059 * This class has various methods that can be used to build the resources
060 * required by {@link YagoEntityCandidateFinder},
061 * {@link YagoEntityContextScorer} and {@link YagoEntityExactMatcher}. These
062 * resources are a text File of entity aliases, and a lucene index of contextual
063 * data.
064 *
065 * The directory of the stripped down Yago tsv files is required. This directory
066 * can be built with {@link SeedBuilder}.
067 *
068 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
069 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
070 *
071 */
072public class EntityExtractionResourceBuilder {
073
074        /**
075         * Default file name for the alias text file.
076         */
077        public static String DEFAULT_ALIAS_NAME = "AliasMapFile.txt";
078        /**
079         * Default directory name for the lucene index.
080         */
081        public static String DEFAULT_CONTEXT_NAME = "YagoLucene";
082        private static String DEFAULT_ROOT_NAME = ".YagoEntityExtraction";
083        private static String wikiApiPrefix = "http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=";
084        private static String wikiApiSuffix = "&prop=revisions&rvprop=content";
085        private boolean verbose = true;
086        // This will build for location entities. There are too many for memory.
087        // Leave false.
088        private boolean locations = false;
089        private static BufferedWriter logOut;
090
091        /**
092         * Builds the alias text file in the default location.
093         *
094         * @param seedDirectoryPath
095         *            = path location of the stripped down Yago .tsv files.
096         */
097        public void buildCandidateAliasFile(String seedDirectoryPath) {
098                buildCandidateAliasFile(seedDirectoryPath, getDefaultRootPath()
099                                + File.separator + DEFAULT_ALIAS_NAME);
100        }
101
102        /**
103         * Builds the alias text file in the specified location.
104         *
105         * @param seedDirectoryPath
106         *            = path location of the stripped down Yago .tsv files.
107         * @param destinationPath
108         *            = path to build the alias text file.
109         */
110        public void buildCandidateAliasFile(String seedDirectoryPath,
111                        String destinationPath)
112        {
113                writeAliasFile(getEntities(seedDirectoryPath), destinationPath,
114                                seedDirectoryPath);
115        }
116
117        /**
118         * Builds the lucene index in the default path.
119         *
120         * @param seedDirectoryPath
121         *            = path location of the stripped down Yago .tsv files.
122         */
123        public void buildContextLuceneIndex(String seedDirectoryPath) {
124                buildContextLuceneIndex(seedDirectoryPath, getDefaultRootPath()
125                                + File.separator + DEFAULT_CONTEXT_NAME);
126        }
127
128        /**
129         * Builds the lucene index at the specified path.
130         *
131         * @param seedDirectoryPath
132         * @param destinationPath
133         */
134        public void buildContextLuceneIndex(String seedDirectoryPath,
135                        String destinationPath)
136        {
137                try {
138                        buildIndex(getEntities(seedDirectoryPath), destinationPath,
139                                        seedDirectoryPath);
140                } catch (final IOException e) {
141                        e.printStackTrace();
142                }
143        }
144
145        /**
146         * Builds the alias text file and the lucene index in the default root
147         * directory.
148         *
149         * @param seedDirectoryPath
150         */
151        public void buildAll(String seedDirectoryPath) {
152                validateFileStructure();
153                createLogging(getDefaultRootPath() + File.separator + "log.txt");
154                buildAll(seedDirectoryPath, getDefaultRootPath());
155                try {
156                        logOut.flush();
157                        logOut.close();
158                } catch (final IOException e) {
159                        e.printStackTrace();
160                }
161        }
162
163        /**
164         * Builds the alias text file and the lucene index in the specified root
165         * directory.
166         *
167         * @param seedDirectoryPath
168         * @param destinationPath
169         */
170        public void buildAll(String seedDirectoryPath, String destinationPath) {
171                // Get the entities as people and organisations
172                print("Building All...");
173                final HashMap<String, YagoNamedEntity> entities = getEntities(seedDirectoryPath);
174                writeAliasFile(entities, destinationPath + File.separator
175                                + DEFAULT_ALIAS_NAME, seedDirectoryPath);
176                try {
177                        buildIndex(entities, destinationPath + File.separator
178                                        + DEFAULT_CONTEXT_NAME, seedDirectoryPath);
179                } catch (final IOException e) {
180                        e.printStackTrace();
181                }
182                print("Done");
183        }
184
185        /**
186         * @return default root directory path for all YagoEntity resources.
187         */
188        public static String getDefaultRootPath() {
189                return System.getProperty("user.home") + File.separator
190                                + DEFAULT_ROOT_NAME;
191        }
192
193        /**
194         * @return default alias text file path.
195         */
196        public static String getDefaultAliasFilePath() {
197                return getDefaultRootPath() + File.separator + DEFAULT_ALIAS_NAME;
198        }
199
200        /**
201         * @return defualt lucene directory path.
202         */
203        public static String getDefaultIndexDirectoryPath() {
204                return getDefaultRootPath() + File.separator + DEFAULT_CONTEXT_NAME;
205        }
206
207        public static String getAliasFrom(String rootName) {
208                String result;
209                String noGeo = null;
210                if (rootName.startsWith("geoent_")) {
211                        noGeo = rootName.substring(rootName.indexOf('_') + 1,
212                                        rootName.lastIndexOf('_'));
213                } else
214                        noGeo = rootName;
215                final String spaces = noGeo.replaceAll("_", " ");
216                String noParen;
217                if (spaces.contains("("))
218                        noParen = spaces.substring(0, spaces.indexOf("("));
219                else
220                        noParen = spaces;
221                String dropComma;
222                if (noParen.contains(","))
223                        dropComma = noParen.substring(0, spaces.indexOf(","));
224                else
225                        dropComma = noParen;
226                result = dropComma;
227                return result;
228        }
229
230        private void validateFileStructure() {
231                final File rootDir = new File(getDefaultRootPath());
232                if (!rootDir.isDirectory()) {
233                        rootDir.mkdir();
234                }
235                final File indexDir = new File(getDefaultRootPath() + File.separator
236                                + DEFAULT_CONTEXT_NAME);
237                if (!indexDir.isDirectory()) {
238                        indexDir.mkdir();
239                } else {
240                        for (final File f : indexDir.listFiles())
241                                f.delete();
242                }
243        }
244
245        private static void createLogging(String logFilePath) {
246                final File f = new File(logFilePath);
247                if (!f.isFile()) {
248                        try {
249                                f.createNewFile();
250                        } catch (final IOException e) {
251                                e.printStackTrace();
252                        }
253                } else {
254                }
255                FileWriter fstream = null;
256                try {
257                        fstream = new FileWriter(logFilePath);
258                        logOut = new BufferedWriter(fstream);
259                        logOut.write("");
260                } catch (final IOException e) {
261                        // TODO Auto-generated catch block
262                        e.printStackTrace();
263                }
264
265        }
266
267        private void buildIndex(HashMap<String, YagoNamedEntity> entities,
268                        String destinationPath, String seedDirectoryPath)
269                                        throws IOException
270        {
271                print("Building Index...");
272                setEntityContextValues(entities, seedDirectoryPath);
273                print("Initializing Lucene objects...");
274
275                // initialize lucene objects
276                final String[] names = { "uri", "context", "type" };
277                FieldType[] types;
278                final FieldType ti = new FieldType();
279                ti.setIndexed(true);
280                ti.setTokenized(true);
281                ti.setStored(true);
282                final FieldType n = new FieldType();
283                n.setStored(true);
284                n.setIndexed(true);
285                types = new FieldType[3];
286                types[0] = n;
287                types[1] = ti;
288                types[2] = n;
289                final File f = new File(destinationPath);
290                final QuickIndexer qi = new QuickIndexer(new SimpleFSDirectory(f));
291
292                // Initialize wiki objects
293                final DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory
294                                .newInstance();
295                DocumentBuilder docBuilder = null;
296                Document doc;
297                try {
298                        docBuilder = docBuilderFactory.newDocumentBuilder();
299                } catch (final ParserConfigurationException e) {
300                        e.printStackTrace();
301                }
302                doc = null;
303                final WikiModel wikiModel = new WikiModel(
304                                "http://www.mywiki.com/wiki/${image}",
305                                "http://www.mywiki.com/wiki/${title}");
306                int count = 0;
307                print("Building Lucene Index...");
308                for (final YagoNamedEntity entity : entities.values()) {
309                        count++;
310                        if (count % 5000 == 0)
311                                print("Processed " + count);
312                        // if wikiURL, add wiki to context
313                        if (entity.wikiURL != null) {
314                                final String title = entity.wikiURL.substring(entity.wikiURL
315                                                .lastIndexOf("/") + 1);
316                                try {
317                                        doc = docBuilder.parse(wikiApiPrefix + title
318                                                        + wikiApiSuffix);
319                                } catch (final SAXException e) {
320                                        e.printStackTrace();
321                                } catch (final IOException e) {
322                                        e.printStackTrace();
323                                }
324                                doc.getDocumentElement().normalize();
325                                final NodeList revisions = doc.getElementsByTagName("rev");
326                                if (revisions.getLength() > 0) {
327                                        final String markup = revisions.item(0).getTextContent();
328
329                                        // convert markup dump to plaintext.
330                                        final String plainStr = wikiModel.render(
331                                                        new PlainTextConverter(), markup);
332                                        // add it to the context.
333                                        entity.addContext(plainStr);
334                                }
335                        }
336                        final String[] values = { entity.rootName, entity.getContext(),
337                                        entity.type.toString() };
338                        qi.addDocumentFromFields(names, values, types);
339                }
340                qi.finalise();
341        }
342
343        private void setEntityContextValues(
344                        final HashMap<String, YagoNamedEntity> entities,
345                        String seedDirectoryPath)
346        {
347                print("Setting Context Values...");
348                BufferedReader in = null;
349                // Created
350                try {
351                        in = openFileAsReadStream(seedDirectoryPath + File.separator
352                                        + "created_stripped.tsv");
353                } catch (final FileNotFoundException e) {
354                        e.printStackTrace();
355                }
356                StreamLooper sl = new StreamLooper(in) {
357                        @Override
358                        protected void doWork(String s) {
359                                final String[] values = s.split("\\s+");
360                                final String rootName = values[1];
361                                final String context = convertResource(values[2]);
362                                if (entities.keySet().contains(rootName)) {
363                                        entities.get(rootName).addContext(context);
364                                }
365                        }
366                };
367                sl.loop();
368
369                // wikiAnchorText
370                try {
371                        in = openFileAsReadStream(seedDirectoryPath + File.separator
372                                        + "hasWikipediaAnchorText_stripped.tsv");
373                } catch (final FileNotFoundException e) {
374                        e.printStackTrace();
375                }
376                sl = new StreamLooper(in) {
377                        @Override
378                        protected void doWork(String s) {
379                                final String[] values = s.split("\\s+");
380                                final String rootName = values[1];
381                                final String context = convertLiteral(values[2]);
382                                if (entities.keySet().contains(rootName)) {
383                                        entities.get(rootName).addContext(context);
384                                }
385                        }
386                };
387                sl.loop();
388
389                // wikiUrl
390
391                try {
392                        in = openFileAsReadStream(seedDirectoryPath + File.separator
393                                        + "hasWikipediaUrl_stripped.tsv");
394                } catch (final FileNotFoundException e) {
395                        e.printStackTrace();
396                }
397                sl = new StreamLooper(in) {
398                        @Override
399                        protected void doWork(String s) {
400                                final String[] values = s.split("\\s+");
401                                final String rootName = values[1];
402                                if (entities.keySet().contains(rootName)) {
403                                        entities.get(rootName).wikiURL = values[2].replaceAll("\"",
404                                                        "");
405                                }
406                        }
407                };
408                sl.loop();
409                // validate
410                print("Validating Context...");
411                int noContext = 0;
412                for (final YagoNamedEntity ne : entities.values()) {
413                        for (final String alias : ne.aliasList) {
414                                ne.addContext(alias);
415                        }
416                        if ((ne.getContext() == null || ne.getContext().equals(""))
417                                        && ne.wikiURL == null)
418                        {
419                                noContext++;
420                        }
421                }
422                print("No Context: " + noContext);
423        }
424
425        private void setEntityAliasValues(
426                        final HashMap<String, YagoNamedEntity> entities,
427                        String seedDirectoryPath)
428        {
429                print("Setting Alias Values...");
430                // Populate 'isCalled'
431                BufferedReader in = null;
432                try {
433                        in = openFileAsReadStream(seedDirectoryPath + File.separator
434                                        + "isCalled_stripped.tsv");
435                } catch (final FileNotFoundException e) {
436                        e.printStackTrace();
437                }
438                StreamLooper sl = new StreamLooper(in) {
439                        @Override
440                        protected void doWork(String s) {
441                                final String[] values = s.split("\\s+");
442                                final String rootName = values[1];
443                                final String alias = convertLiteral(values[2]);
444                                if (entities.keySet().contains(rootName)) {
445                                        entities.get(rootName).addAlias(alias);
446                                }
447                        }
448                };
449                sl.loop();
450
451                // populate 'means'
452
453                try {
454                        in = openFileAsReadStream(seedDirectoryPath + File.separator
455                                        + "means_stripped.tsv");
456                } catch (final FileNotFoundException e) {
457                        e.printStackTrace();
458                }
459                sl = new StreamLooper(in) {
460                        @Override
461                        protected void doWork(String s) {
462                                final String[] values = s.split("\\s+");
463                                final String rootName = values[2];
464                                final String alias = convertLiteral(values[1]);
465                                // System.out.println(alias);
466                                if (entities.keySet().contains(rootName)) {
467                                        entities.get(rootName).addAlias(alias);
468                                }
469                        }
470                };
471                sl.loop();
472                print("Validating Aliases...");
473                for (final YagoNamedEntity ne : entities.values()) {
474                        final String alias = getAliasFrom(ne.rootName);
475                        ne.addAlias(alias);
476                }
477        }
478
479        private void writeAliasFile(HashMap<String, YagoNamedEntity> entities,
480                        String destinationPath, String seedDirectoryPath)
481        {
482                setEntityAliasValues(entities, seedDirectoryPath);
483
484                BufferedWriter w;
485                try {
486                        w = openFileAsWriteStream(destinationPath);
487                        w.write("");
488                        for (final YagoNamedEntity ne : entities.values()) {
489                                if (ne.aliasList.size() > 0) {
490                                        w.append("+" + ne.rootName + "\n");
491                                        for (final String alias : ne.aliasList) {
492                                                w.append("." + alias + "\n");
493                                        }
494                                }
495                        }
496                } catch (final IOException e) {
497                        e.printStackTrace();
498                }
499        }
500
501        private HashMap<String, YagoNamedEntity> getEntities(
502                        String seedDirectoryPath)
503                        {
504                print("Getting Entities...");
505                final HashMap<String, YagoNamedEntity> result = new HashMap<String, YagoNamedEntity>();
506                BufferedReader in = null;
507                try {
508                        in = openFileAsReadStream(seedDirectoryPath + File.separator
509                                        + "wordnet_person_100007846.txt");
510                } catch (final FileNotFoundException e2) {
511                        e2.printStackTrace();
512                }
513                // get People
514                StreamLooper sl = new StreamLooper(in) {
515                        @Override
516                        protected void doWork(String s) {
517                                final String[] values = s.split("\\s+");
518                                final String rootName = convertLiteral(values[1]);
519                                if (!rootName.startsWith("Category:")) {
520                                        final YagoNamedEntity ne = new YagoNamedEntity(rootName,
521                                                        NamedEntity.Type.Person);
522                                        result.put(rootName, ne);
523                                }
524                        }
525                };
526                sl.loop();
527
528                // get Organisations
529                try {
530                        in = openFileAsReadStream(seedDirectoryPath + File.separator
531                                        + "wordnet_organization_108008335.txt");
532                } catch (final FileNotFoundException e1) {
533                        e1.printStackTrace();
534                }
535                sl = new StreamLooper(in) {
536                        @Override
537                        protected void doWork(String s) {
538                                final String[] values = s.split("\\s+");
539                                final String rootName = convertLiteral(values[1]);
540                                if (!(rootName.startsWith("Category:") || rootName
541                                                .startsWith("geoent_")))
542                                {
543                                        final YagoNamedEntity ne = new YagoNamedEntity(rootName,
544                                                        NamedEntity.Type.Organisation);
545                                        result.put(rootName, ne);
546                                }
547                        }
548                };
549                sl.loop();
550
551                if (locations) {
552                        // get Locations
553                        try {
554                                in = openFileAsReadStream(seedDirectoryPath + File.separator
555                                                + "wordnet_location_100027167.txt");
556                        } catch (final FileNotFoundException e1) {
557                                e1.printStackTrace();
558                        }
559                        sl = new StreamLooper(in) {
560                                @Override
561                                protected void doWork(String s) {
562                                        final String[] values = s.split("\\s+");
563                                        final String rootName = convertLiteral(values[1]);
564                                        if (!rootName.startsWith("Category:")) {
565                                                final YagoNamedEntity ne = new YagoNamedEntity(rootName,
566                                                                NamedEntity.Type.Location);
567                                                result.put(rootName, ne);
568                                        }
569                                }
570                        };
571                        sl.loop();
572                }
573                print("Total Entities: " + result.size());
574                return result;
575                        }
576
577        public static BufferedReader openFileAsReadStream(String path)
578                        throws FileNotFoundException
579        {
580                FileReader fr = null;
581                fr = new FileReader(path);
582                final BufferedReader br = new BufferedReader(fr);
583                return br;
584        }
585
586        public static BufferedWriter openFileAsWriteStream(String path)
587                        throws IOException
588        {
589                FileWriter fw = null;
590                fw = new FileWriter(path);
591                final BufferedWriter bw = new BufferedWriter(fw);
592                return bw;
593        }
594
595        private static String convertLiteral(String literal) {
596                final String escaped = StringEscapeUtils.unescapeJava(literal);
597                String first = null;
598                if (escaped.startsWith("\""))
599                        first = escaped.substring(1);
600                else
601                        first = escaped;
602                if (first.endsWith("\""))
603                        return first.substring(0, first.length() - 1);
604                else
605                        return first;
606        }
607
608        private static String convertResource(String literal) {
609                final String escaped = StringEscapeUtils.unescapeJava(literal);
610                return escaped.replaceAll("_", " ");
611        }
612
613        private void print(String message) {
614                if (verbose)
615                        System.out.println(message);
616                if (logOut != null) {
617                        log(message);
618                }
619        }
620
621        private void log(String message) {
622                try {
623                        logOut.append(message + "\n");
624                } catch (final IOException e) {
625                        e.printStackTrace();
626                }
627        }
628
629        /**
630         * Defualt main.
631         *
632         * @param args
633         *            = path to the seed directory.
634         */
635        public static void main(String[] args) {
636                new EntityExtractionResourceBuilder().buildCandidateAliasFile(args[0]);
637        }
638
639        /**
640         * Helper class to iterate through the lines of a Reader to do a bit of work
641         * on each.
642         *
643         * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
644         *
645         */
646        public static abstract class StreamLooper {
647                BufferedReader reader;
648
649                public StreamLooper(BufferedReader reader) {
650                        this.reader = reader;
651                }
652
653                /**
654                 * Iterates through each line to do the work.
655                 */
656                public void loop() {
657                        String s = null;
658                        try {
659                                while ((s = reader.readLine()) != null) {
660                                        doWork(s);
661                                }
662                                reader.close();
663                        } catch (final IOException e) {
664                                e.printStackTrace();
665                        }
666                }
667
668                /**
669                 * Do what you want to each line here.
670                 *
671                 * @param s
672                 */
673                protected abstract void doWork(String s);
674        }
675
676}