001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
040
/**
 * Builds the seed directory required by {@link EntityExtractionResourceBuilder}.
 * This only needs to be done once; keep the resulting seed directory
 * afterwards. That is why the implementation is so hacky.
 * <p>
 * Usage:
 * <ol>
 * <li>Create a directory called "seedDirectory" inside the decompressed Yago
 * tsv folder.</li>
 * <li>grep type_star.tsv for:
 * <ul>
 * <li>wordnet_organization_108008335 into a file called
 * wordnet_organization_108008335.txt inside the seedDirectory;</li>
 * <li>wordnet_person_100007846 into a file called
 * wordnet_person_100007846.txt inside the seedDirectory;</li>
 * <li>wordnet_location_100027167 into a file called
 * wordnet_location_100027167.txt inside the seedDirectory.</li>
 * </ul>
 * </li>
 * <li>Run main with the path of the tsv directory as an argument (use -Xmx2g,
 * or 3g if possible).</li>
 * <li>The seedDirectory is now ready to be passed to
 * {@link EntityExtractionResourceBuilder} as an argument to build the
 * resources.</li>
 * </ol>
 * 
 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 */
public class SeedBuilder {
    /** Sub-directory of the Yago tsv directory that holds the seed inputs and the filtered outputs. */
    private static final String SEED_DIRECTORY = "seedDirectory";

    /** Entity list files (inside the seed directory) whose second column names the entities to keep. */
    private static final String[] SEED_FILES = {
            "wordnet_organization_108008335.txt",
            "wordnet_person_100007846.txt",
            "wordnet_location_100027167.txt"
    };

    /** Field separator; compiled once because it is applied to every line of every input file. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private final String yagoDirectory;

    private SeedBuilder(String yagoTSVDirectory) {
        this.yagoDirectory = yagoTSVDirectory;
    }

    /**
     * Command line entry point.
     * 
     * @param args
     *            = path to the Yago2 tsv directory.
     * @throws IllegalArgumentException
     *             if no directory argument is supplied
     * @throws IllegalStateException
     *             if any input cannot be read or any output cannot be written
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: SeedBuilder <path to the Yago2 tsv directory>");
        }
        new SeedBuilder(args[0]).build();
    }

    /**
     * Collects the entity names from the three seed lists, then writes a
     * <code>*_stripped.tsv</code> copy of each relation file into the seed
     * directory, keeping only the lines whose entity column is one of those
     * names.
     * 
     * @throws IllegalStateException
     *             if a seed list or relation file cannot be read, or an output
     *             file cannot be written; the I/O failure is kept as the cause
     */
    public void build() {
        System.out.println("Building hash...");
        final Set<String> filters;
        try {
            filters = buildEntityHash();
        } catch (final IOException e) {
            // Fail with the real cause instead of continuing with a null set.
            throw new IllegalStateException("Unable to read the seed entity lists in " + seedPath(""), e);
        }
        System.out.println("Total Entities : " + filters.size());

        // NOTE(review): lines are split on any whitespace, so a value containing
        // spaces ahead of the entity in means.tsv would shift the entity out of
        // column 2 - confirm against the actual Yago data.
        filterFile("means", 2, filters);
        filterFile("isCalled", 1, filters);
        filterFile("created", 1, filters);
        filterFile("hasWikipediaUrl", 1, filters);
        filterFile("hasAnchorText", 1, filters);
        filterFile("hasWikipediaAnchorText", 1, filters);
        System.out.println("Done");
    }

    /**
     * Reads every seed list and gathers the distinct values of their second
     * column. The number of values seen more than once is printed.
     * 
     * @return the distinct entity names
     * @throws IOException
     *             if a seed list is missing or cannot be read
     */
    private Set<String> buildEntityHash() throws IOException {
        final Set<String> result = new HashSet<String>();
        int clashes = 0;
        for (final String seedFile : SEED_FILES) {
            clashes += addEntities(seedPath(seedFile), result);
        }
        System.out.println("Ent Clashes: " + clashes);
        return result;
    }

    /**
     * Adds the second column of every line of <code>path</code> to
     * <code>entities</code>. Lines with fewer than two columns are skipped.
     * 
     * @return how many values were already present in <code>entities</code>
     */
    private static int addEntities(String path, Set<String> entities) throws IOException {
        int clashes = 0;
        final BufferedReader in = openIn(path);
        try {
            String line;
            while ((line = in.readLine()) != null) {
                final String entity = column(line, 1);
                if (entity == null) {
                    continue;
                }
                // Set.add reports false when the value was already present.
                if (!entities.add(entity)) {
                    clashes++;
                }
            }
        } finally {
            in.close();
        }
        return clashes;
    }

    /**
     * Copies the lines of <code>&lt;relation&gt;.tsv</code> whose
     * <code>entityColumn</code> value is in <code>filters</code> to
     * <code>seedDirectory/&lt;relation&gt;_stripped.tsv</code>. Lines that are
     * too short to have that column are counted but never copied.
     */
    private void filterFile(String relation, int entityColumn, Set<String> filters) {
        final String inPath = yagoDirectory + File.separator + relation + ".tsv";
        final String outPath = seedPath(relation + "_stripped.tsv");
        System.out.println("Filtering : " + inPath);
        int count = 0;
        int kept = 0;
        try {
            // Open the input first so a missing input does not leave an empty output behind.
            final BufferedReader in = openIn(inPath);
            try {
                final BufferedWriter out = new BufferedWriter(new FileWriter(outPath));
                try {
                    String line;
                    while ((line = in.readLine()) != null) {
                        count++;
                        final String entity = column(line, entityColumn);
                        if (entity != null && filters.contains(entity)) {
                            out.write(line);
                            out.write('\n');
                            kept++;
                        }
                    }
                } finally {
                    // close() flushes; a failure here surfaces as an IOException.
                    out.close();
                }
            } finally {
                in.close();
            }
        } catch (final IOException e) {
            throw new IllegalStateException("Failed to filter " + inPath + " into " + outPath, e);
        }
        System.out.println("Finished : " + inPath + "\nFiltered " + count
                + " to " + kept);
    }

    /**
     * @return the whitespace separated field at <code>index</code>, or
     *         <code>null</code> when the line has too few fields
     */
    private static String column(String line, int index) {
        final String[] values = WHITESPACE.split(line);
        return index < values.length ? values[index] : null;
    }

    /** @return the path of <code>fileName</code> inside the seed directory */
    private String seedPath(String fileName) {
        return yagoDirectory + File.separator + SEED_DIRECTORY + File.separator + fileName;
    }

    private static BufferedReader openIn(String path) throws FileNotFoundException {
        return new BufferedReader(new FileReader(path));
    }
}