/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */ 030package org.openimaj.text.nlp.namedentity; 031 032import java.io.BufferedReader; 033import java.io.BufferedWriter; 034import java.io.File; 035import java.io.FileNotFoundException; 036import java.io.FileReader; 037import java.io.FileWriter; 038import java.io.IOException; 039import java.util.HashSet; 040 041/** 042 * Builds the seed directory required by {@link EntityExtractionResourceBuilder} 043 * . This should only be done as a once off, then keep the seed directory. That 044 * is why it is so hacky. 045 * 046 * Usage: 1)Create a directory in the decompressed Yago tsv folder called 047 * "seedDirectory". 2) grep type_star.tsv for: a)wordnet_organization_108008335 048 * into a file called wordnet_organization_108008335.txt inside the 049 * seedDirectory. b)wordnet_person_100007846 into a file called 050 * wordnet_person_100007846.txt inside the seedDirectory. 051 * c)wordnet_location_100027167 into a file called 052 * wordnet_location_100027167.txt inside the seedDirectory. 3) run main with the 053 * path of the tsv directory as an argument. (use -Xmx2g, 3g if possible) 4) 054 * seedDirectory is now ready to be passed to 055 * {@link EntityExtractionResourceBuilder} as an argument to build the 056 * resources. 057 * 058 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 059 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 060 */ 061public class SeedBuilder { 062 private String yagoDirectory; 063 private static String seedDirectory = "seedDirectory"; 064 065 private SeedBuilder(String yagoTSVDirectory) { 066 this.yagoDirectory = yagoTSVDirectory; 067 } 068 069 /** 070 * @param args 071 * = path to the Yago2 tsv directory. 
072 */ 073 public static void main(String[] args) { 074 final SeedBuilder sb = new SeedBuilder(args[0]); 075 sb.build(); 076 } 077 078 public void build() { 079 System.out.println("Building hash..."); 080 HashSet<String> filters = null; 081 try { 082 filters = buildEntityHash(yagoDirectory); 083 } catch (final FileNotFoundException e) { 084 e.printStackTrace(); 085 } 086 System.out.println("Total Entities : " + filters.size()); 087 FileFilterer ff = new FileFilterer( 088 yagoDirectory + File.separator + "means.tsv", 089 yagoDirectory + File.separator + seedDirectory + File.separator + "means_stripped.tsv", 090 filters) 091 { 092 @Override 093 protected String getCompareValue(String line) { 094 final String[] values = line.split("\\s+"); 095 return values[2]; 096 } 097 }; 098 ff.filter(); 099 ff = new FileFilterer( 100 yagoDirectory + File.separator + "isCalled.tsv", 101 yagoDirectory + File.separator + seedDirectory + File.separator + "isCalled_stripped.tsv", 102 filters) 103 { 104 @Override 105 protected String getCompareValue(String line) { 106 final String[] values = line.split("\\s+"); 107 return values[1]; 108 } 109 }; 110 ff.filter(); 111 ff = new FileFilterer( 112 yagoDirectory + File.separator + "created.tsv", 113 yagoDirectory + File.separator + seedDirectory + File.separator + "created_stripped.tsv", 114 filters) 115 { 116 @Override 117 protected String getCompareValue(String line) { 118 final String[] values = line.split("\\s+"); 119 return values[1]; 120 } 121 }; 122 ff.filter(); 123 ff = new FileFilterer( 124 yagoDirectory + File.separator + "hasWikipediaUrl.tsv", 125 yagoDirectory + File.separator + seedDirectory + File.separator + "hasWikipediaUrl_stripped.tsv", 126 filters) 127 { 128 @Override 129 protected String getCompareValue(String line) { 130 final String[] values = line.split("\\s+"); 131 return values[1]; 132 } 133 }; 134 ff.filter(); 135 ff = new FileFilterer( 136 yagoDirectory + File.separator + "hasAnchorText.tsv", 137 yagoDirectory + 
File.separator + seedDirectory + File.separator + "hasAnchorText_stripped.tsv", 138 filters) 139 { 140 @Override 141 protected String getCompareValue(String line) { 142 final String[] values = line.split("\\s+"); 143 return values[1]; 144 } 145 }; 146 ff.filter(); 147 ff = new FileFilterer( 148 yagoDirectory + File.separator + "hasWikipediaAnchorText.tsv", 149 yagoDirectory + File.separator + seedDirectory + File.separator + "hasWikipediaAnchorText_stripped.tsv", 150 filters) 151 { 152 @Override 153 protected String getCompareValue(String line) { 154 final String[] values = line.split("\\s+"); 155 return values[1]; 156 } 157 }; 158 ff.filter(); 159 System.out.println("Done"); 160 } 161 162 private HashSet<String> buildEntityHash(String directoryPath) throws FileNotFoundException { 163 final HashSet<String> result = new HashSet<String>(); 164 final BufferedReader org = openIn(directoryPath + File.separator 165 + seedDirectory + File.separator + "wordnet_organization_108008335.txt"); 166 String s = null; 167 int clashes = 0; 168 try { 169 while ((s = org.readLine()) != null) { 170 final String[] values = s.split("\\s+"); 171 if (result.contains(values[1])) { 172 clashes++; 173 } 174 else 175 result.add(values[1]); 176 } 177 org.close(); 178 } catch (final IOException e) { 179 180 e.printStackTrace(); 181 } 182 183 final BufferedReader per = openIn(directoryPath + File.separator 184 + seedDirectory + File.separator + "wordnet_person_100007846.txt"); 185 s = null; 186 try { 187 while ((s = per.readLine()) != null) { 188 final String[] values = s.split("\\s+"); 189 if (result.contains(values[1])) { 190 clashes++; 191 } 192 else 193 result.add(values[1]); 194 } 195 per.close(); 196 } catch (final IOException e) { 197 e.printStackTrace(); 198 } 199 final BufferedReader loc = openIn(directoryPath + File.separator 200 + seedDirectory + File.separator + "wordnet_location_100027167.txt"); 201 s = null; 202 try { 203 while ((s = loc.readLine()) != null) { 204 final String[] 
values = s.split("\\s+"); 205 if (result.contains(values[1])) { 206 clashes++; 207 } 208 else 209 result.add(values[1]); 210 } 211 loc.close(); 212 } catch (final IOException e) { 213 214 e.printStackTrace(); 215 } 216 System.out.println("Ent Clashes: " + clashes); 217 return result; 218 } 219 220 private static BufferedReader openIn(String path) throws FileNotFoundException { 221 FileReader fr = null; 222 fr = new FileReader(path); 223 final BufferedReader br = new BufferedReader(fr); 224 return br; 225 } 226 227 private abstract class FileFilterer { 228 229 private HashSet<String> filterValues; 230 private BufferedReader in; 231 private BufferedWriter out; 232 private String inString; 233 234 public FileFilterer(String fileToFilter, String filteredResultsFile, 235 HashSet<String> validFilterValues) 236 { 237 this.filterValues = validFilterValues; 238 try { 239 in = openIn(fileToFilter); 240 } catch (final FileNotFoundException e) { 241 242 e.printStackTrace(); 243 } 244 openOut(filteredResultsFile); 245 inString = fileToFilter; 246 } 247 248 private void openOut(String filteredResultsFile) { 249 FileWriter fw = null; 250 try { 251 fw = new FileWriter(filteredResultsFile); 252 } catch (final IOException e) { 253 254 e.printStackTrace(); 255 } 256 out = new BufferedWriter(fw); 257 try { 258 out.write(""); 259 } catch (final IOException e) { 260 261 e.printStackTrace(); 262 } 263 } 264 265 private void filter() { 266 String s; 267 System.out.println("Filtering : " + inString); 268 int count = 0; 269 int vcount = 0; 270 try { 271 while ((s = in.readLine()) != null) { 272 count++; 273 if (filterValues.contains(getCompareValue(s))) { 274 out.append(s + "\n"); 275 vcount++; 276 } 277 } 278 } catch (final IOException e) { 279 e.printStackTrace(); 280 } 281 try { 282 in.close(); 283 out.flush(); 284 out.close(); 285 } catch (final IOException e) { 286 e.printStackTrace(); 287 } 288 System.out.println("Finished : " + inString + "\nFiltered " + count 289 + " to " + vcount); 
290 } 291 292 protected abstract String getCompareValue(String line); 293 294 } 295}