001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.namedentity; 031 032import java.io.File; 033import java.io.IOException; 034import java.util.HashMap; 035import java.util.List; 036import java.util.Map; 037 038import org.apache.commons.lang.StringUtils; 039import org.apache.lucene.analysis.standard.StandardAnalyzer; 040import org.apache.lucene.document.FieldType; 041import org.apache.lucene.queryparser.classic.ParseException; 042import org.apache.lucene.store.Directory; 043import org.apache.lucene.store.SimpleFSDirectory; 044import org.apache.lucene.util.Version; 045import org.openimaj.text.nlp.namedentity.NamedEntity.Type; 046 047/** 048 * Factory Object for building {@link YagoEntityContextScorer} 049 * 050 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 051 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 052 */ 053public class YagoEntityContextScorerFactory { 054 055 /** 056 * Create from a lucene index file. 057 * 058 * @param indexPath 059 * @return {@link YagoEntityContextScorer} 060 */ 061 public static YagoEntityContextScorer createFromIndexFile(String indexPath) 062 { 063 YagoEntityContextScorer yci = new YagoEntityContextScorer(); 064 File f = new File(indexPath); 065 if (f.isDirectory()) { 066 try { 067 yci.index = new SimpleFSDirectory(f); 068 } catch (IOException e) { 069 e.printStackTrace(); 070 } 071 } else 072 return null; 073 return yci; 074 } 075 076 /** 077 * Class that uses an underlying lucene index to match tokens to companies. 078 * Use the enclosing factory class to instantiate. 079 * 080 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk) 081 * 082 */ 083 public static class YagoEntityContextScorer extends 084 EntityContextScorer<List<String>, NamedEntity> { 085 086 private Directory index = null; 087 /** 088 * lucene index field names. 089 */ 090 public final String[] names = { "uri", "context", "type" }; 091 private FieldType[] types; 092 private IgnoreTokenStripper ss; 093 private QuickSearcher qs; 094 095 private YagoEntityContextScorer() { 096 FieldType ti = new FieldType(); 097 ti.setIndexed(true); 098 ti.setTokenized(true); 099 ti.setStored(true); 100 FieldType n = new FieldType(); 101 n.setStored(true); 102 n.setIndexed(true); 103 types = new FieldType[3]; 104 types[0] = n; 105 types[1] = ti; 106 ss = new IgnoreTokenStripper(IgnoreTokenStripper.Language.English); 107 qs = null; 108 } 109 110 @Override 111 public HashMap<NamedEntity, Float> getScoredEntitiesFromContext( 112 List<String> context) { 113 if (qs == null) 114 instantiateQS(); 115 String contextString = StringUtils.join( 116 ss.getNonStopWords(context), " "); 117 try { 118 // search on the context field 119 String[] retFields = new String[] { names[0], names[2] }; 120 HashMap<String[], Float> searchresults = qs.search(names[1], 121 retFields, contextString, 1); 122 HashMap<NamedEntity, Float> results = new HashMap<NamedEntity, Float>(); 123 for (String[] srv : searchresults.keySet()) { 124 NamedEntity yne = new NamedEntity(srv[0], 125 Enum.valueOf(Type.class, srv[1])); 126 results.put(yne, searchresults.get(srv)); 127 } 128 return results; 129 130 } catch (ParseException e) { 131 132 e.printStackTrace(); 133 } catch (IOException e) { 134 135 e.printStackTrace(); 136 } 137 return null; 138 } 139 140 @Override 141 public Map<NamedEntity, Float> getScoresForEntityList( 142 List<String> entityUris, List<String> context) { 143 if (qs == null) 144 instantiateQS(); 145 String contextString = StringUtils.join( 146 ss.getNonStopWords(context), " "); 147 if (entityUris.size() > 0) { 148 String[] retFields = new String[] { names[0], names[2] }; 149 HashMap<String[], Float> searchresults= qs.searchFiltered(names[1], retFields, contextString, 150 names[0], entityUris); 151 HashMap<NamedEntity, Float> results = new HashMap<NamedEntity, Float>(); 152 for (String[] srv : searchresults.keySet()) { 153 NamedEntity yne = new NamedEntity(srv[0], 154 Enum.valueOf(Type.class, srv[1])); 155 results.put(yne, searchresults.get(srv)); 156 } 157 return results; 158 } else 159 return new HashMap<NamedEntity, Float>(); 160 } 161 162 @Override 163 public Map<NamedEntity, Float> getScoresForEntityList( 164 List<String> entityUris, String context) { 165 if (qs == null) 166 instantiateQS(); 167 if (entityUris.size() > 0) { 168 String[] retFields = new String[] { names[0], names[2] }; 169 HashMap<String[], Float> searchresults= qs.searchFiltered(names[1], retFields, context, 170 names[0], entityUris); 171 HashMap<NamedEntity, Float> results = new HashMap<NamedEntity, Float>(); 172 for (String[] srv : searchresults.keySet()) { 173 NamedEntity yne = new NamedEntity(srv[0], 174 Enum.valueOf(Type.class, srv[1])); 175 results.put(yne, searchresults.get(srv)); 176 } 177 return results; 178 } else 179 return new HashMap<NamedEntity, Float>(); 180 } 181 182 private void instantiateQS() { 183 qs = new QuickSearcher(index, new StandardAnalyzer( 184 Version.LUCENE_40)); 185 } 186 187 } 188 189}