001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.textpipe.annotators; 031 032import java.util.List; 033 034import org.openimaj.text.nlp.namedentity.NamedEntity; 035import org.openimaj.text.nlp.namedentity.YagoEntityExactMatcherFactory; 036import org.openimaj.text.nlp.namedentity.YagoEntityExactMatcherFactory.YagoEntityExactMatcher; 037import org.openimaj.text.nlp.textpipe.annotations.NamedEntityAnnotation; 038import org.openimaj.text.nlp.textpipe.annotations.RawTextAnnotation; 039import org.openimaj.text.nlp.textpipe.annotations.SentenceAnnotation; 040import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation; 041 042public class YagoNEAnnotator extends AbstractNEAnnotator { 043 044 private int cw = 1; // On either side of current sentence. 045 046 public YagoEntityExactMatcher yagoMatcher; 047 048 public YagoNEAnnotator() { 049 yagoMatcher = YagoEntityExactMatcherFactory.getMatcher(); 050 } 051 052 @Override 053 void performAnnotation(RawTextAnnotation annotation) 054 throws MissingRequiredAnnotationException 055 { 056 // Get the sentences 057 final List<SentenceAnnotation> sentences = annotation.getAnnotationsFor(SentenceAnnotation.class); 058 for (int i = 0; i < sentences.size(); i++) { 059 // get context for sentence 060 final String context = getContextFrom(sentences.subList(Math.max(0, i - cw), 061 Math.min(i + cw + 1, sentences.size()))); 062 annotateSentence(sentences.get(i), context); 063 } 064 } 065 066 private void annotateSentence(SentenceAnnotation sentence, 067 String context) 068 { 069 070 final List<NamedEntity> ents = yagoMatcher.matchExact(sentence.getAnnotationsFor(TokenAnnotation.class), context); 071 072 for (final NamedEntity ent : ents) { 073 final NamedEntityAnnotation nea = new NamedEntityAnnotation(); 074 nea.namedEntity = ent; 075 nea.tokensMatched.addAll(sentence.getAnnotationsFor(TokenAnnotation.class).subList(ent.startToken, 076 ent.stopToken)); 077 sentence.addAnnotation(nea); 078 } 079 080 /* 081 * List<List<TokenAnnotation>> validEntityPhrases = 082 * getValidEntityPhrases(sentence); for(List<TokenAnnotation> entPhrase: 083 * validEntityPhrases){ List<NamedEntity> ents = 084 * yagoMatcher.matchExact(entPhrase, context); for(NamedEntity ent: 085 * ents){ NamedEntityAnnotation nea = new NamedEntityAnnotation(); 086 * nea.namedEntity=ent; 087 * nea.tokensMatched.addAll(sentence.getAnnotationsFor 088 * (TokenAnnotation.class).subList(ent.startToken, ent.stopToken)); 089 * sentence.addAnnotation(nea); } } 090 */ 091 } 092 093 // private List<List<TokenAnnotation>> 094 // getValidEntityPhrases(SentenceAnnotation sentence) { 095 // List<List<TokenAnnotation>> results = new 096 // ArrayList<List<TokenAnnotation>>(); 097 // List<TokenAnnotation> current = new ArrayList<TokenAnnotation>(); 098 // for(TokenAnnotation 099 // tok:sentence.getAnnotationsFor(TokenAnnotation.class)){ 100 // if(start(tok)){ 101 // current.add(tok); 102 // } 103 // else if(cont(tok)&¤t.size()>0){ 104 // current.add(tok); 105 // } 106 // else if(current.size()>0){ 107 // results.add(current); 108 // current=new ArrayList<TokenAnnotation>(); 109 // } 110 // } 111 // if(current.size()>0)results.add(current); 112 // return results; 113 // } 114 // 115 // private boolean cont(TokenAnnotation tok) { 116 // return (!tok.getAnnotationsFor(PhraseAnnotation.class).get(0).start); 117 // } 118 // 119 // private boolean start(TokenAnnotation tok) { 120 // final PhraseAnnotation phrase = 121 // tok.getAnnotationsFor(PhraseAnnotation.class).get(0); 122 // if (phrase.phrase.toString().equals("NP")) 123 // return phrase.start; 124 // else 125 // return false; 126 // } 127 128 private String getContextFrom(List<SentenceAnnotation> sents) { 129 final StringBuffer result = new StringBuffer(); 130 for (final SentenceAnnotation sent : sents) { 131 result.append(sent.text + " "); 132 } 133 return result.toString(); 134 } 135 136 @Override 137 void checkForRequiredAnnotations(RawTextAnnotation annotation) 138 throws MissingRequiredAnnotationException 139 { 140 if (!annotation.getAnnotationKeyList().contains(SentenceAnnotation.class)) 141 throw new MissingRequiredAnnotationException("No SentenceAnnotations found"); 142 } 143 144}