001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import java.io.BufferedWriter;
033import java.io.File;
034import java.io.FileWriter;
035import java.io.IOException;
036import java.io.UnsupportedEncodingException;
037import java.util.ArrayList;
038import java.util.HashMap;
039import java.util.HashSet;
040import java.util.List;
041import java.util.Map;
042import java.util.Set;
043
044import javax.xml.parsers.DocumentBuilder;
045import javax.xml.parsers.DocumentBuilderFactory;
046import javax.xml.parsers.ParserConfigurationException;
047
048import org.apache.commons.lang.StringUtils;
049import org.openimaj.experiment.evaluation.classification.BasicClassificationResult;
050import org.openimaj.experiment.evaluation.classification.ClassificationEvaluator;
051import org.openimaj.experiment.evaluation.classification.ClassificationResult;
052import org.openimaj.experiment.evaluation.classification.analysers.roc.ROCAnalyser;
053import org.openimaj.experiment.evaluation.classification.analysers.roc.ROCResult;
054import org.openimaj.ml.annotation.ScoredAnnotation;
055import org.openimaj.text.nlp.EntityTweetTokeniser;
056import org.openimaj.text.nlp.TweetTokeniserException;
057import org.openimaj.text.nlp.namedentity.YagoEntityCandidateFinderFactory.YagoEntityCandidateFinder;
058import org.openimaj.text.nlp.namedentity.YagoEntityContextScorerFactory.YagoEntityContextScorer;
059import org.w3c.dom.Document;
060import org.w3c.dom.NamedNodeMap;
061import org.w3c.dom.Node;
062import org.w3c.dom.NodeList;
063import org.xml.sax.SAXException;
064
065/**
066 * Experiment for examining the ability of a
067 * Yago based organisation extractor.
068 * 
069 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
070 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
071 * 
072 */
073public class YagoCompanyAnnotatorEvaluator {
074        private static final String CLASSIFICATION = "Organistaion";
075        private static BufferedWriter logOut;
076        private static boolean logging;
077        private DocumentBuilderFactory docBuilderFactory;
078        private DocumentBuilder docBuilder;
079        private Map<FileEntityLocation, Set<String>> actual;
080        private Map<FileEntityLocation, ClassificationResult<String>> results;
081        private final YagoEntityCompleteAnnotator ycca;
082        private EntityTweetTokeniser tt; 
083        private ClassificationEvaluator<ROCResult<String>, String, FileEntityLocation> ce;
084        private ROCAnalyser<FileEntityLocation, String> ra;
085        private boolean verbose=false;
086
087        /**
088         * @param args
089         *            the first argument must be the alias list/index directory
090         */
091        public static void main(String[] args) {
092                if (args.length < 1) {
093                        System.out.println("You have not given me a directory for the Test data.");
094                        System.exit(1);
095                }
096                if(args.length==2){
097                        createLogging(args[1]);
098                        logging = true;
099                }
100                else{
101                        System.out.println("No logging file specified.");
102                        logging=false;
103                }
104                final YagoCompanyAnnotatorEvaluator ya = new YagoCompanyAnnotatorEvaluator();
105                ya.run(args[0]);
106        }
107
108        private static void createLogging(String logFilePath) {
109                File f = new File(logFilePath);
110                if(!f.isFile()){
111                        try {
112                                f.createNewFile();                              
113                        } catch (IOException e) {                               
114                                e.printStackTrace();
115                        }
116                }
117                else{
118                }
119                FileWriter fstream = null; 
120                try {
121                        fstream = new FileWriter(logFilePath);
122                        logOut = new BufferedWriter(fstream);
123                        logOut.write("");
124                } catch (IOException e) {
125                        e.printStackTrace();
126                }
127                
128        }
129
130        /**
131         * instantiates the annotator
132         */
133        public YagoCompanyAnnotatorEvaluator() {
134                YagoEntityCandidateFinder ycf = null;
135                ycf = YagoEntityCandidateFinderFactory.createFromAliasFile(EntityExtractionResourceBuilder.getDefaultAliasFilePath());
136                YagoEntityContextScorer ycs = null;
137                ycs = YagoEntityContextScorerFactory.createFromIndexFile(EntityExtractionResourceBuilder.getDefaultIndexDirectoryPath());
138                ycca = new YagoEntityCompleteAnnotator(ycs,ycf);
139        }
140
141        /**
142         * @param testDirectory
143         *            given a directory, run the evaluation
144         */
145        public void run(String testDirectory) {
146                System.out.println("Started....");
147                buildTruthAndClassifications(testDirectory);
148                ra = new ROCAnalyser<YagoCompanyAnnotatorEvaluator.FileEntityLocation, String>();
149                ce = new ClassificationEvaluator<ROCResult<String>, String, FileEntityLocation>(results, actual, ra);
150                final ROCResult<String> analysisResult = ce.analyse(ce.evaluate());
151                System.out.println(analysisResult.getDetailReport());
152                doMyCalcs();
153                if(logging)
154                        try {
155                                logOut.flush();
156                                logOut.close();
157                        } catch (IOException e) {                               
158                                e.printStackTrace();
159                        }
160        }
161
162        private void doMyCalcs() {
163                double fp=0;
164                double tp=0;
165                double fn=0;
166                for(FileEntityLocation fe:results.keySet()){
167                        if(actual.keySet().contains(fe))tp++;
168                        else fp++;
169                }
170                for(FileEntityLocation fe:actual.keySet()){
171                        if(!results.keySet().contains(fe))fn++;
172                }
173                System.out.println("Precision : "+(tp/(tp+fp)));
174                System.out.println("Recall : "+(tp/(tp+fn)));
175        }
176
177        /**
178         * @param testDirectory
179         */
180        private void buildTruthAndClassifications(String testDirectory) {
181                final File f = new File(testDirectory);
182                actual = new HashMap<FileEntityLocation, Set<String>>();
183                results = new HashMap<FileEntityLocation, ClassificationResult<String>>();
184                if (f.isDirectory()) {
185                        // Initialize XML parsing objects
186                        docBuilderFactory = DocumentBuilderFactory.newInstance();
187                        docBuilder = null;
188                        try {
189                                docBuilder = docBuilderFactory.newDocumentBuilder();
190                        } catch (final ParserConfigurationException e) {                                
191                                e.printStackTrace();
192                        }
193
194                        for (final File s : f.listFiles()) {
195                                final String name = s.getName();
196                                print("#################Processing " + name);
197                                if (name.substring(name.lastIndexOf(".") + 1).equals("xml")) {
198                                        Document doc = null;
199                                        try {
200                                                doc = docBuilder.parse(s);
201                                        } catch (final SAXException e) {
202                                                
203                                                e.printStackTrace();
204                                        } catch (final IOException e) {                                         
205                                                e.printStackTrace();
206                                        }
207                                        doc.getDocumentElement().normalize();
208                                        final HashMap<Integer, String> res = getResultsFrom(doc.getElementsByTagName("TextWithNodes").item(0)
209                                                        .getTextContent(), s.getAbsolutePath());
210                                        final HashMap<Integer, String> act = getActualFrom(doc.getElementsByTagName("TextWithNodes").item(0)
211                                                        .getTextContent(), doc.getElementsByTagName("AnnotationSet"), s.getAbsolutePath());
212                                        print("---------MY MISSES----------");
213                                        for (final int key : act.keySet()) {
214                                                if (!res.keySet().contains(key)) {
215                                                        print(act.get(key));
216                                                }
217                                        }
218                                        print("---------THEIR MISSES----------");
219                                        for (final int key : res.keySet()) {
220                                                if (!act.keySet().contains(key)) {
221                                                        print(res.get(key));
222                                                }
223                                        }
224                                }
225                        }
226                }
227        }
228
229        private HashMap<Integer, String> getResultsFrom(String textContent, String filePath) {
230                print("---------RESULTS----------");
231                try {
232                        tt = new EntityTweetTokeniser(textContent);
233                } catch (final UnsupportedEncodingException e) {                
234                        e.printStackTrace();
235                } catch (final TweetTokeniserException e) {                     
236                        e.printStackTrace();
237                }
238                final ArrayList<String> tokens = (ArrayList<String>) tt.getStringTokens();
239                final List<ScoredAnnotation<HashMap<String, Object>>> annos = ycca.annotate(tokens);
240                final HashMap<Integer, String> r = new HashMap<Integer, String>();
241                for (final ScoredAnnotation<HashMap<String, Object>> anno : annos) {
242                        if (anno.annotation.get(EntityAnnotator.TYPE)==NamedEntity.Type.Organisation.toString()){
243                                final FileEntityLocation fe = getFE(anno, textContent, tokens);
244                                final BasicClassificationResult<String> c = new BasicClassificationResult<String>();
245                                c.put(CLASSIFICATION, 1);
246                                fe.file = filePath;
247                                results.put(fe, c);
248                                if (fe.start >= 0 && fe.start < textContent.length()
249                                                && fe.stop >= 0 && fe.stop < textContent.length()
250                                                && fe.stop > fe.start) {
251                                        final String s = textContent.substring(fe.start, fe.stop)
252                                                        + " " + fe.start + ", " + fe.stop;
253                                        r.put(fe.start + fe.stop, s);
254                                        print(s);
255                                } else
256                                        System.err.println("Substring out of range for :"
257                                                        + anno.annotation.get(EntityAnnotator.URI));
258                        }
259                        //else System.out.println("Skipped person : "+anno.annotation.get(EntityAnnotator.URI));
260                }
261                return r;
262        }
263
264        private FileEntityLocation getFE(ScoredAnnotation<HashMap<String, Object>> anno, String textContent,
265                        ArrayList<String> tokens)
266        {
267                // calculate the start char index
268                final int sInd = (Integer) anno.annotation.get(EntityAnnotator.START_TOKEN);
269                final String sToken = tokens.get(sInd);
270                // join all previous tokens with empty and get length
271                int minStartChar = StringUtils.join(tokens.subList(0, sInd), "").length();
272                // get the index of the first occurrence of the token after the minimum
273                int startCharOff = textContent.substring(minStartChar).indexOf(sToken);
274                final int startChar = minStartChar + startCharOff;
275                // calculate the end char index
276                final int eInd = (Integer) anno.annotation.get(EntityAnnotator.END_TOKEN);
277                final String eToken = tokens.get(eInd);
278                minStartChar = StringUtils.join(tokens.subList(0, eInd), "").length();
279                startCharOff = textContent.substring(minStartChar).indexOf(eToken);
280                final int endChar = minStartChar + startCharOff + eToken.length();
281                final FileEntityLocation fe = new FileEntityLocation();
282                fe.start = startChar;
283                fe.stop = endChar;
284                return fe;
285        }
286
287        private HashMap<Integer, String> getActualFrom(String textContent, NodeList anoSets, String filePath) {
288                print("---------Actual----------");
289                final HashSet<String> c = new HashSet<String>();
290                c.add(CLASSIFICATION);
291                final HashMap<Integer, String> r = new HashMap<Integer, String>();
292                for (int i = 0; i < anoSets.getLength(); i++) {
293                        final Node n = anoSets.item(i);
294                        final NamedNodeMap m = n.getAttributes();
295                        if (m.getNamedItem("Name") != null && m.getNamedItem("Name").getNodeValue().equals("Key")) {
296                                final NodeList anoChildren = n.getChildNodes();
297                                for (int j = 0; j < anoChildren.getLength(); j++) {
298                                        final Node child = anoChildren.item(j);
299                                        if (child.hasAttributes() && child.getAttributes().getNamedItem("Type") != null
300                                                        && child.getAttributes().getNamedItem("Type").getNodeValue().equals("Organization"))
301                                        {
302                                                final int startchar = Integer.parseInt(child.getAttributes().getNamedItem("StartNode")
303                                                                .getNodeValue());
304                                                final int endchar = Integer
305                                                                .parseInt(child.getAttributes().getNamedItem("EndNode").getNodeValue());
306                                                final FileEntityLocation fe = new FileEntityLocation();
307                                                fe.file = filePath;
308                                                fe.start = startchar;
309                                                fe.stop = endchar;
310                                                actual.put(fe, c);
311                                                final String s = textContent.substring(fe.start, fe.stop) + " " + fe.start + ", " + fe.stop;
312                                                r.put(fe.start + fe.stop, s);
313                                                print(s);
314                                        }
315                                }
316                        }
317                }
318                return r;
319        }
320
321        /**
322         * @param path to the Gate document
323         * @return plain text of document.
324         */
325        public static String getRawStringFromTest(String path) {
326                final File f = new File(path);
327                Document doc = null;
328                DocumentBuilderFactory factory = null;
329                DocumentBuilder docBuilder = null;
330                // Initialize XML parsing objects
331                factory = DocumentBuilderFactory.newInstance();
332                docBuilder = null;
333                try {
334                        docBuilder = factory.newDocumentBuilder();
335                } catch (final ParserConfigurationException e) {                        
336                        e.printStackTrace();
337                }
338                try {
339                        doc = docBuilder.parse(f);
340                } catch (final SAXException e) {                        
341                        e.printStackTrace();
342                } catch (final IOException e) {                 
343                        e.printStackTrace();
344                }
345                doc.getDocumentElement().normalize();
346                return doc.getElementsByTagName("TextWithNodes").item(0).getTextContent();
347        }
348        
349        private void print(String message){
350                if(verbose)System.out.println(message);
351                if(logging)
352                        try {
353                                logOut.append(message+"\n");
354                        } catch (IOException e) {
355                                e.printStackTrace();
356                        }
357        }
358
359        /**
360         * An object which uniquely identifies and equates a start/stop in a
361         * specific file.
362         * 
363         * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
364         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
365         * 
366         */
367        public static class FileEntityLocation {
368                String file;
369                int start;
370                int stop;
371
372                @Override
373                public int hashCode() {
374                        final int prime = 31;
375                        int result = 1;
376                        result = prime * result + ((file == null) ? 0 : file.hashCode());
377                        result = prime * result + start;
378                        result = prime * result + stop;
379                        return result;
380                }
381
382                @Override
383                public boolean equals(Object obj) {
384                        if (!(obj instanceof FileEntityLocation))
385                                return false;
386                        final FileEntityLocation comp = (FileEntityLocation) obj;
387                        if (!comp.file.equals(this.file))
388                                return false;
389                        if (comp.start != this.start)
390                                return false;
391                        if (comp.stop != this.stop)
392                                return false;
393                        return true;
394                }
395
396        }
397
398}