Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.textpipe.annotators;
031
032import java.io.IOException;
033import java.io.InputStream;
034import java.util.List;
035
036import opennlp.tools.chunker.ChunkerME;
037import opennlp.tools.chunker.ChunkerModel;
038
039import org.openimaj.text.nlp.textpipe.annotations.AnnotationUtils;
040import org.openimaj.text.nlp.textpipe.annotations.PhraseAnnotation;
041import org.openimaj.text.nlp.textpipe.annotations.PhraseAnnotation.Phrase;
042import org.openimaj.text.nlp.textpipe.annotations.RawTextAnnotation;
043import org.openimaj.text.nlp.textpipe.annotations.TokenAnnotation;
044
045/**
046 * Phrase chunker instantiating a {@link ChunkerME} backed by a {@link ChunkerModel}
047 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
048 *
049 */
050public class OpenNLPPhraseChunkAnnotator extends AbstractPhraseAnnotator {
051        /**
052         * The system property
053         */
054        public static final String PHRASE_MODEL_PROP = "org.openimaj.text.opennlp.models.chunker";
055        ChunkerME chunker;
056
057        /**
058         *
059         */
060        public OpenNLPPhraseChunkAnnotator() {
061                super();
062                InputStream modelIn = null;
063                ChunkerModel model = null;
064                try {
065
066                        modelIn = OpenNLPPhraseChunkAnnotator.class.getResourceAsStream(System.getProperty(PHRASE_MODEL_PROP));
067                        model = new ChunkerModel(modelIn);
068                } catch (IOException e) {
069                        // Model loading failed, handle the error
070                        e.printStackTrace();
071                } finally {
072                        if (modelIn != null) {
073                                try {
074                                        modelIn.close();
075                                } catch (IOException e) {
076                                }
077                        }
078                }
079                chunker = new ChunkerME(model);
080        }
081
082        @Override
083        protected void phraseChunk(List<TokenAnnotation> tokens) {
084                String[] tags = chunker.chunk(AnnotationUtils
085                                .ListToArray(AnnotationUtils
086                                                .getStringTokensFromTokenAnnotationList(tokens)),
087                                AnnotationUtils.ListToArray(AnnotationUtils
088                                                .getStringPOSsFromTokenAnnotationList(tokens)));
089                for (int i = 0; i < tags.length; i++) {
090                        if (tags[i].contains("-")) {
091                                String[] comps = tags[i].split("-");
092                                boolean start = comps[0].equals("B");
093                                tokens.get(i).addAnnotation(
094                                                new PhraseAnnotation(Phrase
095                                                                .getPhrasefromString(comps[1]), start));
096                        }
097                        else tokens.get(i).addAnnotation(
098                                        new PhraseAnnotation(Phrase
099                                                        .getPhrasefromString(tags[i]),true));
100                }
101        }
102
103        @Override
104        void checkForRequiredAnnotations(RawTextAnnotation annotation)
105                        throws MissingRequiredAnnotationException {
106                // TODO Auto-generated method stub
107
108        }
109
110}