001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import java.io.BufferedReader;
033import java.io.DataInputStream;
034import java.io.InputStream;
035import java.io.InputStreamReader;
036import java.util.ArrayList;
037import java.util.HashSet;
038import java.util.List;
039import java.util.regex.Matcher;
040import java.util.regex.Pattern;
041
/**
 * Class to remove stopwords from a list of tokens, or to check if a word is a
 * stopword.
 * <p>
 * A token is ignored when it is listed in one of the classpath word-list
 * resources loaded for the chosen language, when it parses as a number, or
 * when it is a number written out in English words (up to nine hundred and
 * ninety nine).
 * 
 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 */
public class IgnoreTokenStripper {
        /**
         * Language to build stripper from.
         */
        public enum Language {
                English
        }

        /** Classpath locations of the word lists loaded for {@link Language#English}. */
        private static final String[] ENGLISH_LISTS = {
                        "/org/openimaj/text/stopwords/en_stopwords.txt",
                        "/org/openimaj/text/stopwords/en_nouns.txt",
                        "/org/openimaj/text/stopwords/en_countries.txt"
        };

        private static final String UNITS = "one|two|three|four|five|six|seven|eight|nine";
        private static final String TENS = "twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety";
        private static final String TEENS = "ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen";
        /** Optional whitespace and/or a single hyphen between number words. */
        private static final String SEP = "\\s*-?\\s*";
        private static final String AND = SEP + "and" + SEP;
        /**
         * One to ninety nine: a teen, a unit, or a tens word optionally followed
         * by a unit. Non-capturing groups are required here; square brackets
         * would build a character class rather than group the alternatives.
         */
        private static final String TO_NN = "(?:" + TEENS + "|" + UNITS
                        + "|(?:" + TENS + ")(?:" + SEP + "(?:" + UNITS + "))?)";
        /**
         * One to nine hundred and ninety nine. The "and" after "hundred" is
         * optional so that "one hundred twenty" is recognised as well.
         */
        private static final String TO_NNN = "(?:" + TO_NN
                        + "|(?:" + UNITS + ")" + SEP + "hundred(?:(?:" + AND + "|" + SEP + ")" + TO_NN + ")?)";
        /*
         * This currently recognizes written numbers up to nine hundred and ninety
         * nine. Matching is case-insensitive so no locale-dependent lower-casing
         * of the token is needed.
         */
        private static final Pattern WRITTEN_NUMBERS = Pattern.compile(TO_NNN, Pattern.CASE_INSENSITIVE);

        private final HashSet<String> ignoreTokens;

        /**
         * Constructor for specified language. Loading is best-effort: a word
         * list that is missing or unreadable is reported on {@code System.err}
         * and skipped.
         * 
         * @param language
         *            the language whose word lists are loaded; must not be null
         */
        public IgnoreTokenStripper(Language language) {
                if (language == null) {
                        throw new NullPointerException("language must not be null");
                }
                this.ignoreTokens = new HashSet<String>();
                for (final String resource : getListResources(language)) {
                        addToIgnoreSet(resource);
                }
        }

        /**
         * Reads one word-list resource, adding each trimmed line to the ignore
         * set.
         * 
         * @param resource
         *            absolute classpath location of the list
         */
        private void addToIgnoreSet(String resource) {
                final InputStream fstream = this.getClass().getResourceAsStream(resource);
                if (fstream == null) {
                        System.err.println("Error: ignore list not found on classpath: " + resource);
                        return;
                }
                try {
                        // Explicit charset so the result does not depend on the platform default.
                        final BufferedReader br = new BufferedReader(new InputStreamReader(fstream, "UTF-8"));
                        String strLine;
                        // Read File Line By Line
                        while ((strLine = br.readLine()) != null) {
                                ignoreTokens.add(strLine.trim());
                        }
                } catch (final java.io.IOException e) {
                        System.err.println("Error: could not read ignore list " + resource + ": " + e.getMessage());
                } finally {
                        // Always release the stream, even when reading failed part-way.
                        try {
                                fstream.close();
                        } catch (final java.io.IOException ignored) {
                                // nothing useful can be done if close fails
                        }
                }
        }

        /**
         * @param language
         *            the requested language
         * @return the classpath locations of the word lists for the language;
         *         empty (never null) when the language has no lists
         */
        private String[] getListResources(Language language) {
                if (language == Language.English) {
                        return ENGLISH_LISTS;
                }
                return new String[0];
        }

        /**
         * Strips given list of tokens of all ignore words.
         * 
         * @param intokens
         *            tokens to filter; neither the list nor its elements may be
         *            null
         * @return list of clean tokens, in their original order.
         */
        public ArrayList<String> getNonStopWords(List<String> intokens) {
                final ArrayList<String> result = new ArrayList<String>();
                for (final String string : intokens) {
                        if (!isIgnoreToken(string)) {
                                result.add(string);
                        }
                }
                return result;
        }

        /**
         * Checks if given token is an ignore word. The word-list lookup is
         * case-sensitive; written numbers are matched case-insensitively.
         * 
         * @param token
         *            the token to test; must not be null
         * @return true if ignore Token
         */
        public boolean isIgnoreToken(String token) {
                // check if in ignore list
                if (ignoreTokens.contains(token)) {
                        return true;
                }
                // check if it is a number (anything Double.parseDouble accepts)
                try {
                        Double.parseDouble(token);
                        return true;
                } catch (final NumberFormatException notNumeric) {
                        // not a numeric literal; fall through to the written-number check
                }
                // check if it is a number written as a word
                return WRITTEN_NUMBERS.matcher(token).matches();
        }
}