Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp;
031
032import gov.sandia.cognition.text.token.DefaultToken;
033import gov.sandia.cognition.text.token.Token;
034
035import java.io.UnsupportedEncodingException;
036import java.util.ArrayList;
037import java.util.Iterator;
038import java.util.List;
039import java.util.Locale;
040import java.util.regex.Matcher;
041import java.util.regex.Pattern;
042
043import org.apache.commons.lang.StringEscapeUtils;
044import org.openimaj.text.nlp.patterns.AbbreviationPatternProvider;
045import org.openimaj.text.nlp.patterns.ComplicatedNumberPatternProvider;
046import org.openimaj.text.nlp.patterns.EmailPatternProvider;
047import org.openimaj.text.nlp.patterns.EmbeddedApostrophePatternProvider;
048import org.openimaj.text.nlp.patterns.EmbeddedDashPatternProvider;
049import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
050import org.openimaj.text.nlp.patterns.EntityPatternProvider;
051import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
052import org.openimaj.text.nlp.patterns.TimePatternProvider;
053import org.openimaj.text.nlp.patterns.TruncatedURLPatternProvider;
054import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
055import org.openimaj.text.nlp.patterns.URLPatternProvider;
056import org.openimaj.text.util.RegexUtil;
057
058/**
059 * A tokeniser built to work with short text, like that found in twitter.
060 * Protects various elements of the text with an assumption that if the user
061 * made the mark, it was an important mark that carries meaning because of the
062 * relatively high premium of each key stroke.
063 *
064 * Based on the twokenise by Brendan O'Connor
065 *
066 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
067 *
068 */
069public class TweetTokeniser implements Iterable<Token> {
070
071        private String text;
072        private ArrayList<Token> tokenize;
073        private ArrayList<Token> protectedTokens;
074        private ArrayList<Token> unprotectedTokens;
075
076        private final static Locale[] invalidLanguages = new Locale[] { new Locale("zh"), new Locale("ko"), new Locale("jp"), };
077
078        /**
079         * Check whether this locale is supported by this tokeniser. The unsupported
080         * languages are those which don't need space characters to delimit words,
081         * namely the CJK languages.
082         *
083         * @param locale
084         * @return true if the local is supported
085         */
086        public static boolean isValid(Locale locale) {
087                return isValid(locale.getLanguage());
088        }
089
090        /**
091         * Check whether this locale (specified by the two letter country code,
092         * {@link Locale}) is supported by this tokeniser. The unsupported languages
093         * are those which don't need space characters to delimit words, namely the
094         * CJK languages.
095         *
096         * @param locale
097         * @return true if the local is supported
098         */
099        public static boolean isValid(String locale) {
100                for (final Locale invalidLocal : invalidLanguages) {
101                        if (invalidLocal.getLanguage().equals(locale))
102                                return false;
103                }
104                return true;
105        }
106
107        // public static String regex_or(String ... items )
108        // {
109        // String r = StringUtils.join(items, "|");
110        // r = '(' + r + ')';
111        // return r;
112        // }
113        // public String pos_lookahead(String r){
114        // return "(?=" + r + ')';
115        // }
116        //
117        // public static String neg_lookahead(String r) {
118        // return "(?!" + r + ')';
119        // }
120        // public String optional(String r){
121        // return String.format("(%s)?",r);
122        // }
123
124        static EmoticonPatternProvider emoticons = new EmoticonPatternProvider();
125        static PunctuationPatternProvider punctuation = new PunctuationPatternProvider();
126        static EntityPatternProvider entity = new EntityPatternProvider();
127        static TruncatedURLPatternProvider truncatedURL = new TruncatedURLPatternProvider();
128        static URLPatternProvider url = new URLPatternProvider();
129        static TimePatternProvider time = new TimePatternProvider();
130        static ComplicatedNumberPatternProvider number = new ComplicatedNumberPatternProvider();
131        static TwitterStuffPatternProvider twitterPart = new TwitterStuffPatternProvider();
132        static EmailPatternProvider email = new EmailPatternProvider();
133        static AbbreviationPatternProvider abbrev = new AbbreviationPatternProvider(entity);
134        private static final String spaceRegex = "\\s+";
135        static String Separators = RegexUtil.regex_or_match("--+", "\u2015");
136        static String Decorations = new String(" [\u266b]+ ").replace(" ", "");
137        static EmbeddedApostrophePatternProvider embedded = new EmbeddedApostrophePatternProvider(punctuation);
138        static EmbeddedDashPatternProvider embeddedDash = new EmbeddedDashPatternProvider(punctuation);
139
140        static String[] ProtectThese = new String[] {
141                twitterPart.patternString(),
142                emoticons.patternString(),
143                truncatedURL.patternString(),
144                url.patternString(),
145                email.patternString(),
146                entity.patternString(),
147                time.patternString(),
148                number.patternString(),
149                // embeddedDash.patternString(),
150                embedded.patternString(),
151                punctuation.patternString(),
152                abbrev.patternString(),
153                Separators,
154                Decorations,
155        };
156
157
158
159        static String oredProtect = RegexUtil.regex_or_match(ProtectThese);
160        static Pattern Protect_RE = Pattern.compile(oredProtect, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE);
161
162        // static Pattern Protect_RE = twitterPart.pattern();
163
164        /**
165         * @param s
166         *            Tokenise this string
167         * @throws UnsupportedEncodingException
168         * @throws TweetTokeniserException
169         */
170        public TweetTokeniser(String s) throws UnsupportedEncodingException, TweetTokeniserException {
171                // System.out.println(EdgePunct);
172                // System.out.println(new String(""));
173                this.text = new String(s);
174                // System.out.println("TWEET:" + text);
175                fixEncoding();
176                squeeze_whitespace();
177                simple_tokenize();
178        }
179
180        private void simple_tokenize() throws TweetTokeniserException {
181                this.tokenize = new ArrayList<Token>();
182                edge_punct_munge();
183
184                final ArrayList<String> goods = new ArrayList<String>();
185                final ArrayList<String> bads = new ArrayList<String>();
186                final ArrayList<Token> res = new ArrayList<Token>();
187                final ArrayList<Token> goodt = new ArrayList<Token>();
188                final ArrayList<Token> badt = new ArrayList<Token>();
189                int i = 0;
190                final Matcher matches = Protect_RE.matcher(this.text);
191                if (matches != null) {
192                        while (matches.find()) {
193                                final String goodString = this.text.substring(i, matches.start());
194                                goods.add(goodString);
195                                final List<Token> goodStrings = unprotected_tokenize(goodString);
196                                res.addAll(goodStrings);
197                                goodt.addAll(goodStrings);
198                                final String badString = this.text.substring(matches.start(), matches.end());
199                                bads.add(badString);
200                                final DefaultToken badTok = new DefaultToken(badString, 0);
201                                res.add(badTok);
202                                badt.add(badTok);
203                                i = matches.end();
204                        }
205                        final String finalGood = this.text.substring(i, this.text.length());
206                        final List<Token> goodStrings = unprotected_tokenize(finalGood);
207                        res.addAll(goodStrings);
208                        goodt.addAll(goodStrings);
209                } else {
210                        final String goodString = this.text.substring(0, this.text.length());
211                        final List<Token> goodStrings = unprotected_tokenize(goodString);
212                        res.addAll(goodStrings);
213                        goodt.addAll(goodStrings);
214                }
215
216                this.tokenize = post_process(res);
217                this.protectedTokens = post_process(badt);
218                this.unprotectedTokens = post_process(goodt);
219        }
220
221        private ArrayList<Token> post_process(ArrayList<Token> res) {
222                return res;
223        }
224
225        private List<Token> unprotected_tokenize(String goodString) {
226                final String[] strings = goodString.split("\\s+");
227                final List<Token> t = new ArrayList<Token>();
228                for (final String s : strings) {
229                        if (s.isEmpty())
230                                continue;
231                        t.add(new DefaultToken(s, 0));
232                }
233                return t;
234        }
235
236        private void edge_punct_munge() {
237                // this.text = EdgePunctuationPatternProvider.fixedges(this.text);
238        }
239
240        private void squeeze_whitespace() {
241                this.text = this.text.replaceAll(spaceRegex, " ");
242        }
243
244        private void fixEncoding() throws UnsupportedEncodingException {
245                this.text = new String(text.getBytes("UTF-8"), "UTF-8");
246                this.text = StringEscapeUtils.unescapeHtml(this.text);
247                // System.out.println("UTF-8:" + text);
248        }
249
250        @Override
251        public Iterator<Token> iterator() {
252                return this.tokenize.iterator();
253        }
254
255        /**
256         * @return all the tokens detected (as {@link Token} instances)
257         */
258        public List<Token> getTokens() {
259                return this.tokenize;
260        }
261
262        /**
263         * @return return all tokens as a {@link List} of {@link String}
264         */
265        public List<String> getStringTokens() {
266                final List<String> stringTokens = new ArrayList<String>();
267                for (final Token token : this.tokenize) {
268                        stringTokens.add(token.getText());
269                }
270                return stringTokens;
271        }
272
273        /**
274         * @return return all tokens protected by the Twokenizer regex
275         */
276        public List<String> getProtectedStringTokens() {
277                final List<String> stringTokens = new ArrayList<String>();
278                for (final Token token : this.protectedTokens) {
279                        stringTokens.add(token.getText());
280                }
281                return stringTokens;
282        }
283
284        /**
285         * @return return all the tokens not protected by the Twokenizer regex
286         */
287        public List<String> getUnprotectedStringTokens() {
288                final List<String> stringTokens = new ArrayList<String>();
289                for (final Token token : this.unprotectedTokens) {
290                        stringTokens.add(token.getText());
291                }
292                return stringTokens;
293        }
294
295}