Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp;
031
032import gov.sandia.cognition.text.token.DefaultToken;
033import gov.sandia.cognition.text.token.Token;
034
035import java.io.UnsupportedEncodingException;
036import java.util.ArrayList;
037import java.util.Iterator;
038import java.util.List;
039import java.util.Locale;
040import java.util.regex.Matcher;
041import java.util.regex.Pattern;
042
043import org.apache.commons.lang.StringEscapeUtils;
044import org.openimaj.text.nlp.patterns.AbbreviationPatternProvider;
045import org.openimaj.text.nlp.patterns.ComplicatedNumberPatternProvider;
046import org.openimaj.text.nlp.patterns.EmailPatternProvider;
047import org.openimaj.text.nlp.patterns.EmbeddedApostrophePatternProvider;
048import org.openimaj.text.nlp.patterns.EmbeddedDashPatternProvider;
049import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
050import org.openimaj.text.nlp.patterns.EntityPatternProvider;
051import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
052import org.openimaj.text.nlp.patterns.TimePatternProvider;
053import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
054import org.openimaj.text.nlp.patterns.URLPatternProvider;
055import org.openimaj.text.util.RegexUtil;
056
057
058
059/**
060 * A tokeniser built to work with short text, like that found in twitter.
061 * Protects various elements of the text with an assumption that if the user made the mark, it was an important mark that carries meaning
062 * because of the relatively high premium of each key stroke.
063 * 
064 * Based on the twokenise by Brendan O'Connor 
065 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
066 *
067 */
068public class EntityTweetTokeniser implements Iterable<Token>{
069        
070        
071        private String text;
072        private ArrayList<Token> tokenize;
073        private ArrayList<Token> protectedTokens;
074        private ArrayList<Token> unprotectedTokens;
075        
076        private final static Locale[] invalidLanguages = new Locale[]{
077                new Locale("zh"),
078                new Locale("ko"),
079                new Locale("jp"),
080        };
081        
082        
083        /**
084         * Check whether this locale is supported by this tokeniser. The unsupported languages are those which don't need space
085         * characters to delimit words, namely the CJK languages.
086         * @param locale
087         * @return true if the local is supported
088         */
089        public static boolean isValid(Locale locale){
090                return isValid(locale.getLanguage());
091        }
092        /**
093         * Check whether this locale (specified by the two letter country code, {@link Locale}) is
094         * supported by this tokeniser. The unsupported languages are those which don't need space
095         * characters to delimit words, namely the CJK languages.
096         * @param locale
097         * @return true if the local is supported
098         */
099        public static boolean isValid(String locale){
100                for (Locale invalidLocal: invalidLanguages) {
101                        if(invalidLocal.getLanguage().equals(locale)) return false;
102                }
103                return true;
104        }
105        
106        
107//      public static String regex_or(String ... items )
108//      {
109//              String r = StringUtils.join(items, "|");
110//              r = '(' + r + ')';
111//              return r;
112//      }
113//      public String pos_lookahead(String r){
114//              return "(?=" + r + ')';
115//      }
116//              
117//      public static String neg_lookahead(String r) {
118//              return "(?!" + r + ')';
119//      }
120//      public String optional(String r){
121//              return String.format("(%s)?",r);
122//      }
123        
124        static EmoticonPatternProvider emoticons = new EmoticonPatternProvider();
125        static PunctuationPatternProvider punctuation = new PunctuationPatternProvider();
126        static EntityPatternProvider entity = new EntityPatternProvider();
127        static URLPatternProvider url = new URLPatternProvider();
128        static TimePatternProvider time = new TimePatternProvider();
129        static ComplicatedNumberPatternProvider number = new ComplicatedNumberPatternProvider();
130        static TwitterStuffPatternProvider twitterPart = new TwitterStuffPatternProvider();
131        static EmailPatternProvider email = new EmailPatternProvider();
132        static AbbreviationPatternProvider abbrev = new AbbreviationPatternProvider(entity);
133        private static final String spaceRegex = "\\s+";
134        static String Separators = RegexUtil.regex_or_match("--+", "\u2015");
135        static String Decorations = new String(" [\u266b]+ ").replace(" ","");
136        static EmbeddedApostrophePatternProvider embedded = new EmbeddedApostrophePatternProvider(punctuation);
137        static EmbeddedDashPatternProvider embeddedDash = new EmbeddedDashPatternProvider(punctuation);
138        
139        
140        static String [] ProtectThese = new String[]{
141                        twitterPart.patternString(),
142                        emoticons.patternString(),
143                        url.patternString(),
144                        email.patternString(),
145                        entity.patternString(),
146                        time.patternString(),
147                        number.patternString(),
148//                      embeddedDash.patternString(),
149//                      embedded.patternString(),
150                        punctuation.patternString(),
151                        abbrev.patternString(),
152                        Separators,
153                        Decorations,
154        };
155        static String oredProtect = RegexUtil.regex_or_match(ProtectThese);
156        static Pattern Protect_RE = Pattern.compile(oredProtect,Pattern.UNICODE_CASE|Pattern.CASE_INSENSITIVE);
157//      static Pattern Protect_RE = twitterPart.pattern();
158        
159        
160        /**
161         * @param s Tokenise this string
162         * @throws UnsupportedEncodingException
163         * @throws TweetTokeniserException
164         */
165        public EntityTweetTokeniser(String s) throws UnsupportedEncodingException, TweetTokeniserException{
166//              System.out.println(EdgePunct);
167//              System.out.println(new String(""));
168                this.text = new String(s);
169//              System.out.println("TWEET:" + text);
170                fixEncoding();
171                squeeze_whitespace();
172                simple_tokenize();
173        }
174        
175        private void simple_tokenize() throws TweetTokeniserException {
176                this.tokenize = new ArrayList<Token>();
177                edge_punct_munge();
178                
179                ArrayList<String> goods = new ArrayList<String>();
180                ArrayList<String> bads = new ArrayList<String>();
181                ArrayList<Token> res = new ArrayList<Token>();
182                ArrayList<Token> goodt = new ArrayList<Token>();
183                ArrayList<Token> badt = new ArrayList<Token>();
184                int i = 0;
185                Matcher matches = Protect_RE.matcher(this.text);
186                if(matches!=null)
187                {
188                        while(matches.find()) {
189                                String goodString = this.text.substring(i,matches.start());
190                                goods.add(goodString);
191                                List<Token> goodStrings = unprotected_tokenize(goodString);
192                                res.addAll(goodStrings);
193                                goodt.addAll(goodStrings);
194                                String badString = this.text.substring(matches.start(),matches.end());
195                                bads.add(badString);
196                                DefaultToken badTok = new DefaultToken(badString,0);
197                                res.add(badTok);
198                                badt.add(badTok);
199                                i = matches.end();
200                        }
201                        String finalGood =  this.text.substring(i, this.text.length());
202                        List<Token> goodStrings = unprotected_tokenize(finalGood);
203                        res.addAll(goodStrings);
204                        goodt.addAll(goodStrings);
205                }
206                else
207                {
208                        String goodString = this.text.substring(0, this.text.length());
209                        List<Token> goodStrings = unprotected_tokenize(goodString);
210                        res.addAll(goodStrings);
211                        goodt.addAll(goodStrings);
212                }       
213                
214                
215                this.tokenize = post_process(res);
216                this.protectedTokens = post_process(badt);
217                this.unprotectedTokens = post_process(goodt);
218        }
219
220        private ArrayList<Token> post_process(ArrayList<Token> res) {
221                return res;
222        }
223        private List<Token> unprotected_tokenize(String goodString) {
224                String[] strings = goodString.split("\\s+");
225                List<Token> t = new ArrayList<Token>();
226                for (String s : strings) {
227                        if(s.isEmpty()) continue;
228                        t.add(new DefaultToken(s, 0));
229                }
230                return t;
231        }
232        private void edge_punct_munge() {
233//              this.text = EdgePunctuationPatternProvider.fixedges(this.text);
234        }
235
236        private void squeeze_whitespace() {
237                this.text = this.text.replaceAll(spaceRegex, " ");
238        }
239
240        private void fixEncoding() throws UnsupportedEncodingException {
241                this.text = new String(text.getBytes("UTF-8"),"UTF-8");
242                this.text = StringEscapeUtils.unescapeHtml(this.text);
243//              System.out.println("UTF-8:" + text);
244        }
245        @Override
246        public Iterator<Token> iterator() {
247                return this.tokenize.iterator();
248        }
249        
250        public List<Token> getTokens(){
251                return this.tokenize;
252        }
253        
254        public List<String> getStringTokens(){
255                List<String> stringTokens = new ArrayList<String>();
256                for (Token token : this.tokenize) {
257                        stringTokens.add(token.getText());
258                }
259                return stringTokens;
260        }
261
262        public List<String> getProtectedStringTokens() {
263                List<String> stringTokens = new ArrayList<String>();
264                for (Token token : this.protectedTokens) {
265                        stringTokens.add(token.getText());
266                }
267                return stringTokens;
268        }
269        
270        public List<String> getUnprotectedStringTokens() {
271                List<String> stringTokens = new ArrayList<String>();
272                for (Token token : this.unprotectedTokens) {
273                        stringTokens.add(token.getText());
274                }
275                return stringTokens;
276        }
277
278}