001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp; 031 032import gov.sandia.cognition.text.token.DefaultToken; 033import gov.sandia.cognition.text.token.Token; 034 035import java.io.UnsupportedEncodingException; 036import java.util.ArrayList; 037import java.util.Iterator; 038import java.util.List; 039import java.util.Locale; 040import java.util.regex.Matcher; 041import java.util.regex.Pattern; 042 043import org.apache.commons.lang.StringEscapeUtils; 044import org.openimaj.text.nlp.patterns.AbbreviationPatternProvider; 045import org.openimaj.text.nlp.patterns.ComplicatedNumberPatternProvider; 046import org.openimaj.text.nlp.patterns.EmailPatternProvider; 047import org.openimaj.text.nlp.patterns.EmbeddedApostrophePatternProvider; 048import org.openimaj.text.nlp.patterns.EmbeddedDashPatternProvider; 049import org.openimaj.text.nlp.patterns.EmoticonPatternProvider; 050import org.openimaj.text.nlp.patterns.EntityPatternProvider; 051import org.openimaj.text.nlp.patterns.PunctuationPatternProvider; 052import org.openimaj.text.nlp.patterns.TimePatternProvider; 053import org.openimaj.text.nlp.patterns.TruncatedURLPatternProvider; 054import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider; 055import org.openimaj.text.nlp.patterns.URLPatternProvider; 056import org.openimaj.text.util.RegexUtil; 057 058/** 059 * A tokeniser built to work with short text, like that found in twitter. 060 * Protects various elements of the text with an assumption that if the user 061 * made the mark, it was an important mark that carries meaning because of the 062 * relatively high premium of each key stroke. 063 * 064 * Based on the twokenise by Brendan O'Connor 065 * 066 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 067 * 068 */ 069public class TweetTokeniser implements Iterable<Token> { 070 071 private String text; 072 private ArrayList<Token> tokenize; 073 private ArrayList<Token> protectedTokens; 074 private ArrayList<Token> unprotectedTokens; 075 076 private final static Locale[] invalidLanguages = new Locale[] { new Locale("zh"), new Locale("ko"), new Locale("jp"), }; 077 078 /** 079 * Check whether this locale is supported by this tokeniser. The unsupported 080 * languages are those which don't need space characters to delimit words, 081 * namely the CJK languages. 082 * 083 * @param locale 084 * @return true if the local is supported 085 */ 086 public static boolean isValid(Locale locale) { 087 return isValid(locale.getLanguage()); 088 } 089 090 /** 091 * Check whether this locale (specified by the two letter country code, 092 * {@link Locale}) is supported by this tokeniser. The unsupported languages 093 * are those which don't need space characters to delimit words, namely the 094 * CJK languages. 095 * 096 * @param locale 097 * @return true if the local is supported 098 */ 099 public static boolean isValid(String locale) { 100 for (final Locale invalidLocal : invalidLanguages) { 101 if (invalidLocal.getLanguage().equals(locale)) 102 return false; 103 } 104 return true; 105 } 106 107 // public static String regex_or(String ... items ) 108 // { 109 // String r = StringUtils.join(items, "|"); 110 // r = '(' + r + ')'; 111 // return r; 112 // } 113 // public String pos_lookahead(String r){ 114 // return "(?=" + r + ')'; 115 // } 116 // 117 // public static String neg_lookahead(String r) { 118 // return "(?!" + r + ')'; 119 // } 120 // public String optional(String r){ 121 // return String.format("(%s)?",r); 122 // } 123 124 static EmoticonPatternProvider emoticons = new EmoticonPatternProvider(); 125 static PunctuationPatternProvider punctuation = new PunctuationPatternProvider(); 126 static EntityPatternProvider entity = new EntityPatternProvider(); 127 static TruncatedURLPatternProvider truncatedURL = new TruncatedURLPatternProvider(); 128 static URLPatternProvider url = new URLPatternProvider(); 129 static TimePatternProvider time = new TimePatternProvider(); 130 static ComplicatedNumberPatternProvider number = new ComplicatedNumberPatternProvider(); 131 static TwitterStuffPatternProvider twitterPart = new TwitterStuffPatternProvider(); 132 static EmailPatternProvider email = new EmailPatternProvider(); 133 static AbbreviationPatternProvider abbrev = new AbbreviationPatternProvider(entity); 134 private static final String spaceRegex = "\\s+"; 135 static String Separators = RegexUtil.regex_or_match("--+", "\u2015"); 136 static String Decorations = new String(" [\u266b]+ ").replace(" ", ""); 137 static EmbeddedApostrophePatternProvider embedded = new EmbeddedApostrophePatternProvider(punctuation); 138 static EmbeddedDashPatternProvider embeddedDash = new EmbeddedDashPatternProvider(punctuation); 139 140 static String[] ProtectThese = new String[] { 141 twitterPart.patternString(), 142 emoticons.patternString(), 143 truncatedURL.patternString(), 144 url.patternString(), 145 email.patternString(), 146 entity.patternString(), 147 time.patternString(), 148 number.patternString(), 149 // embeddedDash.patternString(), 150 embedded.patternString(), 151 punctuation.patternString(), 152 abbrev.patternString(), 153 Separators, 154 Decorations, 155 }; 156 157 158 159 static String oredProtect = RegexUtil.regex_or_match(ProtectThese); 160 static Pattern Protect_RE = Pattern.compile(oredProtect, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE); 161 162 // static Pattern Protect_RE = twitterPart.pattern(); 163 164 /** 165 * @param s 166 * Tokenise this string 167 * @throws UnsupportedEncodingException 168 * @throws TweetTokeniserException 169 */ 170 public TweetTokeniser(String s) throws UnsupportedEncodingException, TweetTokeniserException { 171 // System.out.println(EdgePunct); 172 // System.out.println(new String("")); 173 this.text = new String(s); 174 // System.out.println("TWEET:" + text); 175 fixEncoding(); 176 squeeze_whitespace(); 177 simple_tokenize(); 178 } 179 180 private void simple_tokenize() throws TweetTokeniserException { 181 this.tokenize = new ArrayList<Token>(); 182 edge_punct_munge(); 183 184 final ArrayList<String> goods = new ArrayList<String>(); 185 final ArrayList<String> bads = new ArrayList<String>(); 186 final ArrayList<Token> res = new ArrayList<Token>(); 187 final ArrayList<Token> goodt = new ArrayList<Token>(); 188 final ArrayList<Token> badt = new ArrayList<Token>(); 189 int i = 0; 190 final Matcher matches = Protect_RE.matcher(this.text); 191 if (matches != null) { 192 while (matches.find()) { 193 final String goodString = this.text.substring(i, matches.start()); 194 goods.add(goodString); 195 final List<Token> goodStrings = unprotected_tokenize(goodString); 196 res.addAll(goodStrings); 197 goodt.addAll(goodStrings); 198 final String badString = this.text.substring(matches.start(), matches.end()); 199 bads.add(badString); 200 final DefaultToken badTok = new DefaultToken(badString, 0); 201 res.add(badTok); 202 badt.add(badTok); 203 i = matches.end(); 204 } 205 final String finalGood = this.text.substring(i, this.text.length()); 206 final List<Token> goodStrings = unprotected_tokenize(finalGood); 207 res.addAll(goodStrings); 208 goodt.addAll(goodStrings); 209 } else { 210 final String goodString = this.text.substring(0, this.text.length()); 211 final List<Token> goodStrings = unprotected_tokenize(goodString); 212 res.addAll(goodStrings); 213 goodt.addAll(goodStrings); 214 } 215 216 this.tokenize = post_process(res); 217 this.protectedTokens = post_process(badt); 218 this.unprotectedTokens = post_process(goodt); 219 } 220 221 private ArrayList<Token> post_process(ArrayList<Token> res) { 222 return res; 223 } 224 225 private List<Token> unprotected_tokenize(String goodString) { 226 final String[] strings = goodString.split("\\s+"); 227 final List<Token> t = new ArrayList<Token>(); 228 for (final String s : strings) { 229 if (s.isEmpty()) 230 continue; 231 t.add(new DefaultToken(s, 0)); 232 } 233 return t; 234 } 235 236 private void edge_punct_munge() { 237 // this.text = EdgePunctuationPatternProvider.fixedges(this.text); 238 } 239 240 private void squeeze_whitespace() { 241 this.text = this.text.replaceAll(spaceRegex, " "); 242 } 243 244 private void fixEncoding() throws UnsupportedEncodingException { 245 this.text = new String(text.getBytes("UTF-8"), "UTF-8"); 246 this.text = StringEscapeUtils.unescapeHtml(this.text); 247 // System.out.println("UTF-8:" + text); 248 } 249 250 @Override 251 public Iterator<Token> iterator() { 252 return this.tokenize.iterator(); 253 } 254 255 /** 256 * @return all the tokens detected (as {@link Token} instances) 257 */ 258 public List<Token> getTokens() { 259 return this.tokenize; 260 } 261 262 /** 263 * @return return all tokens as a {@link List} of {@link String} 264 */ 265 public List<String> getStringTokens() { 266 final List<String> stringTokens = new ArrayList<String>(); 267 for (final Token token : this.tokenize) { 268 stringTokens.add(token.getText()); 269 } 270 return stringTokens; 271 } 272 273 /** 274 * @return return all tokens protected by the Twokenizer regex 275 */ 276 public List<String> getProtectedStringTokens() { 277 final List<String> stringTokens = new ArrayList<String>(); 278 for (final Token token : this.protectedTokens) { 279 stringTokens.add(token.getText()); 280 } 281 return stringTokens; 282 } 283 284 /** 285 * @return return all the tokens not protected by the Twokenizer regex 286 */ 287 public List<String> getUnprotectedStringTokens() { 288 final List<String> stringTokens = new ArrayList<String>(); 289 for (final Token token : this.unprotectedTokens) { 290 stringTokens.add(token.getText()); 291 } 292 return stringTokens; 293 } 294 295}