/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.text.nlp;

import gov.sandia.cognition.text.token.DefaultToken;
import gov.sandia.cognition.text.token.Token;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.openimaj.text.nlp.patterns.AbbreviationPatternProvider;
import org.openimaj.text.nlp.patterns.ComplicatedNumberPatternProvider;
import org.openimaj.text.nlp.patterns.EmailPatternProvider;
import org.openimaj.text.nlp.patterns.EmbeddedApostrophePatternProvider;
import org.openimaj.text.nlp.patterns.EmbeddedDashPatternProvider;
import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
import org.openimaj.text.nlp.patterns.EntityPatternProvider;
import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
import org.openimaj.text.nlp.patterns.TimePatternProvider;
import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
import org.openimaj.text.nlp.patterns.URLPatternProvider;
import org.openimaj.text.util.RegexUtil;

/**
 * A tokeniser built to work with short text, like that found on Twitter.
 * Various elements of the text are protected (kept as single tokens) on the
 * assumption that, because each key stroke carries a relatively high premium
 * in short text, any mark the user made is important and carries meaning.
 * <p>
 * The text is HTML-unescaped and has its whitespace runs collapsed to single
 * spaces; every match of the combined "protect" pattern then becomes one
 * token, and the text between matches is split on whitespace.
 * <p>
 * Based on the twokenise by Brendan O'Connor
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class EntityTweetTokeniser implements Iterable<Token> {

	/** The working copy of the text; rewritten by the normalisation steps. */
	private String text;
	/** All tokens (protected and unprotected) in order of appearance. */
	private ArrayList<Token> tokenize;
	/** Only the tokens produced by a match of {@link #Protect_RE}. */
	private ArrayList<Token> protectedTokens;
	/** Only the tokens produced by whitespace-splitting the unmatched text. */
	private ArrayList<Token> unprotectedTokens;

	/**
	 * Languages this tokeniser does not support. "ja" is the ISO 639 language
	 * code for Japanese (the one {@link Locale#getLanguage()} reports for
	 * {@link Locale#JAPANESE}); "jp" is the code this list originally carried
	 * and is retained so callers passing it still get the same answer.
	 */
	private final static Locale[] invalidLanguages = new Locale[] {
		new Locale("zh"),
		new Locale("ko"),
		new Locale("ja"),
		new Locale("jp"),
	};

	/**
	 * Check whether this locale is supported by this tokeniser. The
	 * unsupported languages are those which don't need space characters to
	 * delimit words, namely the CJK languages.
	 *
	 * @param locale the locale to check; only its language part is considered
	 * @return true if the locale is supported
	 */
	public static boolean isValid(Locale locale) {
		return isValid(locale.getLanguage());
	}

	/**
	 * Check whether this language (specified by its lower-case two letter
	 * language code, as returned by {@link Locale#getLanguage()}) is
	 * supported by this tokeniser. The unsupported languages are those which
	 * don't need space characters to delimit words, namely the CJK languages.
	 *
	 * @param locale the language code to check; compared by exact match
	 * @return true if the language is supported
	 */
	public static boolean isValid(String locale) {
		for (Locale invalidLocal : invalidLanguages) {
			if (invalidLocal.getLanguage().equals(locale)) {
				return false;
			}
		}
		return true;
	}

	static EmoticonPatternProvider emoticons = new EmoticonPatternProvider();
	static PunctuationPatternProvider punctuation = new PunctuationPatternProvider();
	static EntityPatternProvider entity = new EntityPatternProvider();
	static URLPatternProvider url = new URLPatternProvider();
	static TimePatternProvider time = new TimePatternProvider();
	static ComplicatedNumberPatternProvider number = new ComplicatedNumberPatternProvider();
	static TwitterStuffPatternProvider twitterPart = new TwitterStuffPatternProvider();
	static EmailPatternProvider email = new EmailPatternProvider();
	static AbbreviationPatternProvider abbrev = new AbbreviationPatternProvider(entity);
	private static final String spaceRegex = "\\s+";
	/** Compiled once so whitespace handling does not recompile the regex per call. */
	private static final Pattern WHITESPACE = Pattern.compile(spaceRegex);
	static String Separators = RegexUtil.regex_or_match("--+", "\u2015");
	static String Decorations = "[\u266b]+";
	static EmbeddedApostrophePatternProvider embedded = new EmbeddedApostrophePatternProvider(punctuation);
	static EmbeddedDashPatternProvider embeddedDash = new EmbeddedDashPatternProvider(punctuation);

	/**
	 * The pattern strings whose matches are protected, in the order they are
	 * handed to {@link RegexUtil#regex_or_match}.
	 */
	static String[] ProtectThese = new String[] {
		twitterPart.patternString(),
		emoticons.patternString(),
		url.patternString(),
		email.patternString(),
		entity.patternString(),
		time.patternString(),
		number.patternString(),
		// The embedded dash/apostrophe patterns are currently not protected:
//		embeddedDash.patternString(),
//		embedded.patternString(),
		punctuation.patternString(),
		abbrev.patternString(),
		Separators,
		Decorations,
	};
	static String oredProtect = RegexUtil.regex_or_match(ProtectThese);
	static Pattern Protect_RE = Pattern.compile(oredProtect, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE);

	/**
	 * Normalise and tokenise the given string. The tokens are available
	 * afterwards through {@link #getTokens()}, {@link #iterator()} and the
	 * string-token accessors.
	 *
	 * @param s Tokenise this string
	 * @throws UnsupportedEncodingException if the UTF-8 charset is unavailable
	 * @throws TweetTokeniserException declared by the tokenisation step
	 * @throws NullPointerException if s is null
	 */
	public EntityTweetTokeniser(String s) throws UnsupportedEncodingException, TweetTokeniserException {
		if (s == null) {
			throw new NullPointerException("s must not be null");
		}
		// Strings are immutable, so no defensive copy is needed.
		this.text = s;
		fixEncoding();
		squeeze_whitespace();
		simple_tokenize();
	}

	/**
	 * Walk the matches of {@link #Protect_RE} over the text. Each match
	 * becomes a single protected token; the text between matches (and after
	 * the last one) is split on whitespace into unprotected tokens.
	 */
	private void simple_tokenize() throws TweetTokeniserException {
		edge_punct_munge();

		ArrayList<Token> allTokens = new ArrayList<Token>();
		ArrayList<Token> unprotectedFound = new ArrayList<Token>();
		ArrayList<Token> protectedFound = new ArrayList<Token>();

		// End offset of the previous match, i.e. where the next gap starts.
		int cursor = 0;
		// Pattern.matcher never returns null, so no null branch is required.
		Matcher matches = Protect_RE.matcher(this.text);
		while (matches.find()) {
			List<Token> gapTokens = unprotected_tokenize(this.text.substring(cursor, matches.start()));
			allTokens.addAll(gapTokens);
			unprotectedFound.addAll(gapTokens);

			// All tokens are created with a start index of 0.
			DefaultToken protectedToken = new DefaultToken(matches.group(), 0);
			allTokens.add(protectedToken);
			protectedFound.add(protectedToken);

			cursor = matches.end();
		}

		// Whatever follows the final match (or the whole text if none matched).
		List<Token> tailTokens = unprotected_tokenize(this.text.substring(cursor));
		allTokens.addAll(tailTokens);
		unprotectedFound.addAll(tailTokens);

		this.tokenize = post_process(allTokens);
		this.protectedTokens = post_process(protectedFound);
		this.unprotectedTokens = post_process(unprotectedFound);
	}

	/**
	 * Hook applied to each token list before it is stored; currently returns
	 * its argument unchanged.
	 */
	private ArrayList<Token> post_process(ArrayList<Token> res) {
		return res;
	}

	/**
	 * Split a stretch of unprotected text on whitespace, dropping empty
	 * pieces (such as the one a leading space produces).
	 */
	private List<Token> unprotected_tokenize(String goodString) {
		List<Token> tokens = new ArrayList<Token>();
		for (String piece : WHITESPACE.split(goodString)) {
			if (piece.isEmpty()) {
				continue;
			}
			tokens.add(new DefaultToken(piece, 0));
		}
		return tokens;
	}

	/** Currently a no-op: the edge punctuation fix below is disabled. */
	private void edge_punct_munge() {
//		this.text = EdgePunctuationPatternProvider.fixedges(this.text);
	}

	/** Replace every run of whitespace in the text with a single space. */
	private void squeeze_whitespace() {
		this.text = WHITESPACE.matcher(this.text).replaceAll(" ");
	}

	/**
	 * Round-trip the text through its UTF-8 bytes and then unescape HTML
	 * entities.
	 * NOTE(review): the round trip presumably exists to replace characters
	 * that cannot be encoded (e.g. unpaired surrogates) - confirm.
	 */
	private void fixEncoding() throws UnsupportedEncodingException {
		this.text = new String(text.getBytes("UTF-8"), "UTF-8");
		this.text = StringEscapeUtils.unescapeHtml(this.text);
	}

	@Override
	public Iterator<Token> iterator() {
		return this.tokenize.iterator();
	}

	/**
	 * @return all tokens in order of appearance; this is the tokeniser's own
	 *         list, not a copy
	 */
	public List<Token> getTokens() {
		return this.tokenize;
	}

	/**
	 * @return the text of every token, in order of appearance, as a new list
	 */
	public List<String> getStringTokens() {
		return textsOf(this.tokenize);
	}

	/**
	 * @return the text of every protected token (those matched by the protect
	 *         pattern), in order of appearance, as a new list
	 */
	public List<String> getProtectedStringTokens() {
		return textsOf(this.protectedTokens);
	}

	/**
	 * @return the text of every unprotected token (those found by whitespace
	 *         splitting), in order of appearance, as a new list
	 */
	public List<String> getUnprotectedStringTokens() {
		return textsOf(this.unprotectedTokens);
	}

	/** Collect the text of each token into a fresh list. */
	private static List<String> textsOf(List<Token> tokens) {
		List<String> stringTokens = new ArrayList<String>(tokens.size());
		for (Token token : tokens) {
			stringTokens.add(token.getText());
		}
		return stringTokens;
	}

}