001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.patterns;
031
032import java.util.regex.Pattern;
033
034import org.openimaj.text.util.RegexUtil;
035
036
037
038/**
039 * Borrowed heavily from https://github.com/twitter/twitter-text-java
040 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
041 *
042 */
043public class TwitterStuffPatternProvider extends PatternProvider{
044
045        // These constants were lifted directly from the twitter regex class file mentioned above
046        private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
047        private static final String HASHTAG_ALPHA_CHARS = "a-zA-Z" + LATIN_ACCENTS_CHARS +
048    "\\u0400-\\u04ff\\u0500-\\u0527" +  // Cyrillic
049    "\\u2de0-\\u2dff\\ua640-\\ua69f" +  // Cyrillic Extended A/B
050    "\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
051    "\\p{InHiragana}\\p{InKatakana}" +  // Japanese Hiragana and Katakana
052    "\\p{InCJKUnifiedIdeographs}" +     // Japanese Kanji / Chinese Han
053    "\\u3005\\u303b" +                  // Kanji/Han iteration marks
054    "\\uff21-\\uff3a\\uff41-\\uff5a" +  // full width Alphabet
055    "\\uff66-\\uff9f" +                 // half width Katakana
056    "\\uffa1-\\uffdc";                  // half width Hangul (Korean)
057        private static final String HASHTAG_ALPHA_NUMERIC_CHARS = "0-9\\uff10-\\uff19_-" + HASHTAG_ALPHA_CHARS;
058        private static final String HASHTAG_ALPHA = "[" + HASHTAG_ALPHA_CHARS +"]";
059        private static final String HASHTAG_ALPHA_NUMERIC = "[" + HASHTAG_ALPHA_NUMERIC_CHARS +"]";
060        private static String AT_SIGNS_CHARS = "@\uFF20";
061        private  static final Pattern AT_SIGNS = Pattern.compile("[" + AT_SIGNS_CHARS + "]");
062
063
064        String linkHashtag = "(?:#|\uFF03)(?:" + HASHTAG_ALPHA_NUMERIC + "*" + HASHTAG_ALPHA + HASHTAG_ALPHA_NUMERIC + "*)";
065        String linkUsernames = "(?:" + AT_SIGNS + "+)([a-z0-9_]{1,20})(/[a-z][a-z0-9_\\-]{0,24})?(?=[^a-zA-Z0-9_])";
066        String retweet = "(?:(\\b)RT:?(\\b))";
067
068        @Override
069        public String patternString() {
070                return RegexUtil.regex_or_match(linkUsernames,linkHashtag,retweet);
071        }
072
073        @Override
074        public Pattern pattern(){
075                return Pattern.compile(patternString(), Pattern.CASE_INSENSITIVE);
076        }
077
078        /**
079         * @return the hashtag component
080         */
081        public String hashtagPatternString() {
082                return linkHashtag;
083        }
084        /**
085         * @return the retweet component
086         */
087        public String retweetPatternString() {
088                return retweet;
089        }
090        /**
091         * @return the username component
092         */
093        public String usernamePatternString() {
094                return linkUsernames;
095        }
096}