001/**
002 * Copyright (c) 2012, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.twitter.modes.preprocessing;
031
032import java.io.IOException;
033import java.util.HashMap;
034import java.util.List;
035import java.util.Map;
036
037import org.openimaj.text.nlp.TweetTokeniser;
038import org.openimaj.text.nlp.language.LanguageDetector.WeightedLocale;
039import org.openimaj.twitter.USMFStatus;
040
041/**
042 * Use the twokeniser to tokenise tweets
043 * 
044 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
045 * 
046 */
047public class TokeniseMode extends TwitterPreprocessingMode<Map<String, List<String>>> {
048
049        final static String TOKENS = "tokens";
050        public static final String TOKENS_UNPROTECTED = "unprotected";
051        public static final String TOKENS_PROTECTED = "protected";
052        public static final String TOKENS_ALL = "all";
053        private LanguageDetectionMode langMode;
054
055        /**
056         * literally do nothing
057         */
058        public TokeniseMode() {
059                try {
060                        langMode = new LanguageDetectionMode();
061                } catch (final IOException e) {
062                        // The langauge detector was not instantiated, tokens will be of
063                        // lower quality!
064                }
065        }
066
067        @Override
068        public Map<String, List<String>> process(USMFStatus twitterStatus) {
069                TweetTokeniser tokeniser;
070                final Map<String, List<String>> tokens = new HashMap<String, List<String>>();
071                twitterStatus.addAnalysis(TOKENS, tokens);
072                try {
073                        if (langMode != null) {
074                                final Map<String, Object> localeMap = TwitterPreprocessingMode.results(twitterStatus, langMode);
075                                final WeightedLocale locale = WeightedLocale.fromMap(localeMap);
076                                if (!TweetTokeniser.isValid(locale.language)) {
077                                        return tokens;
078                                }
079                        }
080
081                        tokeniser = new TweetTokeniser(twitterStatus.text);
082                        tokens.put(TOKENS_ALL, tokeniser.getStringTokens());
083                        tokens.put(TOKENS_PROTECTED, tokeniser.getProtectedStringTokens());
084                        tokens.put(TOKENS_UNPROTECTED, tokeniser.getUnprotectedStringTokens());
085                        twitterStatus.addAnalysis(TOKENS, tokens);
086                } catch (final Exception e) {
087                }
088
089                return tokens;
090
091        }
092
093        @Override
094        public String getAnalysisKey() {
095                return TokeniseMode.TOKENS;
096        }
097}