001/**
002 * Copyright (c) 2012, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.twitter.options;
031
032import java.util.ArrayList;
033import java.util.List;
034
035import org.kohsuke.args4j.CmdLineException;
036import org.kohsuke.args4j.CmdLineParser;
037import org.kohsuke.args4j.Option;
038import org.kohsuke.args4j.ProxyOptionHandler;
039import org.openimaj.tools.InOutToolOptions;
040import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingFilterOption;
041import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingPredicate;
042import org.openimaj.tools.twitter.modes.output.TwitterOutputMode;
043import org.openimaj.tools.twitter.modes.output.TwitterOutputModeOption;
044import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingMode;
045import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingModeOption;
046import org.openimaj.twitter.GeneralJSON;
047import org.openimaj.twitter.GeneralJSONRDF;
048import org.openimaj.twitter.USMFStatus;
049import org.openimaj.twitter.collection.TwitterStatusListUtils;
050
051/**
052 * An abstract kind of twitter processing tool. Contains all the options generic
053 * to this kind of tool, not dependant on files or hadoop or whatever.
054 *
055 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
056 *
057 */
058public abstract class AbstractTwitterPreprocessingToolOptions extends InOutToolOptions {
059
060        @Option(
061                        name = "--mode",
062                        aliases = "-m",
063                        required = false,
064                        usage = "How should the tweets be processed.",
065                        handler = ProxyOptionHandler.class,
066                        multiValued = true)
067        List<TwitterPreprocessingModeOption> modeOptions = new ArrayList<TwitterPreprocessingModeOption>();
068        /**
069         * The preprocessing to perform
070         */
071        public List<TwitterPreprocessingMode<?>> modeOptionsOp = new ArrayList<TwitterPreprocessingMode<?>>();
072
073        @Option(
074                        name = "--pre-filter",
075                        aliases = "-prf",
076                        required = false,
077                        usage = "Define filters. Applied before other processing.",
078                        handler = ProxyOptionHandler.class,
079                        multiValued = true)
080        List<TwitterPreprocessingFilterOption> preFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
081        /**
082         * The prefiltering to perform
083         */
084        public List<TwitterPreprocessingPredicate> preFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();
085
086        @Option(
087                        name = "--post-filter",
088                        aliases = "-pof",
089                        required = false,
090                        usage = "Define filters. Applied after other processing",
091                        handler = ProxyOptionHandler.class,
092                        multiValued = true)
093        List<TwitterPreprocessingFilterOption> postFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
094        /**
095         * the postfiltering to perform
096         */
097        public List<TwitterPreprocessingPredicate> postFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();
098        //
099        @Option(
100                        name = "--encoding",
101                        aliases = "-e",
102                        required = false,
103                        usage = "The outputstreamwriter's text encoding",
104                        metaVar = "STRING")
105        String encoding = "UTF-8";
106
107        @Option(
108                        name = "--output-mode",
109                        aliases = "-om",
110                        required = false,
111                        usage = "How should the analysis be outputed.",
112                        handler = ProxyOptionHandler.class)
113        TwitterOutputModeOption outputModeOption = TwitterOutputModeOption.APPEND;
114        TwitterOutputMode outputModeOptionOp = TwitterOutputModeOption.APPEND.getOptions();
115
116        @Option(
117                        name = "--n-tweets",
118                        aliases = "-n",
119                        required = false,
120                        usage = "How many tweets from the input should this be applied to.",
121                        handler = ProxyOptionHandler.class)
122        int nTweets = -1;
123
124        @Option(name = "--quiet", aliases = "-q", required = false, usage = "Control the progress messages.")
125        boolean quiet = false;
126
127        @Option(name = "--verbose", aliases = "-v", required = false, usage = "Be very loud (overrides queit)")
128        boolean veryLoud = false;
129
130        @Option(
131                        name = "--time-before-skip",
132                        aliases = "-t",
133                        required = false,
134                        usage = "Time to wait before skipping an entry")
135        long timeBeforeSkip = 0;
136
137        /**
138         * the status type to take as input
139         */
140        @Option(
141                        name = "--input-type",
142                        aliases = "-it",
143                        required = false,
144                        usage = "The type of social media message being consumed")
145        public StatusType statusType = StatusType.TWITTER;
146
147        /**
148         * the status type to output
149         */
150        @Option(name = "--output-type", aliases = "-ot", required = false, usage = "How to output, defaults to USMF")
151        public StatusType outputStatusType = StatusType.USMF;
152
153        private String[] args;
154
155        /**
156         * @param args
157         *            the arguments, prepared using the prepare method
158         * @param prepare
159         *            whether prepare should be called now or later
160         * @throws CmdLineException
161         */
162        public AbstractTwitterPreprocessingToolOptions(String[] args, boolean prepare) throws CmdLineException {
163                this.args = args;
164                if (prepare)
165                        this.prepare();
166        }
167
168        /**
169         * @param args
170         *            the arguments, prepared using the prepare method
171         * @throws CmdLineException
172         */
173        public AbstractTwitterPreprocessingToolOptions(String[] args) throws CmdLineException {
174                this(args, true);
175        }
176
177        /**
178         * prepare the tool for running
179         *
180         * @throws CmdLineException
181         */
182        public void prepare() throws CmdLineException {
183                final CmdLineParser parser = new CmdLineParser(this);
184                try {
185                        if (veryLoud && quiet) {
186                                quiet = false;
187                                veryLoud = true;
188                        }
189                        parser.parseArgument(args);
190                        InOutToolOptions.prepareMultivaluedArgument(modeOptions);
191                        validateFilters();
192                        registerRDFAnalysis();
193                        this.validate();
194                } catch (final CmdLineException e) {
195                        throw e;
196                }
197
198        }
199
200        private void registerRDFAnalysis() {
201                if (this.outputStatusType == StatusType.RDF) {
202                        for (final TwitterPreprocessingMode<?> modes : this.modeOptionsOp) {
203                                GeneralJSONRDF.registerRDFAnalysisProvider(modes.getAnalysisKey(), modes.rdfAnalysisProvider());
204                        }
205                }
206        }
207
208        private void validateFilters() {
209                for (final TwitterPreprocessingPredicate filter : this.postFilterOptionsOp) {
210                        filter.validate();
211                }
212                for (final TwitterPreprocessingPredicate filter : this.preFilterOptionsOp) {
213                        filter.validate();
214                }
215        }
216
217        private String getExtractUsageInfo() {
218                return "Preprocess tweets for bag of words analysis";
219        }
220
221        /**
222         * @return an instance of the selected preprocessing mode
223         * @throws Exception
224         */
225        public List<TwitterPreprocessingMode<?>> preprocessingMode() throws Exception {
226                if (veryLoud) {
227                        System.out.println("Creating preprocessing modes");
228                }
229                final ArrayList<TwitterPreprocessingMode<?>> modes = new ArrayList<TwitterPreprocessingMode<?>>();
230                for (final TwitterPreprocessingModeOption modeOpt : this.modeOptions) {
231                        modes.add(modeOpt.getOptions());
232                }
233                return modes;
234        }
235
236        /**
237         * @return an instance of the selected output mode
238         */
239        public TwitterOutputMode ouputMode() {
240                outputModeOptionOp.validate(this);
241                return outputModeOptionOp;
242        }
243
244        /**
245         * @return whether the options provided make sense
246         * @throws CmdLineException
247         */
248        public abstract boolean validate() throws CmdLineException;
249
250        /**
251         * @param string
252         *            print progress if we are not being quiet
253         */
254        public void progress(String string) {
255                if (!quiet) {
256                        System.out.print(string);
257                }
258        }
259
260        /**
261         * @return print some extra information
262         */
263        public boolean veryLoud() {
264                return this.veryLoud;
265        }
266
267        /**
268         * @return the time to wait while analysing a tweet before it is skipped
269         *         over
270         */
271        public long getTimeBeforeSkip() {
272                return this.timeBeforeSkip;
273        }
274
275        /**
276         * @return the encoding
277         */
278        public String getEncoding() {
279                return encoding;
280        }
281
282        /**
283         * Check the internal preprocessing filters and say whether a given status
284         * should be skipped
285         *
286         * @param twitterStatus
287         * @return whether to skip a status
288         */
289        public boolean preProcessesSkip(USMFStatus twitterStatus) {
290                boolean skip = false;
291                for (final TwitterPreprocessingPredicate f : preFilterOptionsOp) {
292                        skip = !f.test(twitterStatus);
293                        if (skip)
294                                break;
295                }
296                return skip;
297        }
298
299        /**
300         * Check the internal postprocessing filters and say whether a given status
301         * should be skipped
302         *
303         * @param twitterStatus
304         * @return whether to skip a status
305         */
306        public boolean postProcessesSkip(USMFStatus twitterStatus) {
307                boolean skip = false;
308                for (final TwitterPreprocessingPredicate f : postFilterOptionsOp) {
309                        skip = !f.test(twitterStatus);
310                        if (skip)
311                                break;
312                }
313                return skip;
314        }
315
316        /**
317         * Provides the functionality to convert to the required output format as
318         * specified by -ot
319         *
320         * @param twitterStatus
321         * @return the converted output
322         */
323        public GeneralJSON convertToOutputFormat(USMFStatus twitterStatus) {
324                final GeneralJSON outInstance = TwitterStatusListUtils.newInstance(this.outputStatusType.type());
325                outInstance.fromUSMF(twitterStatus);
326                return outInstance;
327        }
328
329        /**
330         * @return the input status type
331         */
332        public StatusType getInputClass() {
333                return this.statusType;
334        }
335
336        /**
337         * @return the input status type
338         */
339        public StatusType getOutputClass() {
340                return this.outputStatusType;
341        }
342}