/**
 * Copyright (c) 2012, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.tools.twitter.options;

import java.util.ArrayList;
import java.util.List;

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ProxyOptionHandler;
import org.openimaj.tools.InOutToolOptions;
import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingFilterOption;
import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingPredicate;
import org.openimaj.tools.twitter.modes.output.TwitterOutputMode;
import org.openimaj.tools.twitter.modes.output.TwitterOutputModeOption;
import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingMode;
import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingModeOption;
import org.openimaj.twitter.GeneralJSON;
import org.openimaj.twitter.GeneralJSONRDF;
import org.openimaj.twitter.USMFStatus;
import org.openimaj.twitter.collection.TwitterStatusListUtils;

/**
 * An abstract kind of twitter processing tool. Contains all the options generic
 * to this kind of tool, not dependent on files or hadoop or whatever.
 * <p>
 * The command line is parsed in {@link #prepare()}, which is invoked from the
 * constructor unless the caller asks to defer it. Because {@link #prepare()}
 * ends by calling the abstract {@link #validate()}, a subclass whose
 * {@code validate()} relies on its own fields should construct with
 * {@code prepare == false} and call {@link #prepare()} itself once it is fully
 * initialised.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public abstract class AbstractTwitterPreprocessingToolOptions extends InOutToolOptions {

	@Option(
			name = "--mode",
			aliases = "-m",
			required = false,
			usage = "How should the tweets be processed.",
			handler = ProxyOptionHandler.class,
			multiValued = true)
	List<TwitterPreprocessingModeOption> modeOptions = new ArrayList<TwitterPreprocessingModeOption>();
	/**
	 * The preprocessing to perform
	 */
	// NOTE(review): never assigned in this class; presumably filled in by
	// ProxyOptionHandler alongside modeOptions -- confirm.
	public List<TwitterPreprocessingMode<?>> modeOptionsOp = new ArrayList<TwitterPreprocessingMode<?>>();

	@Option(
			name = "--pre-filter",
			aliases = "-prf",
			required = false,
			usage = "Define filters. Applied before other processing.",
			handler = ProxyOptionHandler.class,
			multiValued = true)
	List<TwitterPreprocessingFilterOption> preFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
	/**
	 * The prefiltering to perform
	 */
	public List<TwitterPreprocessingPredicate> preFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();

	@Option(
			name = "--post-filter",
			aliases = "-pof",
			required = false,
			usage = "Define filters. Applied after other processing",
			handler = ProxyOptionHandler.class,
			multiValued = true)
	List<TwitterPreprocessingFilterOption> postFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
	/**
	 * the postfiltering to perform
	 */
	public List<TwitterPreprocessingPredicate> postFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();

	@Option(
			name = "--encoding",
			aliases = "-e",
			required = false,
			usage = "The outputstreamwriter's text encoding",
			metaVar = "STRING")
	String encoding = "UTF-8";

	@Option(
			name = "--output-mode",
			aliases = "-om",
			required = false,
			usage = "How should the analysis be outputed.",
			handler = ProxyOptionHandler.class)
	TwitterOutputModeOption outputModeOption = TwitterOutputModeOption.APPEND;
	TwitterOutputMode outputModeOptionOp = TwitterOutputModeOption.APPEND.getOptions();

	@Option(
			name = "--n-tweets",
			aliases = "-n",
			required = false,
			usage = "How many tweets from the input should this be applied to.",
			handler = ProxyOptionHandler.class)
	int nTweets = -1;

	@Option(name = "--quiet", aliases = "-q", required = false, usage = "Control the progress messages.")
	boolean quiet = false;

	@Option(name = "--verbose", aliases = "-v", required = false, usage = "Be very loud (overrides queit)")
	boolean veryLoud = false;

	@Option(
			name = "--time-before-skip",
			aliases = "-t",
			required = false,
			usage = "Time to wait before skipping an entry")
	long timeBeforeSkip = 0;

	/**
	 * the status type to take as input
	 */
	@Option(
			name = "--input-type",
			aliases = "-it",
			required = false,
			usage = "The type of social media message being consumed")
	public StatusType statusType = StatusType.TWITTER;

	/**
	 * the status type to output
	 */
	@Option(name = "--output-type", aliases = "-ot", required = false, usage = "How to output, defaults to USMF")
	public StatusType outputStatusType = StatusType.USMF;

	/** The raw command line, kept so that {@link #prepare()} can be deferred. */
	private String[] args;

	/**
	 * @param args
	 *            the raw command line arguments; they are parsed by
	 *            {@link #prepare()}
	 * @param prepare
	 *            whether prepare should be called now or later
	 * @throws CmdLineException
	 *             if {@code prepare} is true and {@link #prepare()} fails
	 */
	public AbstractTwitterPreprocessingToolOptions(String[] args, boolean prepare) throws CmdLineException {
		this.args = args;
		if (prepare) {
			this.prepare();
		}
	}

	/**
	 * Construct and immediately {@link #prepare()} the options.
	 *
	 * @param args
	 *            the raw command line arguments
	 * @throws CmdLineException
	 *             if {@link #prepare()} fails
	 */
	public AbstractTwitterPreprocessingToolOptions(String[] args) throws CmdLineException {
		this(args, true);
	}

	/**
	 * prepare the tool for running: parse the stored arguments into the
	 * annotated fields, resolve the quiet/verbose conflict, then validate the
	 * filters, register RDF analysis providers and finally call
	 * {@link #validate()}.
	 *
	 * @throws CmdLineException
	 *             if the arguments cannot be parsed or {@link #validate()}
	 *             rejects them
	 */
	public void prepare() throws CmdLineException {
		final CmdLineParser parser = new CmdLineParser(this);
		parser.parseArgument(args);

		// This check has to come after parsing: before it, both flags still hold
		// their default (false) values and the override could never trigger.
		if (veryLoud && quiet) {
			quiet = false;
		}

		InOutToolOptions.prepareMultivaluedArgument(modeOptions);
		validateFilters();
		registerRDFAnalysis();
		this.validate();
	}

	/**
	 * When the output type is RDF, register the analysis provider of every
	 * entry in {@link #modeOptionsOp} with {@link GeneralJSONRDF} under that
	 * mode's analysis key.
	 */
	private void registerRDFAnalysis() {
		if (this.outputStatusType != StatusType.RDF) {
			return;
		}
		for (final TwitterPreprocessingMode<?> mode : this.modeOptionsOp) {
			GeneralJSONRDF.registerRDFAnalysisProvider(mode.getAnalysisKey(), mode.rdfAnalysisProvider());
		}
	}

	/**
	 * Call {@code validate()} on every configured filter, post-filters first
	 * and then pre-filters.
	 */
	private void validateFilters() {
		for (final TwitterPreprocessingPredicate filter : this.postFilterOptionsOp) {
			filter.validate();
		}
		for (final TwitterPreprocessingPredicate filter : this.preFilterOptionsOp) {
			filter.validate();
		}
	}

	// NOTE(review): not referenced anywhere in this class; kept in case it is
	// still wanted for usage output.
	private String getExtractUsageInfo() {
		return "Preprocess tweets for bag of words analysis";
	}

	/**
	 * Build a fresh list holding the result of {@code getOptions()} for each
	 * parsed {@code --mode} option. Note that this does not return
	 * {@link #modeOptionsOp}.
	 *
	 * @return an instance of the selected preprocessing mode, one per parsed
	 *         mode option
	 * @throws Exception
	 */
	public List<TwitterPreprocessingMode<?>> preprocessingMode() throws Exception {
		if (veryLoud) {
			System.out.println("Creating preprocessing modes");
		}
		final ArrayList<TwitterPreprocessingMode<?>> modes = new ArrayList<TwitterPreprocessingMode<?>>();
		for (final TwitterPreprocessingModeOption modeOpt : this.modeOptions) {
			modes.add(modeOpt.getOptions());
		}
		return modes;
	}

	/**
	 * Validate the output mode against these options and return it. (The
	 * method name is misspelt but is part of the public interface.)
	 *
	 * @return an instance of the selected output mode
	 */
	public TwitterOutputMode ouputMode() {
		outputModeOptionOp.validate(this);
		return outputModeOptionOp;
	}

	/**
	 * Called as the last step of {@link #prepare()}.
	 *
	 * @return whether the options provided make sense
	 * @throws CmdLineException
	 */
	public abstract boolean validate() throws CmdLineException;

	/**
	 * @param string
	 *            print progress if we are not being quiet; written to standard
	 *            output without a trailing newline
	 */
	public void progress(String string) {
		if (!quiet) {
			System.out.print(string);
		}
	}

	/**
	 * @return print some extra information
	 */
	public boolean veryLoud() {
		return this.veryLoud;
	}

	/**
	 * @return the time to wait while analysing a tweet before it is skipped
	 *         over
	 */
	public long getTimeBeforeSkip() {
		return this.timeBeforeSkip;
	}

	/**
	 * @return the encoding
	 */
	public String getEncoding() {
		return encoding;
	}

	/**
	 * Check the internal preprocessing filters and say whether a given status
	 * should be skipped
	 *
	 * @param twitterStatus
	 *            the status to test
	 * @return whether to skip a status: true as soon as one pre-filter's test
	 *         fails, false if all pass or there are no pre-filters
	 */
	public boolean preProcessesSkip(USMFStatus twitterStatus) {
		return rejectedByAny(preFilterOptionsOp, twitterStatus);
	}

	/**
	 * Check the internal postprocessing filters and say whether a given status
	 * should be skipped
	 *
	 * @param twitterStatus
	 *            the status to test
	 * @return whether to skip a status: true as soon as one post-filter's test
	 *         fails, false if all pass or there are no post-filters
	 */
	public boolean postProcessesSkip(USMFStatus twitterStatus) {
		return rejectedByAny(postFilterOptionsOp, twitterStatus);
	}

	/**
	 * Shared implementation of the pre/post skip checks: stops at the first
	 * filter whose test fails.
	 *
	 * @param filters
	 *            the filters to apply, in order
	 * @param status
	 *            the status to test
	 * @return true if some filter's test returned false for the status
	 */
	private static boolean rejectedByAny(List<TwitterPreprocessingPredicate> filters, USMFStatus status) {
		for (final TwitterPreprocessingPredicate filter : filters) {
			if (!filter.test(status)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Provides the functionality to convert to the required output format as
	 * specified by -ot
	 *
	 * @param twitterStatus
	 *            the status to convert
	 * @return the converted output: a new instance of the output status type,
	 *         filled from the given USMF status
	 */
	public GeneralJSON convertToOutputFormat(USMFStatus twitterStatus) {
		final GeneralJSON outInstance = TwitterStatusListUtils.newInstance(this.outputStatusType.type());
		outInstance.fromUSMF(twitterStatus);
		return outInstance;
	}

	/**
	 * @return the input status type
	 */
	public StatusType getInputClass() {
		return this.statusType;
	}

	/**
	 * @return the output status type
	 */
	public StatusType getOutputClass() {
		return this.outputStatusType;
	}
}