package org.openimaj.picslurper;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.ProtocolException;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;
import org.openimaj.image.ImageUtilities;
import org.openimaj.image.MBFImage;
import org.openimaj.io.HttpUtils;
import org.openimaj.io.HttpUtils.MetaRefreshRedirectStrategy;
import org.openimaj.picslurper.output.OutputListener;
import org.openimaj.picslurper.output.WriteableImageOutput;
import org.openimaj.text.nlp.patterns.URLPatternProvider;
import org.openimaj.twitter.collection.StreamJSONStatusList.ReadableWritableJSON;
import org.openimaj.util.pair.IndependentPair;
import org.openimaj.web.scraping.SiteSpecificConsumer;
import org.openimaj.web.scraping.images.CommonHTMLConsumers;
import org.openimaj.web.scraping.images.FacebookConsumer;
import org.openimaj.web.scraping.images.ImgurConsumer;
import org.openimaj.web.scraping.images.InstagramConsumer;
import org.openimaj.web.scraping.images.OwlyImageConsumer;
import org.openimaj.web.scraping.images.TwipleConsumer;
import org.openimaj.web.scraping.images.TwitPicConsumer;
import org.openimaj.web.scraping.images.TwitterPhotoConsumer;
import org.openimaj.web.scraping.images.YfrogConsumer;

import twitter4j.Status;
import twitter4j.URLEntity;

/**
 * A status consumer knows how to consume a {@link ReadableWritableJSON} and
 * output image files. Currently this {@link StatusConsumer} only understands
 * Twitter statuses; perhaps making it abstract and turning
 * {@link #consume(Status)} into an abstract function that can deal with other
 * types of status would be sensible.
 * <p>
 * An instance keeps two pieces of mutable state: the queue of URLs still to be
 * processed and the set of every URL ever queued. Neither collection is
 * synchronised.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class StatusConsumer {

    /**
     * The logger
     */
    public static Logger logger = Logger.getLogger(StatusConsumer.class);

    /** Pattern used to find URLs in the raw status text. */
    final static Pattern urlPattern = new URLPatternProvider().pattern();

    /** Characters replaced by "_" in {@link #sanitizeFilename(String)}. */
    private static final Pattern ILLEGAL_FILENAME_CHARS = Pattern.compile("[:\\\\/*?|<>]");

    /**
     * the site specific consumers
     */
    public final static List<SiteSpecificConsumer> siteSpecific = new ArrayList<SiteSpecificConsumer>();
    static {
        StatusConsumer.siteSpecific.add(new InstagramConsumer());
        StatusConsumer.siteSpecific.add(new TwitterPhotoConsumer());
        // StatusConsumer.siteSpecific.add(new TmblrPhotoConsumer());
        StatusConsumer.siteSpecific.add(new TwitPicConsumer());
        StatusConsumer.siteSpecific.add(new ImgurConsumer());
        StatusConsumer.siteSpecific.add(new FacebookConsumer());
        StatusConsumer.siteSpecific.add(new YfrogConsumer());
        StatusConsumer.siteSpecific.add(new OwlyImageConsumer());
        StatusConsumer.siteSpecific.add(new TwipleConsumer());
        StatusConsumer.siteSpecific.add(CommonHTMLConsumers.FOTOLOG);
        StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PHOTONUI);
        StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PICS_LOCKERZ);
    }

    private boolean outputStats;
    private File globalStats;
    private File outputLocation;

    /** URLs queued for processing; drained by {@link #processAll(Status)}. */
    private final Set<String> toProcess;

    /** Every URL that has ever been queued by {@link #add(String)}. */
    private final HashSet<String> previouslySeen;

    /** Listeners told about downloads and failures; may be null. */
    private List<OutputListener> outputModes;

    /**
     * @param outputStats
     *            whether statistics should be outputted
     * @param globalStats
     *            the global statistics file
     * @param outputLocation
     *            the output location for this status
     * @param outputModes
     *            the output modes informed on image downloads
     *
     */
    public StatusConsumer(final boolean outputStats, final File globalStats, final File outputLocation,
            final List<OutputListener> outputModes)
    {
        this();
        this.outputStats = outputStats;
        this.globalStats = globalStats;
        this.outputLocation = outputLocation;
        this.outputModes = outputModes;
    }

    /**
     * for convenience. Leaves the output location and the listeners unset, so
     * {@link #resolveURL(URL, StatusConsumption)} writes nothing and returns
     * null.
     */
    public StatusConsumer() {
        this.previouslySeen = new HashSet<String>();
        this.toProcess = new HashSet<String>();
    }

    /**
     * Simple holder for a list of strings. Not referenced anywhere else in
     * this class.
     */
    class LoggingStatus {
        List<String> strings = new ArrayList<String>();
    }

    /**
     * Collect the URLs of a status (from its URL entities and from its raw
     * text), process them all and, if enabled, update the global statistics.
     *
     * @param status
     * @return the statistics of the consumption
     * @throws Exception
     */
    public StatusConsumption consume(final Status status) throws Exception {
        // First add all the entries from entities.urls, preferring the
        // expanded form of each URL
        if (status.getURLEntities() != null) {
            for (final URLEntity entity : status.getURLEntities()) {
                String entityUrl = entity.getExpandedURL();
                if (entityUrl == null) {
                    entityUrl = entity.getURL();
                }
                if (entityUrl != null) {
                    this.add(entityUrl);
                }
            }
        }

        // Find the URLs in the raw text
        final String text = status.getText();
        if (text != null) { // why was text null?
            final Matcher matcher = StatusConsumer.urlPattern.matcher(text);
            while (matcher.find()) {
                this.add(matcher.group());
            }
        }

        // now go through all the links and process them (i.e. download them)
        final StatusConsumption cons = this.processAll(status);

        if (this.outputStats) {
            PicSlurperUtils.updateStats(this.globalStats, cons, true);
        }
        return cons;
    }

    /**
     * Process all added URLs. The queue may grow while it is being drained
     * because {@link #urlToImage(URL)} queues redirects and site specific
     * links. Strings that are not valid URLs are counted in
     * {@link StatusConsumption#nURLs}, logged and skipped, so one bad link
     * neither aborts the status nor leaves the rest of the queue behind for
     * the next status.
     *
     * @param status
     * @return the {@link StatusConsumption} statistics
     * @throws IOException
     */
    public StatusConsumption processAll(final Status status) throws IOException {
        final StatusConsumption cons = new StatusConsumption();
        cons.nTweets = 1;
        cons.nURLs = 0;
        while (!this.toProcess.isEmpty()) {
            // A fresh iterator is needed each time round: resolving a URL may
            // add new entries to the queue
            final String urlString = this.toProcess.iterator().next();
            this.toProcess.remove(urlString);
            cons.nURLs++;

            final URL url;
            try {
                url = new URL(urlString);
            } catch (final MalformedURLException e) {
                StatusConsumer.logger.debug("Skipping malformed URL: " + urlString + " (" + e.getMessage() + ")");
                continue;
            }

            final File urlOut = this.resolveURL(url, cons);
            if (urlOut == null) {
                continue;
            }

            final File outStats = new File(urlOut, "status.txt");
            PicSlurperUtils.updateStats(outStats, cons);
            PicSlurperUtils.updateTweets(urlOut, status);
            // the listeners are optional, exactly as in reportFailedURL
            if (this.outputModes != null) {
                for (final OutputListener outputMode : this.outputModes) {
                    outputMode.newImageDownloaded(new WriteableImageOutput(status, url, urlOut, cons));
                }
            }
        }
        return cons;
    }

    /**
     * Add a URL to process without allowing already seen URLs to be added. A
     * URL counts as already seen when it is a prefix of a previously seen URL
     * or a previously seen URL is a prefix of it (which includes equality).
     *
     * @param newURL
     */
    public void add(final String newURL) {
        boolean alreadySeen = false;
        for (final String seen : this.previouslySeen) {
            if (seen.startsWith(newURL) || newURL.startsWith(seen)) {
                alreadySeen = true;
                break;
            }
        }

        if (alreadySeen) {
            StatusConsumer.logger.debug("URL not added, already exists: " + newURL);
            return;
        }
        StatusConsumer.logger.debug("New URL added to list: " + newURL);
        this.toProcess.add(newURL);
        this.previouslySeen.add(newURL);
    }

    /**
     * Given a URL, use {@link #urlToImage(URL)} to turn the url into a list of
     * images and write the images into the output location using the names
     * "image_N.png", or "image_N.gif" for entries that carry no decoded image.
     *
     * @param url
     * @param cons
     *            the consumption stats
     * @return the root output location, or null if the URL produced no images,
     *         no output location is configured or writing failed
     */
    public File resolveURL(final URL url, final StatusConsumption cons) {
        // This must run even when there is no output location: it is also
        // what queues redirects and site specific links and reports failures
        final List<IndependentPair<URL, MBFImage>> images = this.urlToImage(url);
        if (images == null || this.outputLocation == null) {
            return null;
        }

        try {
            final File outputDir = StatusConsumer.urlToOutput(url, this.outputLocation);
            // NOTE(review): processAll already sets nTweets to 1 for the
            // status, so this counts per resolved URL - confirm it is intended
            cons.nTweets++;
            int n = 0;
            for (final IndependentPair<URL, MBFImage> entry : images) {
                final URL urlReadFrom = entry.firstObject();
                final MBFImage imageToWrite = entry.secondObject();
                if (imageToWrite == null) {
                    StatusConsumer.logger.debug("Downloading a raw GIF");
                    // For now a null image is the signal that we have a GIF:
                    // fetch the bytes again and write them out untouched
                    final File outImage = new File(outputDir, String.format("image_%d.gif", n++));
                    final byte[] value = HttpUtils.readURLAsBytes(urlReadFrom, false);
                    FileUtils.writeByteArrayToFile(outImage, value);
                } else {
                    StatusConsumer.logger.debug("Downloading a normal image");
                    final File outImage = new File(outputDir, String.format("image_%d.png", n++));
                    ImageUtilities.write(imageToWrite, outImage);
                }
                cons.nImages++;
                cons.imageURLs.add(urlReadFrom);
            }
            return outputDir;
        } catch (final IOException e) {
            StatusConsumer.logger.error("Failed to write images for URL: " + url, e);
            return null;
        }
    }

    /**
     * An extension of the {@link MetaRefreshRedirectStrategy} which disallows
     * all redirects and instead remembers a redirect for use later on.
     *
     * @author Sina Samangooei (ss@ecs.soton.ac.uk)
     *
     */
    public static class StatusConsumerRedirectStrategy extends MetaRefreshRedirectStrategy {
        private boolean wasRedirected = false;
        private URL redirection;

        /**
         * Records whether the superclass detected a redirect and, if so, where
         * to, but always answers false so that the redirect is never followed
         * automatically.
         */
        @Override
        public boolean isRedirected(final HttpRequest request, final HttpResponse response, final HttpContext context)
                throws ProtocolException
        {
            // forget anything remembered from an earlier call so that a stale
            // target can never be reported
            this.wasRedirected = false;
            this.redirection = null;

            if (super.isRedirected(request, response, context)) {
                try {
                    this.redirection = this.getRedirect(request, response, context).getURI().toURL();
                    this.wasRedirected = true;
                } catch (final MalformedURLException e) {
                    // an unusable target is treated as "no redirect"
                    this.wasRedirected = false;
                }
            }
            return false;
        }

        /**
         * @return whether a redirect was found
         */
        public boolean wasRedirected() {
            return this.wasRedirected;
        }

        /**
         * @return the redirection, or null if none was found
         */
        public URL redirection() {
            return this.redirection;
        }
    }

    /**
     * First, try all the {@link SiteSpecificConsumer} instances loaded into
     * {@link #siteSpecific}. If any consumer takes control of a link, the URLs
     * it returns are queued with {@link #add(String)} and null is returned.
     *
     * if this fails use
     * {@link HttpUtils#readURLAsByteArrayInputStream(URL, org.apache.http.client.RedirectStrategy)}
     * with a {@link StatusConsumerRedirectStrategy} which specifically
     * disallows redirects to be dealt with automatically; an intercepted
     * redirect is queued instead, so that this function is called again for
     * each redirect.
     *
     * @param url
     * @return a list of images or null. A pair holding a null image means the
     *         content type mentioned "gif" and the raw bytes should be
     *         downloaded from the pair's URL
     */
    public List<IndependentPair<URL, MBFImage>> urlToImage(final URL url) {
        StatusConsumer.logger.debug("Resolving URL: " + url);
        StatusConsumer.logger.debug("Attempting site specific consumers");
        for (final SiteSpecificConsumer consumer : StatusConsumer.siteSpecific) {
            if (!consumer.canConsume(url)) {
                continue;
            }
            StatusConsumer.logger.debug("Site specific consumer: " + consumer.getClass().getName()
                    + " working on link");
            final List<URL> urlList = consumer.consume(url);
            if (urlList != null && !urlList.isEmpty()) {
                StatusConsumer.logger.debug("Site specific consumer returned non-null, adding the URLs");
                for (final URL found : urlList) {
                    this.add(found.toString());
                }
                // nothing to return yet: the queued URLs are resolved later
                return null;
            }
        }

        try {
            StatusConsumer.logger.debug("Site specific consumers failed, trying the raw link");
            final StatusConsumerRedirectStrategy redirector = new StatusConsumerRedirectStrategy();
            final IndependentPair<HttpEntity, ByteArrayInputStream> headersBais = HttpUtils
                    .readURLAsByteArrayInputStream(url, 1000, 1000, redirector, HttpUtils.DEFAULT_USERAGENT);
            if (redirector.wasRedirected()) {
                StatusConsumer.logger.debug("Redirect intercepted, adding redirection to list");
                final String redirect = redirector.redirection().toString();
                if (!redirect.equals(url.toString())) {
                    this.add(redirect);
                }
                return null;
            }

            final HttpEntity headers = headersBais.firstObject();
            final ByteArrayInputStream bais = headersBais.getSecondObject();
            if (headers.getContentType() == null) {
                // without a content type there is no way to pick a branch below
                this.reportFailedURL(url, "no content type");
                return null;
            }
            final String typeValue = headers.getContentType().getValue();
            if (typeValue.contains("text")) {
                this.reportFailedURL(url, "text content");
                return null;
            }

            // Not text? try reading it as an image! A gif is not decoded
            // here: it is signalled by a null image with a non-null URL
            MBFImage readMBF = null;
            if (!typeValue.contains("gif")) {
                readMBF = ImageUtilities.readMBF(bais);
            }
            final IndependentPair<URL, MBFImage> pair = IndependentPair.pair(url, readMBF);
            final List<IndependentPair<URL, MBFImage>> image = Arrays.asList(pair);
            StatusConsumer.logger.debug("Link resolved, returning image.");
            return image;
        } catch (final Throwable e) {
            // This input might not be an image! Deliberately broad: any
            // failure is reported to the listeners rather than propagated
            final String reason = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
            this.reportFailedURL(url, reason);
            return null;
        }
    }

    /**
     * Tell every registered listener, if there are any, that a URL failed.
     *
     * @param url
     *            the URL that could not be turned into an image
     * @param reason
     *            a short description of the failure
     */
    private void reportFailedURL(final URL url, final String reason) {
        if (this.outputModes == null) {
            return;
        }
        for (final OutputListener listener : this.outputModes) {
            listener.failedURL(url, reason);
        }
    }

    /**
     * Construct a directory in the output location for a given url, creating
     * it if it does not exist yet. An existing non-directory file at that path
     * is deleted and replaced by a directory.
     *
     * @param url
     * @param outputLocation
     * @return a directory that looks like:
     *         outputLocation/protocol/host/path/query/ where path and query
     *         are passed through {@link #sanitizeFilename(String)}
     * @throws IOException
     *             if the directory could not be created
     */
    public static synchronized File urlToOutput(final URL url, final File outputLocation) throws IOException {
        String urlPath = url.getProtocol() + File.separator + url.getHost() + File.separator;
        if (!url.getPath().equals("")) {
            urlPath += StatusConsumer.sanitizeFilename(url.getPath()) + File.separator;
        }
        if (url.getQuery() != null) {
            urlPath += StatusConsumer.sanitizeFilename(url.getQuery()) + File.separator;
        }

        final File outFile = new File(outputLocation.getAbsolutePath() + File.separator + urlPath);
        // isDirectory() is false both for a missing path and for a plain file;
        // either way the directory has to be (re)created
        if (!outFile.isDirectory()) {
            StatusConsumer.createURLOutDir(outFile);
        }
        return outFile;
    }

    /**
     * Replaces illegal characters in a filename with "_". The illegal
     * characters are: {@code : \ / * ? | < >}
     *
     * @param name
     * @return Sanitised filename
     */
    public static String sanitizeFilename(final String name) {
        return StatusConsumer.ILLEGAL_FILENAME_CHARS.matcher(name).replaceAll("_");
    }

    /**
     * Create the given directory (and any missing parents), first deleting
     * whatever already exists at that path.
     *
     * @param outFile
     *            the directory to create
     * @throws IOException
     *             if the existing file could not be deleted or the directory
     *             could not be created
     */
    static void createURLOutDir(final File outFile) throws IOException {
        final boolean cleared = !outFile.exists() || outFile.delete();
        if (!cleared || !outFile.mkdirs()) {
            throw new IOException("Couldn't create URL output: " + outFile.getAbsolutePath());
        }
    }

}