001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.stream.functions; 031 032import java.io.ByteArrayInputStream; 033import java.net.MalformedURLException; 034import java.net.URL; 035import java.util.Arrays; 036import java.util.List; 037 038import org.apache.http.HttpEntity; 039import org.apache.http.HttpRequest; 040import org.apache.http.HttpResponse; 041import org.apache.http.ProtocolException; 042import org.apache.http.protocol.HttpContext; 043import org.apache.log4j.Logger; 044import org.openimaj.image.ImageUtilities; 045import org.openimaj.io.HttpUtils; 046import org.openimaj.io.HttpUtils.MetaRefreshRedirectStrategy; 047import org.openimaj.util.pair.IndependentPair; 048import org.openimaj.web.scraping.SiteSpecificConsumer; 049import org.openimaj.web.scraping.images.CommonHTMLConsumers; 050import org.openimaj.web.scraping.images.FacebookConsumer; 051import org.openimaj.web.scraping.images.ImgurConsumer; 052import org.openimaj.web.scraping.images.InstagramConsumer; 053import org.openimaj.web.scraping.images.OwlyImageConsumer; 054import org.openimaj.web.scraping.images.TmblrPhotoConsumer; 055import org.openimaj.web.scraping.images.TwipleConsumer; 056import org.openimaj.web.scraping.images.TwitPicConsumer; 057import org.openimaj.web.scraping.images.TwitterPhotoConsumer; 058import org.openimaj.web.scraping.images.YfrogConsumer; 059 060import com.google.common.collect.Lists; 061 062/** 063 * This class implements a function that will given an input URL outputs a list 064 * of URLs to the possible images related to the input URL. This works by using 065 * a set of {@link SiteSpecificConsumer}s for common image hosting sites to 066 * determine if the input URL is likely to lead to an image of images. 067 * <p> 068 * Currently, the following consumers are included: 069 * <ul> 070 * <li> {@link InstagramConsumer} 071 * <li> {@link TwitterPhotoConsumer} 072 * <li> {@link TmblrPhotoConsumer} 073 * <li> {@link TwitPicConsumer} 074 * <li> {@link ImgurConsumer} 075 * <li> {@link FacebookConsumer} 076 * <li> {@link YfrogConsumer} 077 * <li> {@link OwlyImageConsumer} 078 * <li> {@link TwipleConsumer} 079 * <li> {@link CommonHTMLConsumers#FOTOLOG} 080 * <li> {@link CommonHTMLConsumers#PHOTONUI} 081 * <li> {@link CommonHTMLConsumers#PICS_LOCKERZ} 082 * </ul> 083 * 084 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 085 */ 086public class ImageSiteURLExtractor extends SiteSpecificURLExtractor { 087 private static final Logger logger = Logger.getLogger(ImageSiteURLExtractor.class); 088 089 private boolean fallback = false; 090 091 /** 092 * Construct with or without Tumblr support 093 * 094 * @param tumblr 095 * true if tumblr is required. 096 * @param fallback 097 * true if should try to download directly 098 */ 099 public ImageSiteURLExtractor(boolean tumblr, boolean fallback) { 100 this(tumblr); 101 this.fallback = fallback; 102 } 103 104 /** 105 * Construct with or without Tumblr support 106 * 107 * @param tumblr 108 * true if tumblr is required. 109 */ 110 public ImageSiteURLExtractor(boolean tumblr) { 111 super(); 112 113 siteSpecific.addAll(Arrays.asList( 114 new TwitterPhotoConsumer(), 115 new InstagramConsumer(), 116 new TwitPicConsumer(), 117 new ImgurConsumer(), 118 new FacebookConsumer(), 119 new YfrogConsumer(), 120 new OwlyImageConsumer(), 121 new TwipleConsumer(), 122 CommonHTMLConsumers.FOTOLOG, 123 CommonHTMLConsumers.PHOTONUI, 124 CommonHTMLConsumers.PICS_LOCKERZ)); 125 126 if (tumblr) 127 siteSpecific.add(new TmblrPhotoConsumer()); 128 } 129 130 /** 131 * Default constructor; includes tumblr support. 132 */ 133 public ImageSiteURLExtractor() { 134 this(true); 135 } 136 137 /** 138 * An extension of the {@link MetaRefreshRedirectStrategy} which disallows 139 * all redirects and instead remembers a redirect for use later on. 140 * 141 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 142 */ 143 private static class StatusConsumerRedirectStrategy extends MetaRefreshRedirectStrategy { 144 private boolean wasRedirected = false; 145 private URL redirection; 146 147 @Override 148 public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context) 149 throws ProtocolException 150 { 151 wasRedirected = super.isRedirected(request, response, context); 152 153 if (wasRedirected) { 154 try { 155 this.redirection = this.getRedirect(request, response, context).getURI().toURL(); 156 } catch (final MalformedURLException e) { 157 this.wasRedirected = false; 158 } 159 } 160 return false; 161 } 162 163 /** 164 * @return whether a redirect was found 165 */ 166 public boolean wasRedirected() { 167 return wasRedirected; 168 } 169 170 /** 171 * @return the redirection 172 */ 173 public URL redirection() { 174 return redirection; 175 } 176 } 177 178 /** 179 * First, try all the {@link SiteSpecificConsumer} instances loaded into 180 * {@link #siteSpecific}. If any consumer takes control of a link the 181 * consumer's output is used 182 * 183 * if this fails use 184 * {@link HttpUtils#readURLAsByteArrayInputStream(URL, org.apache.http.client.RedirectStrategy)} 185 * with a {@link StatusConsumerRedirectStrategy} which specifically 186 * disallows redirects to be dealt with automatically and forces this 187 * function to be called for each redirect. 188 * 189 * @param url 190 * @return a list of images or null 191 */ 192 @Override 193 protected List<URL> processURLs(URL url) { 194 logger.debug("Resolving URL: " + url); 195 logger.debug("Attempting site specific consumers"); 196 197 for (final SiteSpecificConsumer consumer : siteSpecific) { 198 if (consumer.canConsume(url)) { 199 logger.debug("Site specific consumer: " + consumer.getClass().getName() + " working on link"); 200 final List<URL> urlList = consumer.consume(url); 201 202 if (urlList != null && !urlList.isEmpty()) { 203 logger.debug("Site specific consumer returned non-null, returning the URLs"); 204 205 return urlList; 206 } 207 } 208 } 209 210 if (fallback) { 211 try { 212 logger.debug("Site specific consumers failed, trying the raw link"); 213 214 final StatusConsumerRedirectStrategy redirector = new StatusConsumerRedirectStrategy(); 215 final IndependentPair<HttpEntity, ByteArrayInputStream> headersBais = HttpUtils 216 .readURLAsByteArrayInputStream(url, 1000, 1000, redirector, HttpUtils.DEFAULT_USERAGENT); 217 218 if (redirector.wasRedirected()) { 219 logger.debug("Redirect intercepted, adding redirection to list"); 220 221 final URL redirect = redirector.redirection(); 222 if (!redirect.toString().equals(url.toString())) 223 return processURLs(redirect); 224 } 225 226 // at this point any redirects have been resolved and the 227 // content 228 // can't be handled by any of the SSCs 229 // we now check to see if it's image data 230 231 final HttpEntity headers = headersBais.firstObject(); 232 final ByteArrayInputStream bais = headersBais.getSecondObject(); 233 234 final String typeValue = headers.getContentType().getValue(); 235 if (typeValue.contains("text")) { 236 logger.debug(url + " ignored -- text content"); 237 return null; 238 } else { 239 // Not text? try reading it as an image! 240 if (typeValue.contains("gif")) { 241 // It is a gif! just download it normally (i.e. null 242 // image 243 // but not null URL) 244 return Lists.newArrayList(url); 245 } else { 246 // otherwise just try to read the damn image 247 ImageUtilities.readMBF(bais); 248 return Lists.newArrayList(url); 249 } 250 } 251 } catch (final Throwable e) { 252 // This input is probably not an image! 253 logger.debug(url + " ignored -- exception", e); 254 } 255 } 256 257 return null; 258 } 259}