Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.stream.functions;
031
032import java.io.ByteArrayInputStream;
033import java.net.MalformedURLException;
034import java.net.URL;
035import java.util.Arrays;
036import java.util.List;
037
038import org.apache.http.HttpEntity;
039import org.apache.http.HttpRequest;
040import org.apache.http.HttpResponse;
041import org.apache.http.ProtocolException;
042import org.apache.http.protocol.HttpContext;
043import org.apache.log4j.Logger;
044import org.openimaj.image.ImageUtilities;
045import org.openimaj.io.HttpUtils;
046import org.openimaj.io.HttpUtils.MetaRefreshRedirectStrategy;
047import org.openimaj.util.pair.IndependentPair;
048import org.openimaj.web.scraping.SiteSpecificConsumer;
049import org.openimaj.web.scraping.images.CommonHTMLConsumers;
050import org.openimaj.web.scraping.images.FacebookConsumer;
051import org.openimaj.web.scraping.images.ImgurConsumer;
052import org.openimaj.web.scraping.images.InstagramConsumer;
053import org.openimaj.web.scraping.images.OwlyImageConsumer;
054import org.openimaj.web.scraping.images.TmblrPhotoConsumer;
055import org.openimaj.web.scraping.images.TwipleConsumer;
056import org.openimaj.web.scraping.images.TwitPicConsumer;
057import org.openimaj.web.scraping.images.TwitterPhotoConsumer;
058import org.openimaj.web.scraping.images.YfrogConsumer;
059
060import com.google.common.collect.Lists;
061
062/**
063 * This class implements a function that, given an input URL, outputs a list
064 * of URLs to the possible images related to the input URL. This works by using
065 * a set of {@link SiteSpecificConsumer}s for common image hosting sites to
066 * determine if the input URL is likely to lead to an image or images.
067 * <p>
068 * Currently, the following consumers are included:
069 * <ul>
070 * <li> {@link InstagramConsumer}
071 * <li> {@link TwitterPhotoConsumer}
072 * <li> {@link TmblrPhotoConsumer}
073 * <li> {@link TwitPicConsumer}
074 * <li> {@link ImgurConsumer}
075 * <li> {@link FacebookConsumer}
076 * <li> {@link YfrogConsumer}
077 * <li> {@link OwlyImageConsumer}
078 * <li> {@link TwipleConsumer}
079 * <li> {@link CommonHTMLConsumers#FOTOLOG}
080 * <li> {@link CommonHTMLConsumers#PHOTONUI}
081 * <li> {@link CommonHTMLConsumers#PICS_LOCKERZ}
082 * </ul>
083 *
084 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
085 */
086public class ImageSiteURLExtractor extends SiteSpecificURLExtractor {
087        private static final Logger logger = Logger.getLogger(ImageSiteURLExtractor.class);
088
089        private boolean fallback = false;
090
091        /**
092         * Construct with or without Tumblr support
093         *
094         * @param tumblr
095         *            true if tumblr is required.
096         * @param fallback
097         *            true if should try to download directly
098         */
099        public ImageSiteURLExtractor(boolean tumblr, boolean fallback) {
100                this(tumblr);
101                this.fallback = fallback;
102        }
103
104        /**
105         * Construct with or without Tumblr support
106         *
107         * @param tumblr
108         *            true if tumblr is required.
109         */
110        public ImageSiteURLExtractor(boolean tumblr) {
111                super();
112
113                siteSpecific.addAll(Arrays.asList(
114                                new TwitterPhotoConsumer(),
115                                new InstagramConsumer(),
116                                new TwitPicConsumer(),
117                                new ImgurConsumer(),
118                                new FacebookConsumer(),
119                                new YfrogConsumer(),
120                                new OwlyImageConsumer(),
121                                new TwipleConsumer(),
122                                CommonHTMLConsumers.FOTOLOG,
123                                CommonHTMLConsumers.PHOTONUI,
124                                CommonHTMLConsumers.PICS_LOCKERZ));
125
126                if (tumblr)
127                        siteSpecific.add(new TmblrPhotoConsumer());
128        }
129
130        /**
131         * Default constructor; includes tumblr support.
132         */
133        public ImageSiteURLExtractor() {
134                this(true);
135        }
136
137        /**
138         * An extension of the {@link MetaRefreshRedirectStrategy} which disallows
139         * all redirects and instead remembers a redirect for use later on.
140         *
141         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
142         */
143        private static class StatusConsumerRedirectStrategy extends MetaRefreshRedirectStrategy {
144                private boolean wasRedirected = false;
145                private URL redirection;
146
147                @Override
148                public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context)
149                                throws ProtocolException
150                {
151                        wasRedirected = super.isRedirected(request, response, context);
152
153                        if (wasRedirected) {
154                                try {
155                                        this.redirection = this.getRedirect(request, response, context).getURI().toURL();
156                                } catch (final MalformedURLException e) {
157                                        this.wasRedirected = false;
158                                }
159                        }
160                        return false;
161                }
162
163                /**
164                 * @return whether a redirect was found
165                 */
166                public boolean wasRedirected() {
167                        return wasRedirected;
168                }
169
170                /**
171                 * @return the redirection
172                 */
173                public URL redirection() {
174                        return redirection;
175                }
176        }
177
178        /**
179         * First, try all the {@link SiteSpecificConsumer} instances loaded into
180         * {@link #siteSpecific}. If any consumer takes control of a link the
181         * consumer's output is used
182         *
183         * if this fails use
184         * {@link HttpUtils#readURLAsByteArrayInputStream(URL, org.apache.http.client.RedirectStrategy)}
185         * with a {@link StatusConsumerRedirectStrategy} which specifically
186         * disallows redirects to be dealt with automatically and forces this
187         * function to be called for each redirect.
188         *
189         * @param url
190         * @return a list of images or null
191         */
192        @Override
193        protected List<URL> processURLs(URL url) {
194                logger.debug("Resolving URL: " + url);
195                logger.debug("Attempting site specific consumers");
196
197                for (final SiteSpecificConsumer consumer : siteSpecific) {
198                        if (consumer.canConsume(url)) {
199                                logger.debug("Site specific consumer: " + consumer.getClass().getName() + " working on link");
200                                final List<URL> urlList = consumer.consume(url);
201
202                                if (urlList != null && !urlList.isEmpty()) {
203                                        logger.debug("Site specific consumer returned non-null, returning the URLs");
204
205                                        return urlList;
206                                }
207                        }
208                }
209
210                if (fallback) {
211                        try {
212                                logger.debug("Site specific consumers failed, trying the raw link");
213
214                                final StatusConsumerRedirectStrategy redirector = new StatusConsumerRedirectStrategy();
215                                final IndependentPair<HttpEntity, ByteArrayInputStream> headersBais = HttpUtils
216                                                .readURLAsByteArrayInputStream(url, 1000, 1000, redirector, HttpUtils.DEFAULT_USERAGENT);
217
218                                if (redirector.wasRedirected()) {
219                                        logger.debug("Redirect intercepted, adding redirection to list");
220
221                                        final URL redirect = redirector.redirection();
222                                        if (!redirect.toString().equals(url.toString()))
223                                                return processURLs(redirect);
224                                }
225
226                                // at this point any redirects have been resolved and the
227                                // content
228                                // can't be handled by any of the SSCs
229                                // we now check to see if it's image data
230
231                                final HttpEntity headers = headersBais.firstObject();
232                                final ByteArrayInputStream bais = headersBais.getSecondObject();
233
234                                final String typeValue = headers.getContentType().getValue();
235                                if (typeValue.contains("text")) {
236                                        logger.debug(url + " ignored -- text content");
237                                        return null;
238                                } else {
239                                        // Not text? try reading it as an image!
240                                        if (typeValue.contains("gif")) {
241                                                // It is a gif! just download it normally (i.e. null
242                                                // image
243                                                // but not null URL)
244                                                return Lists.newArrayList(url);
245                                        } else {
246                                                // otherwise just try to read the damn image
247                                                ImageUtilities.readMBF(bais);
248                                                return Lists.newArrayList(url);
249                                        }
250                                }
251                        } catch (final Throwable e) {
252                                // This input is probably not an image!
253                                logger.debug(url + " ignored -- exception", e);
254                        }
255                }
256
257                return null;
258        }
259}