Source code

001package org.openimaj.picslurper;
002
003import java.io.ByteArrayInputStream;
004import java.io.File;
005import java.io.IOException;
006import java.net.MalformedURLException;
007import java.net.URL;
008import java.util.ArrayList;
009import java.util.Arrays;
010import java.util.HashSet;
011import java.util.List;
012import java.util.Set;
013import java.util.regex.Matcher;
014import java.util.regex.Pattern;
015
016import org.apache.commons.io.FileUtils;
017import org.apache.http.HttpEntity;
018import org.apache.http.HttpRequest;
019import org.apache.http.HttpResponse;
020import org.apache.http.ProtocolException;
021import org.apache.http.protocol.HttpContext;
022import org.apache.log4j.Logger;
023import org.openimaj.image.ImageUtilities;
024import org.openimaj.image.MBFImage;
025import org.openimaj.io.HttpUtils;
026import org.openimaj.io.HttpUtils.MetaRefreshRedirectStrategy;
027import org.openimaj.picslurper.output.OutputListener;
028import org.openimaj.picslurper.output.WriteableImageOutput;
029import org.openimaj.text.nlp.patterns.URLPatternProvider;
030import org.openimaj.twitter.collection.StreamJSONStatusList.ReadableWritableJSON;
031import org.openimaj.util.pair.IndependentPair;
032import org.openimaj.web.scraping.SiteSpecificConsumer;
033import org.openimaj.web.scraping.images.CommonHTMLConsumers;
034import org.openimaj.web.scraping.images.FacebookConsumer;
035import org.openimaj.web.scraping.images.ImgurConsumer;
036import org.openimaj.web.scraping.images.InstagramConsumer;
037import org.openimaj.web.scraping.images.OwlyImageConsumer;
038import org.openimaj.web.scraping.images.TwipleConsumer;
039import org.openimaj.web.scraping.images.TwitPicConsumer;
040import org.openimaj.web.scraping.images.TwitterPhotoConsumer;
041import org.openimaj.web.scraping.images.YfrogConsumer;
042
043import twitter4j.Status;
044import twitter4j.URLEntity;
045
046/**
047 * A status consumer knows how to consume a {@link ReadableWritableJSON} and
048 * output image files. Currently this {@link StatusConsumer} only understands
049 * Twitter JSON, perhaps making it abstract and turning {@link #consume(Status)}
050 * into an abstract function that can deal with other types of status would be
051 * sensible
052 *
053 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
054 *
055 */
056public class StatusConsumer {
057
058        /**
059         * The logger
060         */
061        public static Logger logger = Logger.getLogger(StatusConsumer.class);
062
063        final static Pattern urlPattern = new URLPatternProvider().pattern();
064        /**
065         * the site specific consumers
066         */
067        public final static List<SiteSpecificConsumer> siteSpecific = new ArrayList<SiteSpecificConsumer>();
068        static {
069                StatusConsumer.siteSpecific.add(new InstagramConsumer());
070                StatusConsumer.siteSpecific.add(new TwitterPhotoConsumer());
071                // StatusConsumer.siteSpecific.add(new TmblrPhotoConsumer());
072                StatusConsumer.siteSpecific.add(new TwitPicConsumer());
073                StatusConsumer.siteSpecific.add(new ImgurConsumer());
074                StatusConsumer.siteSpecific.add(new FacebookConsumer());
075                StatusConsumer.siteSpecific.add(new YfrogConsumer());
076                StatusConsumer.siteSpecific.add(new OwlyImageConsumer());
077                StatusConsumer.siteSpecific.add(new TwipleConsumer());
078                StatusConsumer.siteSpecific.add(CommonHTMLConsumers.FOTOLOG);
079                StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PHOTONUI);
080                StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PICS_LOCKERZ);
081        }
082        private boolean outputStats;
083        private File globalStats;
084        private File outputLocation;
085
086        private final Set<String> toProcess;
087
088        private final HashSet<String> previouslySeen;
089
090        private List<OutputListener> outputModes;
091
092        /**
093         * @param outputStats
094         *            whether statistics should be outputted
095         * @param globalStats
096         *            the global statistics file
097         * @param outputLocation
098         *            the output location for this status
099         * @param outputModes
100         *            the output modes informed on image downloads
101         *
102         */
103        public StatusConsumer(final boolean outputStats, final File globalStats, final File outputLocation,
104                        final List<OutputListener> outputModes)
105        {
106                this();
107                this.outputStats = outputStats;
108                this.globalStats = globalStats;
109                this.outputLocation = outputLocation;
110                this.outputModes = outputModes;
111
112        }
113
114        /**
115         * for convenience
116         */
117        public StatusConsumer() {
118                this.previouslySeen = new HashSet<String>();
119                this.toProcess = new HashSet<String>();
120        }
121
122        class LoggingStatus {
123                List<String> strings = new ArrayList<String>();
124        }
125
126        /**
127         * @param status
128         * @return the statistics of the consumption
129         * @throws Exception
130         */
131        public StatusConsumption consume(final Status status) throws Exception {
132                StatusConsumption cons;
133                // Now add all the entries from entities.urls
134
135                if (status.getURLEntities() != null) {
136
137                        for (final URLEntity map : status.getURLEntities()) {
138                                String u = map.getExpandedURL();
139                                if (u == null) {
140                                        u = map.getURL();
141                                }
142                                if (u == null)
143                                        continue;
144                                final String eurl = u.toString();
145                                if (eurl == null)
146                                        continue;
147                                this.add(eurl);
148                        }
149                }
150                // Find the URLs in the raw text
151                final String text = status.getText();
152                if (text != null) { // why was text null?
153                        final Matcher matcher = StatusConsumer.urlPattern.matcher(text);
154                        while (matcher.find()) {
155                                final String urlString = text.substring(matcher.start(), matcher.end());
156                                this.add(urlString);
157                        }
158                }
159
160                // now go through all the links and process them (i.e. download them)
161                cons = this.processAll(status);
162
163                if (this.outputStats)
164                        PicSlurperUtils.updateStats(this.globalStats, cons, true);
165                return cons;
166        }
167
168        /**
169         * Process all added URLs
170         *
171         * @param status
172         * @return the {@link StatusConsumption} statistics
173         * @throws IOException
174         */
175        public StatusConsumption processAll(final Status status) throws IOException {
176                final StatusConsumption cons = new StatusConsumption();
177                cons.nTweets = 1;
178                cons.nURLs = 0;
179                while (this.toProcess.size() > 0) {
180                        final String url = this.toProcess.iterator().next();
181                        this.toProcess.remove(url);
182                        cons.nURLs++;
183                        final File urlOut = this.resolveURL(new URL(url), cons);
184                        if (urlOut != null) {
185                                final File outStats = new File(urlOut, "status.txt");
186                                PicSlurperUtils.updateStats(outStats, cons);
187                                PicSlurperUtils.updateTweets(urlOut, status);
188                                for (final OutputListener outputMode : this.outputModes) {
189                                        outputMode.newImageDownloaded(new WriteableImageOutput(status, new URL(url), urlOut, cons));
190                                }
191                        }
192
193                }
194                return cons;
195        }
196
197        /**
198         * Add a URL to process without allowing already seen URLs to be added
199         *
200         * @param newURL
201         */
202        public void add(final String newURL) {
203                boolean add = true;
204                for (final String string : this.previouslySeen) {
205                        if (string.startsWith(newURL) || newURL.startsWith(string) || newURL.equals(string)) {
206                                add = false;
207                                break;
208                        }
209                }
210                if (add) {
211                        StatusConsumer.logger.debug("New URL added to list: " + newURL);
212                        this.toProcess.add(newURL);
213                        this.previouslySeen.add(newURL);
214                } else {
215                        StatusConsumer.logger.debug("URL not added, already exists: " + newURL);
216                }
217        }
218
219        /**
220         * Given a URL, use {@link #urlToImage(URL)} to turn the url into a list of
221         * images and write the images into the output location using the names
222         * "image_N.png"
223         *
224         * @param url
225         * @param cons
226         *            the consumption stats
227         * @return the root output location
228         */
229        public File resolveURL(final URL url, final StatusConsumption cons) {
230                final List<IndependentPair<URL, MBFImage>> image = this.urlToImage(url);
231                if (image == null)
232                        return null;
233                File outputDir;
234                try {
235                        if (this.outputLocation == null)
236                                return null;
237                        outputDir = StatusConsumer.urlToOutput(url, this.outputLocation);
238                        cons.nTweets++;
239                        int n = 0;
240                        for (final IndependentPair<URL, MBFImage> mbfImage : image) {
241                                final URL urlReadFrom = mbfImage.firstObject();
242                                final MBFImage imageToWrite = mbfImage.secondObject();
243                                File outImage = null;
244                                if (imageToWrite == null) {
245                                        StatusConsumer.logger.debug("Downloading a raw GIF");
246                                        // For now this is the signal that we have a GIF. Write the
247                                        // gif.
248                                        outImage = new File(outputDir, String.format("image_%d.gif", n++));
249                                        final byte[] value = HttpUtils.readURLAsBytes(urlReadFrom, false);
250                                        FileUtils.writeByteArrayToFile(outImage, value);
251                                } else {
252                                        StatusConsumer.logger.debug("Downloading a normal image");
253                                        outImage = new File(outputDir, String.format("image_%d.png", n++));
254                                        ImageUtilities.write(imageToWrite, outImage);
255                                }
256                                cons.nImages++;
257                                cons.imageURLs.add(urlReadFrom);
258                        }
259                        return outputDir;
260                } catch (final IOException e) {
261                        e.printStackTrace();
262                }
263                return null;
264
265        }
266
267        /**
268         * An extention of the {@link MetaRefreshRedirectStrategy} which disallows
269         * all redirects and instead remembers a redirect for use later on.
270         *
271         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
272         *
273         */
274        public static class StatusConsumerRedirectStrategy extends MetaRefreshRedirectStrategy {
275                private boolean wasRedirected = false;
276                private URL redirection;
277
278                @Override
279                public boolean isRedirected(final HttpRequest request, final HttpResponse response, final HttpContext context)
280                                throws ProtocolException
281                {
282                        this.wasRedirected = super.isRedirected(request, response, context);
283
284                        if (this.wasRedirected) {
285                                try {
286                                        this.redirection = this.getRedirect(request, response, context).getURI().toURL();
287                                } catch (final MalformedURLException e) {
288                                        this.wasRedirected = false;
289                                }
290                        }
291                        return false;
292                }
293
294                /**
295                 * @return whether a redirect was found
296                 */
297                public boolean wasRedirected() {
298                        return this.wasRedirected;
299                }
300
301                /**
302                 * @return the redirection
303                 */
304                public URL redirection() {
305                        return this.redirection;
306                }
307        }
308
309        /**
310         * First, try all the {@link SiteSpecificConsumer} instances loaded into
311         * {@link #siteSpecific}. If any consumer takes control of a link the
312         * consumer's output is used
313         *
314         * if this fails use
315         * {@link HttpUtils#readURLAsByteArrayInputStream(URL, org.apache.http.client.RedirectStrategy)}
316         * with a {@link StatusConsumerRedirectStrategy} which specifically
317         * disallows redirects to be dealt with automatically and forces this
318         * function to be called for each redirect.
319         *
320         *
321         * @param url
322         * @return a list of images or null
323         */
324        public List<IndependentPair<URL, MBFImage>> urlToImage(final URL url) {
325                StatusConsumer.logger.debug("Resolving URL: " + url);
326                StatusConsumer.logger.debug("Attempting site specific consumers");
327                List<IndependentPair<URL, MBFImage>> image = null;
328                for (final SiteSpecificConsumer consumer : StatusConsumer.siteSpecific) {
329                        if (consumer.canConsume(url)) {
330                                StatusConsumer.logger.debug("Site specific consumer: " + consumer.getClass().getName()
331                                                + " working on link");
332                                final List<URL> urlList = consumer.consume(url);
333                                if (urlList != null && !urlList.isEmpty()) {
334                                        StatusConsumer.logger.debug("Site specific consumer returned non-null, adding the URLs");
335                                        for (final URL siteSpecific : urlList) {
336                                                this.add(siteSpecific.toString());
337                                        }
338                                        return image;
339                                }
340                        }
341                }
342                try {
343                        StatusConsumer.logger.debug("Site specific consumers failed, trying the raw link");
344                        final StatusConsumerRedirectStrategy redirector = new StatusConsumerRedirectStrategy();
345                        final IndependentPair<HttpEntity, ByteArrayInputStream> headersBais = HttpUtils
346                                        .readURLAsByteArrayInputStream(url, 1000, 1000, redirector, HttpUtils.DEFAULT_USERAGENT);
347                        if (redirector.wasRedirected()) {
348                                StatusConsumer.logger.debug("Redirect intercepted, adding redirection to list");
349                                final String redirect = redirector.redirection().toString();
350                                if (!redirect.equals(url.toString()))
351                                        this.add(redirect);
352                                return null;
353                        }
354                        final HttpEntity headers = headersBais.firstObject();
355                        final ByteArrayInputStream bais = headersBais.getSecondObject();
356                        final String typeValue = headers.getContentType().getValue();
357                        if (typeValue.contains("text")) {
358                                this.reportFailedURL(url, "text content");
359                                return null;
360                        } else {
361                                // Not text? try reading it as an image!
362                                MBFImage readMBF = null;
363                                if (typeValue.contains("gif")) {
364                                        // It is a gif! just download it normally (i.e. null image
365                                        // but not null URL)
366                                        readMBF = null;
367                                } else {
368                                        // otherwise just try to read the damn image
369                                        readMBF = ImageUtilities.readMBF(bais);
370                                }
371                                final IndependentPair<URL, MBFImage> pair = IndependentPair.pair(url, readMBF);
372                                image = Arrays.asList(pair);
373                                StatusConsumer.logger.debug("Link resolved, returning image.");
374                                return image;
375                        }
376                } catch (final Throwable e) { // This input might not be an image! deal
377                        // with that
378                        this.reportFailedURL(url, e.getMessage());
379                        return null;
380                }
381        }
382
383        private void reportFailedURL(final URL url, final String reason) {
384                if (this.outputModes != null) {
385                        for (final OutputListener listener : this.outputModes) {
386                                listener.failedURL(url, reason);
387                        }
388                }
389        }
390
391        /**
392         * Construct a file in the output location for a given url
393         *
394         * @param url
395         * @param outputLocation
396         * @return a file that looks like: outputLocation/protocol/path/query/...
397         * @throws IOException
398         */
399        public static synchronized File urlToOutput(final URL url, final File outputLocation) throws IOException {
400                String urlPath = url.getProtocol() + File.separator +
401                                url.getHost() + File.separator;
402                if (!url.getPath().equals(""))
403                        urlPath += StatusConsumer.sanitizeFilename(url.getPath()) + File.separator;
404                if (url.getQuery() != null)
405                        urlPath += StatusConsumer.sanitizeFilename(url.getQuery()) + File.separator;
406
407                final String outPath = outputLocation.getAbsolutePath() + File.separator + urlPath;
408                final File outFile = new File(outPath);
409                if (outFile.exists()) {
410                        if (outFile.isDirectory()) {
411                                return outFile;
412                        } else {
413                                StatusConsumer.createURLOutDir(outFile);
414                        }
415                } else {
416                        StatusConsumer.createURLOutDir(outFile);
417                }
418                return outFile;
419        }
420
421        /**
422         * Replaces illegal characters in a filename with "_" illegal characters : :
423         * \ / * ? | < >
424         *
425         * @param name
426         * @return Sanitised filename
427         */
428        public static String sanitizeFilename(final String name) {
429                return name.replaceAll("[:\\\\/*?|<>]", "_");
430        }
431
432        static void createURLOutDir(final File outFile) throws IOException {
433                if (!((!outFile.exists() || outFile.delete()) && outFile.mkdirs())) {
434                        throw new IOException("Couldn't create URL output: " + outFile.getAbsolutePath());
435                }
436        }
437
438}