001/**
002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.scraping.images;
018
019import java.net.URL;
020import java.util.Arrays;
021import java.util.List;
022
023import org.jsoup.Jsoup;
024import org.jsoup.nodes.Document;
025import org.jsoup.select.Elements;
026import org.openimaj.web.scraping.SiteSpecificConsumer;
027
028/**
029 * Download images from twitter's own image hosting service
030 *
031 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
032 *
033 */
034public class TwitterPhotoConsumer implements SiteSpecificConsumer {
035        @Override
036        public boolean canConsume(URL url) {
037                // http://twitter.com/HutchSelenator/status/222772697531301890/photo/1
038                if (url.getHost().equals("twitter.com") && url.getPath().contains("photo"))
039                        return true;
040
041                // http://pbs.twimg.com/media/B_7Q6PMWAAAzvH0.jpg
042                return url.getHost().endsWith("twimg.com") && url.getPath().contains("media");
043        }
044
045        @Override
046        public List<URL> consume(URL url) {
047                if (url.getHost().endsWith("twimg.com")) {
048                        return Arrays.asList(new URL[] { url });
049                }
050
051                String largeURLStr = url.toString();
052                if (!largeURLStr.endsWith("large")) {
053                        largeURLStr += "/large";
054                }
055                try {
056                        final Document doc = Jsoup.connect(largeURLStr).get();
057                        final Elements largeimage = doc.select(".media-slideshow-image");
058                        final URL link = new URL(largeimage.get(0).attr("src"));
059                        return Arrays.asList(link);
060                } catch (final Exception e) {
061                        return null;
062                }
063        }
064}