001/**
002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.scraping.images;
018
019import java.net.URL;
020import java.util.Arrays;
021import java.util.List;
022
023import org.jsoup.Jsoup;
024import org.jsoup.nodes.Document;
025import org.jsoup.nodes.Element;
026import org.jsoup.select.Elements;
027import org.openimaj.web.scraping.SiteSpecificConsumer;
028
029/**
030 * Use JSoup to load the twitpic page and find the img tag that has a source
031 * which contains the string "photos" or "cloudfront"
032 * 
033 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
034 * 
035 */
036public class TwitPicConsumer implements SiteSpecificConsumer {
037        @Override
038        public boolean canConsume(URL url) {
039                // http://twitpic.com/a67733
040                return url.getHost().contains("twitpic.com");
041        }
042
043        @Override
044        public List<URL> consume(URL url) {
045                String largeURLStr = url.toString();
046                if (!largeURLStr.endsWith("full")) {
047                        largeURLStr += "/full";
048                }
049                try {
050                        final Document doc = Jsoup.connect(largeURLStr).get();
051                        final Elements largeimage = doc.select("img");
052                        String imgSrc = "";
053                        for (final Element e : largeimage) {
054                                imgSrc = e.attr("src");
055                                if (imgSrc.contains("photos") || imgSrc.contains("cloudfront")) {
056                                        break;
057                                }
058                        }
059                        final URL link = new URL(imgSrc);
060                        final List<URL> a = Arrays.asList(link);
061                        return a;
062                } catch (final Exception e) {
063                        return null;
064                }
065
066        }
067}