001/** 002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the 003 * individual contributors. All rights reserved. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.openimaj.web.scraping.images; 018 019import java.net.URL; 020import java.util.Arrays; 021import java.util.List; 022 023import org.jsoup.Jsoup; 024import org.jsoup.nodes.Document; 025import org.jsoup.nodes.Element; 026import org.jsoup.select.Elements; 027import org.openimaj.web.scraping.SiteSpecificConsumer; 028 029/** 030 * Use JSoup to load the twitpic page and find the img tag that has a source 031 * which contains the string "photos" or "cloudfront" 032 * 033 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 034 * 035 */ 036public class TwitPicConsumer implements SiteSpecificConsumer { 037 @Override 038 public boolean canConsume(URL url) { 039 // http://twitpic.com/a67733 040 return url.getHost().contains("twitpic.com"); 041 } 042 043 @Override 044 public List<URL> consume(URL url) { 045 String largeURLStr = url.toString(); 046 if (!largeURLStr.endsWith("full")) { 047 largeURLStr += "/full"; 048 } 049 try { 050 final Document doc = Jsoup.connect(largeURLStr).get(); 051 final Elements largeimage = doc.select("img"); 052 String imgSrc = ""; 053 for (final Element e : largeimage) { 054 imgSrc = e.attr("src"); 055 if (imgSrc.contains("photos") || imgSrc.contains("cloudfront")) { 056 break; 057 } 058 } 059 final URL link = new URL(imgSrc); 060 final List<URL> a = Arrays.asList(link); 061 return a; 062 } catch (final Exception e) { 063 return null; 064 } 065 066 } 067}