/**
 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
 * individual contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openimaj.web.scraping.images;

import java.net.URL;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openimaj.web.scraping.SiteSpecificConsumer;

/**
 * Download images from twitter's own image hosting service
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class TwitterPhotoConsumer implements SiteSpecificConsumer {
	private static final String TWITTER_HOST = "twitter.com";
	private static final String TWIMG_HOST_SUFFIX = "twimg.com";
	private static final String LARGE_SUFFIX = "large";
	private static final String SLIDESHOW_IMAGE_SELECTOR = ".media-slideshow-image";

	/**
	 * Decide whether this consumer handles the given URL.
	 * <p>
	 * Two URL shapes are accepted: a URL whose host is {@code twitter.com} and
	 * whose path contains {@code photo}, or a URL whose host ends with
	 * {@code twimg.com} and whose path contains {@code media}. The host is
	 * compared case-insensitively; the path checks are case-sensitive.
	 *
	 * @param url
	 *            the URL to test
	 * @return true if the URL matches one of the two shapes above
	 */
	@Override
	public boolean canConsume(URL url) {
		final String path = url.getPath();

		// http://twitter.com/HutchSelenator/status/222772697531301890/photo/1
		if (hostOf(url).equals(TWITTER_HOST) && path.contains("photo")) {
			return true;
		}

		// http://pbs.twimg.com/media/B_7Q6PMWAAAzvH0.jpg
		return isTwimgHost(url) && path.contains("media");
	}

	/**
	 * Resolve the given URL to the image URL(s) it refers to.
	 * <p>
	 * A URL whose host ends with {@code twimg.com} is returned unchanged as a
	 * single-element list. Any other URL has {@code /large} appended (unless
	 * it already ends with {@code large}), the resulting page is fetched, and
	 * the {@code src} of the first element matching
	 * {@code .media-slideshow-image} is returned, resolved against the page's
	 * base URI.
	 *
	 * @param url
	 *            the URL to consume
	 * @return a single-element list holding the image URL, or null if the page
	 *         could not be fetched, contains no matching element, or the
	 *         element's {@code src} is not a usable URL
	 */
	@Override
	public List<URL> consume(URL url) {
		if (isTwimgHost(url)) {
			return Arrays.asList(url);
		}

		String largeURLStr = url.toString();
		if (!largeURLStr.endsWith(LARGE_SUFFIX)) {
			largeURLStr += "/" + LARGE_SUFFIX;
		}

		try {
			final Document doc = Jsoup.connect(largeURLStr).get();
			final Elements slideshowImages = doc.select(SLIDESHOW_IMAGE_SELECTOR);
			if (slideshowImages.isEmpty()) {
				// No image on the page: report failure explicitly rather than
				// via an IndexOutOfBoundsException.
				return null;
			}

			// absUrl resolves relative and protocol-relative src values
			// against the document's base URI; absolute values are unchanged.
			// A missing src yields "", which new URL(...) rejects below.
			final URL link = new URL(slideshowImages.first().absUrl("src"));
			return Arrays.asList(link);
		} catch (final Exception ignored) {
			// Deliberate best-effort: any fetch or parse failure is reported
			// to the caller as null instead of propagating.
			return null;
		}
	}

	/**
	 * Get the URL's host in lower case, so host comparisons are
	 * case-insensitive.
	 */
	private static String hostOf(URL url) {
		return url.getHost().toLowerCase(Locale.ROOT);
	}

	/**
	 * Test whether the URL's host ends with {@code twimg.com}.
	 */
	private static boolean isTwimgHost(URL url) {
		return hostOf(url).endsWith(TWIMG_HOST_SUFFIX);
	}
}