001/** 002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the 003 * individual contributors. All rights reserved. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.openimaj.web.scraping.images; 018 019import java.net.URL; 020import java.util.ArrayList; 021import java.util.List; 022 023import org.jsoup.Jsoup; 024import org.jsoup.nodes.Document; 025import org.jsoup.nodes.Element; 026import org.jsoup.select.Elements; 027import org.openimaj.io.HttpUtils; 028import org.openimaj.web.scraping.SiteSpecificConsumer; 029 030/** 031 * ow.ly is a url shortening service that also has an image sharing service 032 * 033 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 034 * 035 */ 036public class OwlyImageConsumer implements SiteSpecificConsumer { 037 038 @Override 039 public boolean canConsume(URL url) { 040 return url.getHost().contains("ow.ly") && url.getFile().startsWith("/i/"); 041 } 042 043 @Override 044 public List<URL> consume(URL url) { 045 try { 046 final byte[] retPage = HttpUtils.readURLAsBytes(url, false); 047 final Document soup = Jsoup.parse(new String(retPage, "UTF-8")); 048 final Elements imageElement = soup.select(".imageWrapper img"); 049 final List<URL> ret = new ArrayList<URL>(); 050 for (final Element element : imageElement) { 051 final String imageSource = element.attr("src"); 052 if (imageSource != null) { 053 final URL link = new URL(imageSource); 054 ret.add(link); 055 } 056 } 057 return ret; 058 } catch (final Throwable e) { 059 return null; 060 } 061 } 062 063}