/**
 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
 * individual contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openimaj.web.scraping;

import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openimaj.io.HttpUtils;

/**
 * Abstract base for scraping data from elements in web pages
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 */
public abstract class HTMLScrapingSiteSpecificConsumer implements SiteSpecificConsumer {
	/**
	 * Fetch the page at the given URL, select elements with the CSS query from
	 * {@link #cssSelect()} and return the {@code src} attribute of each match
	 * as a {@link URL}. A {@code src} that is not an absolute URL is resolved
	 * against the page's protocol and host.
	 *
	 * @param url
	 *            the page to scrape
	 * @return the scraped URLs, or <tt>null</tt> if the page could not be
	 *         fetched or parsed (null signals failure to callers)
	 */
	@Override
	public List<URL> consume(URL url) {
		try {
			final ByteArrayInputStream stream = HttpUtils.readURLAsByteArrayInputStream(url, 1000, 1000, null,
					HttpUtils.DEFAULT_USERAGENT).getSecondObject();
			final byte[] retPage = org.apache.commons.io.IOUtils.toByteArray(stream);
			// StandardCharsets.UTF_8 replaces the "UTF-8" string literal: it
			// cannot throw UnsupportedEncodingException.
			final Document soup = Jsoup.parse(new String(retPage, StandardCharsets.UTF_8));
			final Elements imageElements = soup.select(cssSelect());
			final List<URL> ret = new ArrayList<URL>();
			for (final Element element : imageElements) {
				final String imageSource = element.attr("src");
				// Jsoup's attr() returns "" (never null) when the attribute is
				// absent, so the emptiness test is the one that matters; an
				// empty src would otherwise yield a bogus host-only URL below.
				if (imageSource != null && !imageSource.isEmpty()) {
					try {
						ret.add(new URL(imageSource));
					} catch (final MalformedURLException e) {
						// The src is likely relative (no protocol/host);
						// resolve it against the page's own protocol and host.
						// If this also fails, the outer catch reports failure.
						ret.add(new URL(url.getProtocol(), url.getHost(), imageSource));
					}
				}
			}
			return ret;
		} catch (final Exception e) {
			// Preserve the original contract: any failure yields null.
			// (Narrowed from Throwable so JVM Errors are no longer swallowed.)
			return null;
		}
	}

	/**
	 * @return the css selection from which to find the img to scrape
	 */
	public abstract String cssSelect();
}