Source code

001/**
002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.scraping;
018
019import java.io.ByteArrayInputStream;
020import java.net.URL;
021import java.util.ArrayList;
022import java.util.List;
023
024import org.jsoup.Jsoup;
025import org.jsoup.nodes.Document;
026import org.jsoup.nodes.Element;
027import org.jsoup.select.Elements;
028import org.openimaj.io.HttpUtils;
029import org.openimaj.web.scraping.SiteSpecificConsumer;
030
031/**
032 * Abstract base for scraping data from elements in web pages
033 * 
034 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
035 */
036public abstract class HTMLScrapingSiteSpecificConsumer implements SiteSpecificConsumer {
037        @Override
038        public List<URL> consume(URL url) {
039                try {
040                        final ByteArrayInputStream stream = HttpUtils.readURLAsByteArrayInputStream(url, 1000, 1000, null,
041                                        HttpUtils.DEFAULT_USERAGENT).getSecondObject();
042                        final byte[] retPage = org.apache.commons.io.IOUtils.toByteArray(stream);
043                        final Document soup = Jsoup.parse(new String(retPage, "UTF-8"));
044                        final Elements imageElement = soup.select(cssSelect());
045                        final List<URL> ret = new ArrayList<URL>();
046                        for (final Element element : imageElement) {
047                                final String imageSource = element.attr("src");
048                                if (imageSource != null) {
049                                        try {
050                                                final URL link = new URL(imageSource);
051                                                ret.add(link);
052                                        } catch (final Throwable e) {
053                                                // ?? maybe it didn't have the host in the src?
054                                                final URL link = new URL(url.getProtocol(), url.getHost(), imageSource);
055                                                ret.add(link);
056                                        }
057                                }
058                        }
059                        return ret;
060                } catch (final Throwable e) {
061                        return null;
062                }
063        }
064
065        /**
066         * @return the css selection from which to find the img to scrape
067         */
068        public abstract String cssSelect();
069}