001/**
002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.scraping.images;
018
019import java.net.URL;
020import java.util.ArrayList;
021import java.util.List;
022
023import org.jsoup.Jsoup;
024import org.jsoup.nodes.Document;
025import org.jsoup.nodes.Element;
026import org.jsoup.select.Elements;
027import org.openimaj.io.HttpUtils;
028import org.openimaj.web.scraping.SiteSpecificConsumer;
029
030/**
031 * ow.ly is a url shortening service that also has an image sharing service
032 * 
033 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
034 * 
035 */
036public class OwlyImageConsumer implements SiteSpecificConsumer {
037
038        @Override
039        public boolean canConsume(URL url) {
040                return url.getHost().contains("ow.ly") && url.getFile().startsWith("/i/");
041        }
042
043        @Override
044        public List<URL> consume(URL url) {
045                try {
046                        final byte[] retPage = HttpUtils.readURLAsBytes(url, false);
047                        final Document soup = Jsoup.parse(new String(retPage, "UTF-8"));
048                        final Elements imageElement = soup.select(".imageWrapper img");
049                        final List<URL> ret = new ArrayList<URL>();
050                        for (final Element element : imageElement) {
051                                final String imageSource = element.attr("src");
052                                if (imageSource != null) {
053                                        final URL link = new URL(imageSource);
054                                        ret.add(link);
055                                }
056                        }
057                        return ret;
058                } catch (final Throwable e) {
059                        return null;
060                }
061        }
062
063}