001/**
002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.scraping.images;
018
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openimaj.io.HttpUtils;
import org.openimaj.web.scraping.SiteSpecificConsumer;
029
030/**
031 * Consume facebook posts/pictures using the {@link com.restfb.FacebookClient}
032 * client
033 * 
034 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
035 * 
036 */
037public class FacebookConsumer implements SiteSpecificConsumer {
038
039        @Override
040        public boolean canConsume(URL url) {
041                return url.getHost().contains("facebook");
042        }
043
044        @Override
045        public List<URL> consume(URL url) {
046                // posts == http://www.facebook.com/jsproducoes/posts/426306737404997
047                // photos ==
048                // http://www.facebook.com/photo.php?pid=1307526&l=3d755a0895&id=353116314727854
049                final String urlFile = url.getFile();
050
051                List<URL> ret = null;
052                if (urlFile.startsWith("/photo.php")) {
053                        ret = consumeFacebookPhoto(url);
054                }
055                else if (urlFile.contains("/posts/")) {
056                        ret = consumeFacebookPost(url);
057                }
058                if (ret == null || ret.isEmpty())
059                        return null;
060                return ret;
061        }
062
063        private List<URL> consumeFacebookPost(URL url) {
064                try {
065                        final byte[] retPage = HttpUtils.readURLAsBytes(url, false);
066                        final Document soup = Jsoup.parse(new String(retPage, "UTF-8"));
067                        final Elements imageElement = soup.select(".storyInnerContent img");
068                        final List<URL> ret = new ArrayList<URL>();
069                        for (final Element element : imageElement) {
070                                final String imageSource = element.attr("src");
071                                if (imageSource != null) {
072                                        final URL u = new URL(imageSource);
073                                        ret.add(u);
074                                }
075                        }
076                        return ret;
077                } catch (final Throwable e) {
078                        return null;
079                }
080        }
081
082        private List<URL> consumeFacebookPhoto(URL url) {
083                try {
084                        final byte[] retPage = HttpUtils.readURLAsBytes(url, false);
085                        final Document soup = Jsoup.parse(new String(retPage, "UTF-8"));
086                        final Elements imageElement = soup.select("#fbPhotoImage");
087                        final List<URL> ret = new ArrayList<URL>();
088                        for (final Element element : imageElement) {
089                                final String imageSource = element.attr("src");
090                                if (imageSource != null) {
091                                        final URL u = new URL(imageSource);
092                                        ret.add(u);
093                                }
094                        }
095                        return ret;
096                } catch (final Throwable e) {
097                        return null;
098                }
099        }
100}