/**
 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
 * individual contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openimaj.web.scraping.images;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openimaj.io.HttpUtils;
import org.openimaj.web.scraping.SiteSpecificConsumer;

/**
 * Consume facebook posts/pictures by downloading the page and scraping the
 * image elements out of its HTML with Jsoup.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class FacebookConsumer implements SiteSpecificConsumer {

	/** CSS selector matching the images embedded in a post page. */
	private static final String POST_IMAGE_SELECTOR = ".storyInnerContent img";

	/** CSS selector matching the image of a photo page. */
	private static final String PHOTO_IMAGE_SELECTOR = "#fbPhotoImage";

	/**
	 * @param url
	 *            the URL to test
	 * @return true if the host of the URL contains the text "facebook"
	 */
	@Override
	public boolean canConsume(URL url) {
		return url.getHost().contains("facebook");
	}

	/**
	 * Extract the image URLs from a facebook photo or post page.
	 *
	 * @param url
	 *            the URL of the page
	 * @return the image URLs found on the page, or null if the URL is neither
	 *         a photo nor a post, the page could not be read, or no images
	 *         were found
	 */
	@Override
	public List<URL> consume(URL url) {
		// posts == http://www.facebook.com/jsproducoes/posts/426306737404997
		// photos ==
		// http://www.facebook.com/photo.php?pid=1307526&l=3d755a0895&id=353116314727854
		final String urlFile = url.getFile();

		List<URL> ret = null;
		if (urlFile.startsWith("/photo.php")) {
			ret = consumeFacebookPhoto(url);
		}
		else if (urlFile.contains("/posts/")) {
			ret = consumeFacebookPost(url);
		}
		if (ret == null || ret.isEmpty())
			return null;
		return ret;
	}

	/**
	 * Scrape the images embedded in a post page.
	 *
	 * @param url
	 *            the URL of the post
	 * @return the image URLs found (possibly empty), or null if the page
	 *         could not be read or parsed
	 */
	private List<URL> consumeFacebookPost(URL url) {
		return scrapeImages(url, POST_IMAGE_SELECTOR);
	}

	/**
	 * Scrape the image of a photo page.
	 *
	 * @param url
	 *            the URL of the photo page
	 * @return the image URLs found (possibly empty), or null if the page
	 *         could not be read or parsed
	 */
	private List<URL> consumeFacebookPhoto(URL url) {
		return scrapeImages(url, PHOTO_IMAGE_SELECTOR);
	}

	/**
	 * Download a page and collect the "src" attribute of every element
	 * matched by a CSS selector.
	 *
	 * @param url
	 *            the page to download
	 * @param selector
	 *            the CSS selector picking out the image elements
	 * @return the image URLs found (possibly empty), or null if the page
	 *         could not be read or parsed
	 */
	private List<URL> scrapeImages(URL url, String selector) {
		try {
			final byte[] retPage = HttpUtils.readURLAsBytes(url, false);
			final Document soup = Jsoup.parse(new String(retPage, "UTF-8"));
			final Elements imageElements = soup.select(selector);
			final List<URL> ret = new ArrayList<URL>();
			for (final Element element : imageElements) {
				// attr() yields an empty string, never null, when the
				// attribute is missing
				final String imageSource = element.attr("src");
				if (imageSource == null || imageSource.length() == 0)
					continue;
				try {
					// resolve against the page URL so relative sources work;
					// absolute sources are unaffected by the context
					ret.add(new URL(url, imageSource));
				} catch (final MalformedURLException e) {
					// a single unusable source must not discard the images
					// already collected from the rest of the page
				}
			}
			return ret;
		} catch (final Exception e) {
			// best effort: an unreadable or unparsable page is reported as
			// "nothing consumed" rather than propagated to the caller
			return null;
		}
	}
}