/**
 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
 * individual contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openimaj.web.scraping.images;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.openimaj.util.api.auth.DefaultTokenFactory;
import org.openimaj.util.auth.web.TumblrAPIToken;
import org.openimaj.web.scraping.SiteSpecificConsumer;

import com.google.gson.Gson;

/**
 * Using a Tumblr API key, turn a Tumblr URL into a post id and call the
 * Tumblr API's posts function to find the photos attached to that post.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class TmblrPhotoConsumer implements SiteSpecificConsumer {
	private transient Gson gson = new Gson();
	private final TumblrAPIToken token;

	/**
	 * Format string for the API request; the arguments are the post id and
	 * the api key, in that order.
	 * <p>
	 * NOTE(review): the blog identifier is hard coded to "derekg.org" rather
	 * than being derived from the consumed URL. Presumably the API resolved
	 * posts by id regardless of the blog named here - confirm against the
	 * current Tumblr API before relying on it.
	 */
	String tumblrAPICall = "http://api.tumblr.com/v2/blog/derekg.org/posts?id=%s&api_key=%s";

	/**
	 * Use the {@link DefaultTokenFactory} to load the default api token
	 */
	public TmblrPhotoConsumer() {
		this(DefaultTokenFactory.get(TumblrAPIToken.class));
	}

	/**
	 * Construct with the given api token
	 *
	 * @param token
	 *            the api token
	 */
	public TmblrPhotoConsumer(TumblrAPIToken token) {
		this.token = token;
	}

	/**
	 * Accepts short links (e.g. http://tmblr.co/ZoH2IyP4lDVD) and addresses
	 * on tumblr.com or one of its subdomains, except hosts whose name
	 * contains "media".
	 *
	 * @param url
	 *            the address to test
	 * @return true if this consumer should be asked to consume the url
	 */
	@Override
	public boolean canConsume(URL url) {
		final String host = url.getHost();
		// Require a real domain boundary so that look-alike hosts such as
		// "nottumblr.com" are not treated as tumblr addresses.
		final boolean tumblrHost = host.equals("tmblr.co")
				|| host.equals("tumblr.com")
				|| host.endsWith(".tumblr.com");
		return tumblrHost && !host.contains("media");
	}

	/**
	 * Resolve the url to a post id, ask the posts API about that post and
	 * collect the "original_size" address of each of its photos.
	 *
	 * @param url
	 *            the address of a tumblr post, or a link forwarding to one
	 * @return the photo addresses; an empty list if no post id could be found
	 *         in the url; null if the post has no photos or if anything went
	 *         wrong while resolving or querying
	 */
	@SuppressWarnings("unchecked")
	@Override
	public List<URL> consume(URL url) {
		try {
			final List<URL> images = new ArrayList<URL>();

			// construct the actual tumblr address
			final String postID = getPostID(url);
			if (postID == null)
				return images;

			// NOW call the tumblrAPI
			final String tmblrRequest = String.format(tumblrAPICall, postID, token.apikey);
			final Map<String, Object> res = readJSON(tmblrRequest);

			// Any unexpected shape below (missing key, empty post list, wrong
			// type) raises a runtime exception which is turned into null by
			// the catch clause.
			final Map<?, ?> response = (Map<?, ?>) res.get("response");
			final Map<?, ?> posts = (Map<?, ?>) ((List<?>) response.get("posts")).get(0);
			final List<Map<?, ?>> photos = ((List<Map<?, ?>>) posts.get("photos"));
			if (photos == null)
				return null;

			for (final Map<?, ?> photo : photos) {
				final String photoURLStr = (String) ((Map<String, Object>) photo.get("original_size")).get("url");
				images.add(new URL(photoURLStr));
			}

			return images;
		} catch (final Exception e) {
			// Best effort: a failed lookup is reported as null rather than
			// propagated. Only Exceptions are absorbed; JVM Errors are left
			// to propagate.
			return null;
		}
	}

	/**
	 * Fetch the given address and parse the body as a JSON object. The
	 * stream is always closed, whether or not parsing succeeds.
	 *
	 * @param request
	 *            the address to fetch
	 * @return the parsed JSON object
	 * @throws IOException
	 *             if the address could not be opened or read
	 */
	@SuppressWarnings("unchecked")
	private Map<String, Object> readJSON(String request) throws IOException {
		// Decode explicitly as UTF-8 rather than with the platform default.
		final InputStreamReader reader = new InputStreamReader(
				new URL(request).openConnection().getInputStream(), "UTF-8");
		try {
			return gson.fromJson(reader, Map.class);
		} finally {
			try {
				reader.close();
			} catch (final IOException ignored) {
				// Nothing useful can be done about a failed close; the
				// payload has either been parsed already or a more
				// informative exception is in flight.
			}
		}
	}

	/**
	 * Handles the variety of ways a tumblr address can be forwarded to.
	 * Links on tmblr.co, tumblr.com and www.tumblr.com are requested without
	 * following redirects and the "Location" header of the reply is used as
	 * the actual post address; any other host is used as it is. The post id
	 * is the path segment following "post".
	 *
	 * @param url
	 *            the address to resolve
	 * @return the post id, or null if the path holds no segment after "post"
	 * @throws IOException
	 *             if the forwarding request fails or its reply carries no
	 *             usable "Location" header
	 */
	private String getPostID(URL url) throws IOException {
		final String host = url.getHost();
		URL loc = url;

		if (host.equals("tmblr.co") || host.equals("tumblr.com") || host.equals("www.tumblr.com")) {
			final URL forwardURL;
			if (host.equals("tmblr.co")) {
				// short links are re-issued against www.tumblr.com
				forwardURL = new URL("http://www.tumblr.com" + url.getPath());
			}
			else {
				forwardURL = url;
			}

			// now get the location header
			final HttpURLConnection con = (HttpURLConnection) forwardURL.openConnection();
			try {
				con.setInstanceFollowRedirects(false);
				final String locStr = con.getHeaderField("Location");
				if (locStr == null)
					throw new IOException("No Location header returned by " + forwardURL);
				// Resolve against the request so relative redirects work;
				// an absolute Location is used unchanged.
				loc = new URL(forwardURL, locStr);
			} finally {
				// release the connection on failure as well as success
				con.disconnect();
			}
		}

		// Now extract the post ID from the actual tumblr address. The loop
		// bound guarantees that a segment follows any "post" it matches.
		final String[] parts = loc.getPath().split("[/]");
		for (int i = 0; i + 1 < parts.length; i++) {
			if (parts[i].equals("post")) {
				return parts[i + 1];
			}
		}
		return null;
	}

}