001/**
002 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.scraping.images;
018
019import java.io.IOException;
020import java.io.InputStreamReader;
021import java.net.HttpURLConnection;
022import java.net.URL;
023import java.util.ArrayList;
024import java.util.List;
025import java.util.Map;
026
027import org.openimaj.util.api.auth.DefaultTokenFactory;
028import org.openimaj.util.auth.web.TumblrAPIToken;
029import org.openimaj.web.scraping.SiteSpecificConsumer;
030
031import com.google.gson.Gson;
032
033/**
034 * Using a tumblr API key turn a Tmblr URL to an image id and call the tumblr
035 * API's posts function.
036 *
037 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
038 *
039 */
040public class TmblrPhotoConsumer implements SiteSpecificConsumer {
041        private transient Gson gson = new Gson();
042        private TumblrAPIToken token;
043
044        /**
045         * Use the {@link DefaultTokenFactory} to load the default api token
046         */
047        public TmblrPhotoConsumer() {
048                this(DefaultTokenFactory.get(TumblrAPIToken.class));
049        }
050
051        /**
052         * Construct with the given api token
053         *
054         * @param token
055         *            the api token
056         */
057        public TmblrPhotoConsumer(TumblrAPIToken token) {
058                this.token = token;
059        }
060
061        @Override
062        public boolean canConsume(URL url) {
063                // http://tmblr.co/ZoH2IyP4lDVD
064                return (url.getHost().equals("tmblr.co") || url.getHost().endsWith("tumblr.com"))
065                                && !url.getHost().contains("media");
066        }
067
068        String tumblrAPICall = "http://api.tumblr.com/v2/blog/derekg.org/posts?id=%s&api_key=%s";
069
070        @SuppressWarnings("unchecked")
071        @Override
072        public List<URL> consume(URL url) {
073                // construct the actual tumblr address
074                try {
075                        final List<URL> images = new ArrayList<URL>();
076                        final String postID = getPostID(url);
077                        if (postID == null)
078                                return images;
079                        // NOW call the tumblrAPI
080                        final String tmblrRequest = String.format(tumblrAPICall, postID, token.apikey);
081                        final Map<String, Object> res = gson.fromJson(new InputStreamReader(new URL(tmblrRequest).openConnection()
082                                        .getInputStream()), Map.class);
083
084                        final Map<?, ?> response = (Map<?, ?>) res.get("response");
085                        final Map<?, ?> posts = (Map<?, ?>) ((List<?>) response.get("posts")).get(0);
086                        final List<Map<?, ?>> photos = ((List<Map<?, ?>>) posts.get("photos"));
087                        if (photos == null)
088                                return null;
089
090                        for (final Map<?, ?> photo : photos) {
091                                final String photoURLStr = (String) ((Map<String, Object>) photo.get("original_size")).get("url");
092                                final URL photoURL = new URL(photoURLStr);
093                                images.add(photoURL);
094                        }
095
096                        return images;
097                } catch (final Throwable e) {
098                        return null;
099                }
100        }
101
102        /**
103         * handles the variety of ways a tumblr addresses can be forwarded to
104         *
105         * @param url
106         * @return
107         * @throws IOException
108         */
109        private String getPostID(URL url) throws IOException {
110                final String host = url.getHost();
111                URL loc = url;
112
113                if (host.equals("tmblr.co") || host.equals("tumblr.com") || host.equals("www.tumblr.com")) {
114                        URL forwardURL = null;
115                        if (url.getHost().equals("tmblr.co")) {
116                                final String tumblrCode = url.getPath();
117                                forwardURL = new URL("http://www.tumblr.com" + tumblrCode);
118                        }
119                        else {
120                                forwardURL = url;
121                        }
122                        // now get the location header
123                        final HttpURLConnection con = (HttpURLConnection) forwardURL.openConnection();
124                        con.setInstanceFollowRedirects(false);
125                        final String locStr = con.getHeaderField("Location");
126                        loc = new URL(locStr);
127                        con.disconnect();
128                }
129
130                // Now extract the post ID from the actual tumblr address
131                final String[] parts = loc.getPath().split("[/]");
132                String postID = null;
133                for (int i = 0; i < parts.length; i++) {
134                        if (parts[i].equals("post")) {
135                                postID = parts[i + 1];
136                                break;
137                        }
138                }
139                return postID;
140        }
141
142}