001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.io;
031
032import java.io.ByteArrayInputStream;
033import java.io.ByteArrayOutputStream;
034import java.io.IOException;
035import java.io.InputStream;
036import java.net.HttpURLConnection;
037import java.net.MalformedURLException;
038import java.net.URISyntaxException;
039import java.net.URL;
040import java.util.regex.Matcher;
041import java.util.regex.Pattern;
042
043import org.apache.http.Header;
044import org.apache.http.HttpEntity;
045import org.apache.http.HttpHost;
046import org.apache.http.HttpRequest;
047import org.apache.http.HttpResponse;
048import org.apache.http.ProtocolException;
049import org.apache.http.client.RedirectStrategy;
050import org.apache.http.client.methods.HttpGet;
051import org.apache.http.client.methods.HttpHead;
052import org.apache.http.client.methods.HttpUriRequest;
053import org.apache.http.client.params.HttpClientParams;
054import org.apache.http.entity.BufferedHttpEntity;
055import org.apache.http.impl.client.DefaultHttpClient;
056import org.apache.http.impl.client.DefaultRedirectStrategy;
057import org.apache.http.params.BasicHttpParams;
058import org.apache.http.params.HttpConnectionParams;
059import org.apache.http.params.HttpParams;
060import org.apache.http.params.HttpProtocolParams;
061import org.apache.http.protocol.HttpContext;
062import org.jsoup.Jsoup;
063import org.jsoup.nodes.Document;
064import org.jsoup.select.Elements;
065import org.openimaj.util.pair.IndependentPair;
066
067/**
068 * HTTP(S) download utilities, with support for HTTP redirects and meta refresh
069 * redirection.
070 *
071 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
072 */
073public class HttpUtils {
074
075        /**
076         * The default user-agent string
077         */
078        public static final String DEFAULT_USERAGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ru; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)";
079
080        private HttpUtils() {
081        }
082
083        /**
084         * Read the contents of the given {@link URL} as an array of bytes.
085         * Redirects are followed automatically.
086         *
087         * @param u
088         *            the URL to read from
089         * @return the content referenced by the URL
090         * @throws IOException
091         *             if an error occurs
092         * @throws IllegalArgumentException
093         *             if the URL is not an HTTP(s) URL
094         */
095        public static byte[] readURLAsBytes(URL u) throws IOException {
096                return readURLAsBytes(u, true);
097        }
098
099        /**
100         * Read the contents of the given {@link URL} as an array of bytes. If
101         * redirects are not being followed, then the result will be null if the URL
102         * is redirected.
103         *
104         * @param u
105         *            the URL to read from
106         * @param followRedirects
107         *            should redirects be followed?
108         * @return the content referenced by the URL
109         * @throws IOException
110         *             if an error occurs
111         * @throws IllegalArgumentException
112         *             if the URL is not an HTTP(s) URL
113         */
114        public static byte[] readURLAsBytes(URL u, boolean followRedirects) throws IOException {
115                final InputStream stream = readURLAsStream(u, followRedirects);
116                if (stream == null)
117                        return null;
118
119                try {
120                        return org.apache.commons.io.IOUtils.toByteArray(stream);
121                } finally {
122                        if (stream != null)
123                                stream.close();
124                }
125        }
126
127        /**
128         * A {@link RedirectStrategy} that can deal with meta-refresh style
129         * redirection
130         *
131         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
132         *
133         */
134        public static class MetaRefreshRedirectStrategy extends DefaultRedirectStrategy {
135                private static final String METAREFRESH_LOCATION = "METAREFRESH_LOCATION";
136
137                @Override
138                public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context)
139                                throws ProtocolException
140                {
141                        final boolean isRedirect = super.isRedirected(request, response, context);
142                        context.setAttribute(METAREFRESH_LOCATION, null);
143                        if (!isRedirect) {
144                                // Consume and buffer the entity, set the entity
145                                HttpEntity entity = null;
146                                try {
147                                        entity = response.getEntity();
148                                        if (!entity.isRepeatable())
149                                        {
150                                                entity = new BufferedHttpEntity(response.getEntity());
151                                                response.setEntity(entity); // Set the entity!
152                                        }
153                                        final HttpHost host = (HttpHost) context.getAttribute("http.target_host");
154                                        final URL url = new URL(host.toURI());
155
156                                        final Header encodingObj = entity.getContentEncoding();
157                                        String encoding = null;
158                                        if (encodingObj == null) {
159                                                encoding = "UTF-8";
160                                        }
161                                        else {
162                                                encoding = encodingObj.getValue();
163                                                if (encoding == null) {
164                                                        encoding = "UTF-8";
165                                                }
166                                        }
167                                        final URL u = checkRedirects(url, FileUtils.readall(entity.getContent(), encoding));
168                                        if (u != null) {
169                                                // set the location so it doesn't have to be read again
170                                                context.setAttribute(METAREFRESH_LOCATION, u);
171                                                return true;
172                                        }
173
174                                } catch (final IOException e) {
175                                        return false;
176                                }
177                        }
178                        return isRedirect;
179                }
180
181                @Override
182                public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context)
183                                throws ProtocolException
184                {
185                        final URL metarefresh = (URL) context.getAttribute(METAREFRESH_LOCATION);
186                        if (metarefresh == null) {
187                                return super.getRedirect(request, response, context);
188                        }
189
190                        final String method = request.getRequestLine().getMethod();
191                        try {
192                                if (method.equalsIgnoreCase(HttpHead.METHOD_NAME)) {
193                                        return new HttpHead(metarefresh.toURI());
194                                } else {
195                                        return new HttpGet(metarefresh.toURI());
196                                }
197                        } catch (final URISyntaxException e) {
198                                return super.getRedirect(request, response, context);
199                        }
200                }
201        }
202
203        /**
204         * Read the contents of the given {@link URL} as a
205         * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an
206         * {@link InputStream}). If redirects are not being followed, then the
207         * result will be null if the URL is redirected.
208         *
209         * @param u
210         *            the URL to read from
211         * @param followRedirects
212         *            should redirects be followed?
213         * @return the content referenced by the URL
214         * @throws IOException
215         *             if an error occurs
216         * @throws IllegalArgumentException
217         *             if the URL is not an HTTP(s) URL
218         */
219        public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u,
220                        boolean followRedirects) throws IOException
221        {
222                return readURLAsByteArrayInputStream(u, 15000, 15000, followRedirects ? new MetaRefreshRedirectStrategy() : null,
223                                DEFAULT_USERAGENT);
224        }
225
226        /**
227         * Read the contents of the given {@link URL} as a
228         * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an
229         * {@link InputStream}). If redirects are not being followed, then the
230         * result will be null if the URL is redirected.
231         *
232         * @param u
233         *            the URL to read from
234         * @param strategy
235         *            how redirects should be followed
236         * @return the content referenced by the URL
237         * @throws IOException
238         *             if an error occurs
239         * @throws IllegalArgumentException
240         *             if the URL is not an HTTP(s) URL
241         */
242        public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u,
243                        RedirectStrategy strategy) throws IOException
244        {
245                return readURLAsByteArrayInputStream(u, 15000, 15000, strategy, DEFAULT_USERAGENT);
246        }
247
248        /**
249         * Read the contents of the given {@link URL} as a
250         * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an
251         * {@link InputStream}). If redirects are not being followed, then the
252         * result will be null if the URL is redirected.
253         *
254         * @param url
255         *            the URL to read from
256         * @param connectionTimeout
257         *            amount of time to wait for connection
258         * @param readTimeout
259         *            amount of time to wait for reading
260         * @param redirectStrategy
261         *            the redirection strategy
262         * @param userAgent
263         *            the useragent string
264         * @return the content referenced by the URL
265         * @throws IOException
266         *             if an error occurs
267         * @throws IllegalArgumentException
268         *             if the URL is not an HTTP(s) URL
269         */
270        public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL url,
271                        int connectionTimeout, int readTimeout, RedirectStrategy redirectStrategy, String userAgent)
272                        throws IOException
273        {
274                DefaultHttpClient c = null;
275                try {
276                        final HttpParams params = new BasicHttpParams();
277                        HttpConnectionParams.setConnectionTimeout(params, connectionTimeout);
278                        HttpConnectionParams.setSoTimeout(params, readTimeout);
279                        HttpProtocolParams.setUserAgent(params, userAgent);
280                        HttpClientParams.setRedirecting(params, redirectStrategy != null);
281                        final boolean followRedirects = redirectStrategy != null;
282                        c = new DefaultHttpClient(params);
283                        if (followRedirects)
284                                c.setRedirectStrategy(redirectStrategy);
285                        HttpResponse resp = null;
286                        try {
287                                resp = c.execute(new HttpGet(url.toURI()));
288                        } catch (final URISyntaxException e) {
289                                throw new IOException(e);
290                        }
291
292                        final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
293                        final InputStream stream = resp.getEntity().getContent();
294                        final byte[] tempBuffer = new byte[1024];
295
296                        // read the rest!
297                        while (true) {
298                                final int readThisTime = stream.read(tempBuffer);
299                                if (readThisTime == -1) {
300                                        break;
301                                }
302                                // write to the outStream
303                                outStream.write(tempBuffer, 0, readThisTime);
304                        }
305                        final IndependentPair<HttpEntity, ByteArrayInputStream> toRet = IndependentPair.pair(resp.getEntity(),
306                                        new ByteArrayInputStream(outStream.toByteArray()));
307                        ;
308                        return toRet;
309                } finally {
310                        if (c != null)
311                                c.getConnectionManager().shutdown();
312                }
313
314        }
315
316        /**
317         * Open an {@link HttpURLConnection} to the {@link URL} as an array of
318         * bytes. Redirects are followed automatically.
319         *
320         * @param url
321         *            the URL to read from
322         * @return the content referenced by the URL
323         * @throws IOException
324         *             if an error occurs
325         * @throws IllegalArgumentException
326         *             if the URL is not an HTTP(s) URL
327         */
328        public static InputStream readURL(URL url) throws IOException {
329                return readURLAsByteArrayInputStream(url, 15000, 15000, new MetaRefreshRedirectStrategy(), DEFAULT_USERAGENT)
330                                .getSecondObject();
331        }
332
333        /**
334         * Open an {@link HttpURLConnection} to the {@link URL} as an array of
335         * bytes.
336         *
337         * @param url
338         *            the URL to read from
339         * @param followRedirects
340         *            should redirects be followed?
341         * @return the content referenced by the URL
342         * @throws IOException
343         *             if an error occurs
344         * @throws IllegalArgumentException
345         *             if the URL is not an HTTP(s) URL
346         */
347        public static InputStream readURL(URL url, boolean followRedirects) throws IOException {
348                return readURLAsByteArrayInputStream(url, 15000, 15000,
349                                followRedirects ? new MetaRefreshRedirectStrategy() : null, DEFAULT_USERAGENT).getSecondObject();
350        }
351
352        private static URL searchMetaRefresh(URL base, String html) throws MalformedURLException {
353                final Document doc = Jsoup.parse(html);
354
355                final Elements tags = doc.select("meta[http-equiv=refresh]");
356                if (tags != null && tags.size() > 0) {
357                        final String content = tags.first().attr("content");
358
359                        final Pattern pattern = Pattern.compile("\\d+\\;url\\=(.*)", Pattern.CASE_INSENSITIVE);
360                        final Matcher matcher = pattern.matcher(content);
361                        if (matcher.find()) {
362                                final String url = matcher.group(1);
363
364                                URL toRet = null;
365                                if (url.contains("://")) {
366                                        toRet = new URL(url);
367                                }
368                                {
369                                        toRet = new URL(base, url);
370                                }
371                                // A legitimate use of http-refresh was to refresh the current
372                                // page
373                                // this would result in a horrible loop
374                                if (!toRet.equals(base)) {
375                                        return toRet;
376                                }
377                        }
378                }
379
380                return null;
381        }
382
383        private static URL checkRedirects(URL base, String html) throws IOException {
384                final URL u = searchMetaRefresh(base, html);
385
386                // potentially add more checks here for things
387                // like JS refresh
388
389                return u;
390        }
391
392        /**
393         * Open a {@link InputStream} to the contents referenced by the {@link URL}.
394         * Redirects are followed automatically.
395         *
396         * @param url
397         *            the URL to read from
398         * @return the content referenced by the URL
399         * @throws IOException
400         *             if an error occurs
401         * @throws IllegalArgumentException
402         *             if the URL is not an HTTP(s) URL
403         */
404        public static InputStream readURLAsStream(URL url) throws IOException {
405                return readURL(url);
406        }
407
408        /**
409         * Open a {@link InputStream} to the contents referenced by the {@link URL}.
410         * If redirects are not being followed, then the result will be null if the
411         * URL is redirected.
412         *
413         * @param url
414         *            the URL to read from
415         * @param followRedirects
416         *            should redirects be followed.
417         * @return the content referenced by the URL
418         * @throws IOException
419         *             if an error occurs
420         * @throws IllegalArgumentException
421         *             if the URL is not an HTTP(s) URL
422         */
423        public static InputStream readURLAsStream(URL url, boolean followRedirects) throws IOException {
424                final InputStream conn = readURL(url, followRedirects);
425
426                return conn;
427        }
428
429        /**
430         * Read the internal state of an object from the given URL.
431         *
432         * @param <T>
433         *            Type of object being read.
434         *
435         * @param url
436         *            the URL to read from
437         * @param obj
438         *            the object to fill
439         * @return the content referenced by the URL
440         * @throws IOException
441         *             if an error occurs
442         * @throws IllegalArgumentException
443         *             if the URL is not an HTTP(s) URL
444         */
445        public static <T extends InternalReadable> T readURL(URL url, T obj) throws IOException {
446                final InputStream stream = readURLAsStream(url);
447
448                try {
449                        return IOUtils.read(stream, obj);
450                } finally {
451                        if (stream != null)
452                                stream.close();
453                }
454        }
455
456        /**
457         * Read the an object from the given URL.
458         *
459         * @param <T>
460         *            Type of object being read.
461         *
462         * @param url
463         *            the URL to read from
464         * @param clz
465         *            the class of the object to read
466         * @return the content referenced by the URL
467         * @throws IOException
468         *             if an error occurs
469         * @throws IllegalArgumentException
470         *             if the URL is not an HTTP(s) URL
471         */
472        public static <T extends InternalReadable> T readURL(URL url, Class<? extends T> clz) throws IOException {
473                final InputStream stream = readURLAsStream(url);
474
475                try {
476                        return IOUtils.read(stream, clz);
477                } finally {
478                        if (stream != null)
479                                stream.close();
480                }
481        }
482
483        /**
484         * Read the an object from the given URL.
485         *
486         * @param <T>
487         *            Type of object being read.
488         * @param <Q>
489         *            Type of the object reader.
490         *
491         * @param url
492         *            the URL to read from
493         * @param reader
494         *            the reader that creates the object.
495         * @return the content referenced by the URL
496         * @throws IOException
497         *             if an error occurs
498         * @throws IllegalArgumentException
499         *             if the URL is not an HTTP(s) URL
500         */
501        public static <T, Q extends InputStreamObjectReader<T>> T readURL(URL url, Q reader) throws IOException {
502                final InputStream stream = readURLAsStream(url);
503
504                try {
505                        return reader.read(stream);
506                } finally {
507                        if (stream != null)
508                                stream.close();
509                }
510        }
511}