001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.io; 031 032import java.io.ByteArrayInputStream; 033import java.io.ByteArrayOutputStream; 034import java.io.IOException; 035import java.io.InputStream; 036import java.net.HttpURLConnection; 037import java.net.MalformedURLException; 038import java.net.URISyntaxException; 039import java.net.URL; 040import java.util.regex.Matcher; 041import java.util.regex.Pattern; 042 043import org.apache.http.Header; 044import org.apache.http.HttpEntity; 045import org.apache.http.HttpHost; 046import org.apache.http.HttpRequest; 047import org.apache.http.HttpResponse; 048import org.apache.http.ProtocolException; 049import org.apache.http.client.RedirectStrategy; 050import org.apache.http.client.methods.HttpGet; 051import org.apache.http.client.methods.HttpHead; 052import org.apache.http.client.methods.HttpUriRequest; 053import org.apache.http.client.params.HttpClientParams; 054import org.apache.http.entity.BufferedHttpEntity; 055import org.apache.http.impl.client.DefaultHttpClient; 056import org.apache.http.impl.client.DefaultRedirectStrategy; 057import org.apache.http.params.BasicHttpParams; 058import org.apache.http.params.HttpConnectionParams; 059import org.apache.http.params.HttpParams; 060import org.apache.http.params.HttpProtocolParams; 061import org.apache.http.protocol.HttpContext; 062import org.jsoup.Jsoup; 063import org.jsoup.nodes.Document; 064import org.jsoup.select.Elements; 065import org.openimaj.util.pair.IndependentPair; 066 067/** 068 * HTTP(S) download utilities, with support for HTTP redirects and meta refresh 069 * redirection. 070 * 071 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 072 */ 073public class HttpUtils { 074 075 /** 076 * The default user-agent string 077 */ 078 public static final String DEFAULT_USERAGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ru; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)"; 079 080 private HttpUtils() { 081 } 082 083 /** 084 * Read the contents of the given {@link URL} as an array of bytes. 085 * Redirects are followed automatically. 086 * 087 * @param u 088 * the URL to read from 089 * @return the content referenced by the URL 090 * @throws IOException 091 * if an error occurs 092 * @throws IllegalArgumentException 093 * if the URL is not an HTTP(s) URL 094 */ 095 public static byte[] readURLAsBytes(URL u) throws IOException { 096 return readURLAsBytes(u, true); 097 } 098 099 /** 100 * Read the contents of the given {@link URL} as an array of bytes. If 101 * redirects are not being followed, then the result will be null if the URL 102 * is redirected. 103 * 104 * @param u 105 * the URL to read from 106 * @param followRedirects 107 * should redirects be followed? 108 * @return the content referenced by the URL 109 * @throws IOException 110 * if an error occurs 111 * @throws IllegalArgumentException 112 * if the URL is not an HTTP(s) URL 113 */ 114 public static byte[] readURLAsBytes(URL u, boolean followRedirects) throws IOException { 115 final InputStream stream = readURLAsStream(u, followRedirects); 116 if (stream == null) 117 return null; 118 119 try { 120 return org.apache.commons.io.IOUtils.toByteArray(stream); 121 } finally { 122 if (stream != null) 123 stream.close(); 124 } 125 } 126 127 /** 128 * A {@link RedirectStrategy} that can deal with meta-refresh style 129 * redirection 130 * 131 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 132 * 133 */ 134 public static class MetaRefreshRedirectStrategy extends DefaultRedirectStrategy { 135 private static final String METAREFRESH_LOCATION = "METAREFRESH_LOCATION"; 136 137 @Override 138 public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context) 139 throws ProtocolException 140 { 141 final boolean isRedirect = super.isRedirected(request, response, context); 142 context.setAttribute(METAREFRESH_LOCATION, null); 143 if (!isRedirect) { 144 // Consume and buffer the entity, set the entity 145 HttpEntity entity = null; 146 try { 147 entity = response.getEntity(); 148 if (!entity.isRepeatable()) 149 { 150 entity = new BufferedHttpEntity(response.getEntity()); 151 response.setEntity(entity); // Set the entity! 152 } 153 final HttpHost host = (HttpHost) context.getAttribute("http.target_host"); 154 final URL url = new URL(host.toURI()); 155 156 final Header encodingObj = entity.getContentEncoding(); 157 String encoding = null; 158 if (encodingObj == null) { 159 encoding = "UTF-8"; 160 } 161 else { 162 encoding = encodingObj.getValue(); 163 if (encoding == null) { 164 encoding = "UTF-8"; 165 } 166 } 167 final URL u = checkRedirects(url, FileUtils.readall(entity.getContent(), encoding)); 168 if (u != null) { 169 // set the location so it doesn't have to be read again 170 context.setAttribute(METAREFRESH_LOCATION, u); 171 return true; 172 } 173 174 } catch (final IOException e) { 175 return false; 176 } 177 } 178 return isRedirect; 179 } 180 181 @Override 182 public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) 183 throws ProtocolException 184 { 185 final URL metarefresh = (URL) context.getAttribute(METAREFRESH_LOCATION); 186 if (metarefresh == null) { 187 return super.getRedirect(request, response, context); 188 } 189 190 final String method = request.getRequestLine().getMethod(); 191 try { 192 if (method.equalsIgnoreCase(HttpHead.METHOD_NAME)) { 193 return new HttpHead(metarefresh.toURI()); 194 } else { 195 return new HttpGet(metarefresh.toURI()); 196 } 197 } catch (final URISyntaxException e) { 198 return super.getRedirect(request, response, context); 199 } 200 } 201 } 202 203 /** 204 * Read the contents of the given {@link URL} as a 205 * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an 206 * {@link InputStream}). If redirects are not being followed, then the 207 * result will be null if the URL is redirected. 208 * 209 * @param u 210 * the URL to read from 211 * @param followRedirects 212 * should redirects be followed? 213 * @return the content referenced by the URL 214 * @throws IOException 215 * if an error occurs 216 * @throws IllegalArgumentException 217 * if the URL is not an HTTP(s) URL 218 */ 219 public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u, 220 boolean followRedirects) throws IOException 221 { 222 return readURLAsByteArrayInputStream(u, 15000, 15000, followRedirects ? new MetaRefreshRedirectStrategy() : null, 223 DEFAULT_USERAGENT); 224 } 225 226 /** 227 * Read the contents of the given {@link URL} as a 228 * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an 229 * {@link InputStream}). If redirects are not being followed, then the 230 * result will be null if the URL is redirected. 231 * 232 * @param u 233 * the URL to read from 234 * @param strategy 235 * how redirects should be followed 236 * @return the content referenced by the URL 237 * @throws IOException 238 * if an error occurs 239 * @throws IllegalArgumentException 240 * if the URL is not an HTTP(s) URL 241 */ 242 public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u, 243 RedirectStrategy strategy) throws IOException 244 { 245 return readURLAsByteArrayInputStream(u, 15000, 15000, strategy, DEFAULT_USERAGENT); 246 } 247 248 /** 249 * Read the contents of the given {@link URL} as a 250 * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an 251 * {@link InputStream}). If redirects are not being followed, then the 252 * result will be null if the URL is redirected. 253 * 254 * @param url 255 * the URL to read from 256 * @param connectionTimeout 257 * amount of time to wait for connection 258 * @param readTimeout 259 * amount of time to wait for reading 260 * @param redirectStrategy 261 * the redirection strategy 262 * @param userAgent 263 * the useragent string 264 * @return the content referenced by the URL 265 * @throws IOException 266 * if an error occurs 267 * @throws IllegalArgumentException 268 * if the URL is not an HTTP(s) URL 269 */ 270 public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL url, 271 int connectionTimeout, int readTimeout, RedirectStrategy redirectStrategy, String userAgent) 272 throws IOException 273 { 274 DefaultHttpClient c = null; 275 try { 276 final HttpParams params = new BasicHttpParams(); 277 HttpConnectionParams.setConnectionTimeout(params, connectionTimeout); 278 HttpConnectionParams.setSoTimeout(params, readTimeout); 279 HttpProtocolParams.setUserAgent(params, userAgent); 280 HttpClientParams.setRedirecting(params, redirectStrategy != null); 281 final boolean followRedirects = redirectStrategy != null; 282 c = new DefaultHttpClient(params); 283 if (followRedirects) 284 c.setRedirectStrategy(redirectStrategy); 285 HttpResponse resp = null; 286 try { 287 resp = c.execute(new HttpGet(url.toURI())); 288 } catch (final URISyntaxException e) { 289 throw new IOException(e); 290 } 291 292 final ByteArrayOutputStream outStream = new ByteArrayOutputStream(); 293 final InputStream stream = resp.getEntity().getContent(); 294 final byte[] tempBuffer = new byte[1024]; 295 296 // read the rest! 297 while (true) { 298 final int readThisTime = stream.read(tempBuffer); 299 if (readThisTime == -1) { 300 break; 301 } 302 // write to the outStream 303 outStream.write(tempBuffer, 0, readThisTime); 304 } 305 final IndependentPair<HttpEntity, ByteArrayInputStream> toRet = IndependentPair.pair(resp.getEntity(), 306 new ByteArrayInputStream(outStream.toByteArray())); 307 ; 308 return toRet; 309 } finally { 310 if (c != null) 311 c.getConnectionManager().shutdown(); 312 } 313 314 } 315 316 /** 317 * Open an {@link HttpURLConnection} to the {@link URL} as an array of 318 * bytes. Redirects are followed automatically. 319 * 320 * @param url 321 * the URL to read from 322 * @return the content referenced by the URL 323 * @throws IOException 324 * if an error occurs 325 * @throws IllegalArgumentException 326 * if the URL is not an HTTP(s) URL 327 */ 328 public static InputStream readURL(URL url) throws IOException { 329 return readURLAsByteArrayInputStream(url, 15000, 15000, new MetaRefreshRedirectStrategy(), DEFAULT_USERAGENT) 330 .getSecondObject(); 331 } 332 333 /** 334 * Open an {@link HttpURLConnection} to the {@link URL} as an array of 335 * bytes. 336 * 337 * @param url 338 * the URL to read from 339 * @param followRedirects 340 * should redirects be followed? 341 * @return the content referenced by the URL 342 * @throws IOException 343 * if an error occurs 344 * @throws IllegalArgumentException 345 * if the URL is not an HTTP(s) URL 346 */ 347 public static InputStream readURL(URL url, boolean followRedirects) throws IOException { 348 return readURLAsByteArrayInputStream(url, 15000, 15000, 349 followRedirects ? new MetaRefreshRedirectStrategy() : null, DEFAULT_USERAGENT).getSecondObject(); 350 } 351 352 private static URL searchMetaRefresh(URL base, String html) throws MalformedURLException { 353 final Document doc = Jsoup.parse(html); 354 355 final Elements tags = doc.select("meta[http-equiv=refresh]"); 356 if (tags != null && tags.size() > 0) { 357 final String content = tags.first().attr("content"); 358 359 final Pattern pattern = Pattern.compile("\\d+\\;url\\=(.*)", Pattern.CASE_INSENSITIVE); 360 final Matcher matcher = pattern.matcher(content); 361 if (matcher.find()) { 362 final String url = matcher.group(1); 363 364 URL toRet = null; 365 if (url.contains("://")) { 366 toRet = new URL(url); 367 } 368 { 369 toRet = new URL(base, url); 370 } 371 // A legitimate use of http-refresh was to refresh the current 372 // page 373 // this would result in a horrible loop 374 if (!toRet.equals(base)) { 375 return toRet; 376 } 377 } 378 } 379 380 return null; 381 } 382 383 private static URL checkRedirects(URL base, String html) throws IOException { 384 final URL u = searchMetaRefresh(base, html); 385 386 // potentially add more checks here for things 387 // like JS refresh 388 389 return u; 390 } 391 392 /** 393 * Open a {@link InputStream} to the contents referenced by the {@link URL}. 394 * Redirects are followed automatically. 395 * 396 * @param url 397 * the URL to read from 398 * @return the content referenced by the URL 399 * @throws IOException 400 * if an error occurs 401 * @throws IllegalArgumentException 402 * if the URL is not an HTTP(s) URL 403 */ 404 public static InputStream readURLAsStream(URL url) throws IOException { 405 return readURL(url); 406 } 407 408 /** 409 * Open a {@link InputStream} to the contents referenced by the {@link URL}. 410 * If redirects are not being followed, then the result will be null if the 411 * URL is redirected. 412 * 413 * @param url 414 * the URL to read from 415 * @param followRedirects 416 * should redirects be followed. 417 * @return the content referenced by the URL 418 * @throws IOException 419 * if an error occurs 420 * @throws IllegalArgumentException 421 * if the URL is not an HTTP(s) URL 422 */ 423 public static InputStream readURLAsStream(URL url, boolean followRedirects) throws IOException { 424 final InputStream conn = readURL(url, followRedirects); 425 426 return conn; 427 } 428 429 /** 430 * Read the internal state of an object from the given URL. 431 * 432 * @param <T> 433 * Type of object being read. 434 * 435 * @param url 436 * the URL to read from 437 * @param obj 438 * the object to fill 439 * @return the content referenced by the URL 440 * @throws IOException 441 * if an error occurs 442 * @throws IllegalArgumentException 443 * if the URL is not an HTTP(s) URL 444 */ 445 public static <T extends InternalReadable> T readURL(URL url, T obj) throws IOException { 446 final InputStream stream = readURLAsStream(url); 447 448 try { 449 return IOUtils.read(stream, obj); 450 } finally { 451 if (stream != null) 452 stream.close(); 453 } 454 } 455 456 /** 457 * Read the an object from the given URL. 458 * 459 * @param <T> 460 * Type of object being read. 461 * 462 * @param url 463 * the URL to read from 464 * @param clz 465 * the class of the object to read 466 * @return the content referenced by the URL 467 * @throws IOException 468 * if an error occurs 469 * @throws IllegalArgumentException 470 * if the URL is not an HTTP(s) URL 471 */ 472 public static <T extends InternalReadable> T readURL(URL url, Class<? extends T> clz) throws IOException { 473 final InputStream stream = readURLAsStream(url); 474 475 try { 476 return IOUtils.read(stream, clz); 477 } finally { 478 if (stream != null) 479 stream.close(); 480 } 481 } 482 483 /** 484 * Read the an object from the given URL. 485 * 486 * @param <T> 487 * Type of object being read. 488 * @param <Q> 489 * Type of the object reader. 490 * 491 * @param url 492 * the URL to read from 493 * @param reader 494 * the reader that creates the object. 495 * @return the content referenced by the URL 496 * @throws IOException 497 * if an error occurs 498 * @throws IllegalArgumentException 499 * if the URL is not an HTTP(s) URL 500 */ 501 public static <T, Q extends InputStreamObjectReader<T>> T readURL(URL url, Q reader) throws IOException { 502 final InputStream stream = readURLAsStream(url); 503 504 try { 505 return reader.read(stream); 506 } finally { 507 if (stream != null) 508 stream.close(); 509 } 510 } 511}