/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.hadoop.tools.downloader;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.io.MD5Hash;
import org.kohsuke.args4j.CmdLineOptionsProvider;
import org.kohsuke.args4j.Option;
import org.openimaj.util.pair.IndependentPair;

/**
 * Different types of input file formats.
 * <p>
 * Each mode supplies, through {@link #getOptions()}, a fresh {@link Parser}
 * that turns one input record into a key and a list of candidate URLs. Any
 * mode-specific command-line options are declared on that returned
 * {@link Parser}.
 *
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 */
public enum InputMode implements CmdLineOptionsProvider {
    /**
     * Plain list-of-urls file. One URL per line.
     *
     * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
     *
     */
    PLAIN {
        @Override
        public Parser getOptions() {
            return new Parser() {
                // Declared on the returned options object (as the other modes
                // do) rather than on the enum constant, so the flag is not
                // mutable state shared through the enum singleton.
                @Option(name = "-hash-keys", usage = "use the MD5SUM of the URL as the key, rather than the URL itself.")
                private boolean hashKeys = false;

                @Override
                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
                    final String key = hashKeys ? MD5Hash.digest(data).toString() : data;

                    final List<URL> value = new ArrayList<URL>();
                    value.add(new URL(data));

                    return new IndependentPair<String, List<URL>>(key, value);
                }
            };
        }
    },
    /**
     * List of URLs in the form provided by <a
     * href="http://www.image-net.org">image-net</a>
     *
     * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
     *
     */
    IMAGE_NET {
        @Override
        public Parser getOptions() {
            return new Parser() {
                @Override
                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
                    // we expect a format [id]\t[url] as with the image-net url
                    // set
                    final String[] split = data.split("\t");
                    if (split.length != 2) {
                        throw new RuntimeException("Record is in the wrong format");
                    }

                    final String id = split[0].trim();
                    final String url = split[1].trim();

                    final List<URL> value = new ArrayList<URL>();
                    value.add(new URL(url));

                    return new IndependentPair<String, List<URL>>(id, value);
                }
            };
        }
    },
    /**
     * Wikipedia image URLs dump format. Each record is expected to be of the
     * form {@code [prefix]:[file name]}; one candidate URL is produced per
     * configured base URL.
     *
     * @author Sina Samangooei (ss@ecs.soton.ac.uk)
     *
     */
    WIKIPEDIA_IMAGES_DUMP {
        @Override
        public Parser getOptions() {
            return new Parser() {
                @Option(
                        name = "--wikipedia-baseurl",
                        aliases = "-wbase",
                        required = false,
                        usage = "wikipedia upload files base urls. add many urls to check different locations for each image. defaults to upload.wikimedia.org/wikipedia/commons and upload.wikimedia.org/wikipedia/en",
                        multiValued = true)
                private List<String> wikipediaBase;

                @Override
                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
                    // Fall back to the default locations when no base URL was
                    // given on the command line.
                    if (wikipediaBase == null) {
                        wikipediaBase = new ArrayList<String>();
                        wikipediaBase.add("http://upload.wikimedia.org/wikipedia/commons");
                        wikipediaBase.add("http://upload.wikimedia.org/wikipedia/en");
                    }

                    final String[] split = data.split(":");
                    if (split.length != 2) {
                        throw new RuntimeException("Record is in the wrong format");
                    }

                    // The hashed directory must be derived from the same
                    // underscore-normalised name that appears in the URL;
                    // hashing the raw name would point names containing
                    // spaces at the wrong directory.
                    final String fileName = split[1].replace(" ", "_");
                    final String hash = MD5Hash.digest(fileName).toString();
                    final String dirStructure = String.format("%s/%s", hash.substring(0, 1), hash.substring(0, 2));

                    final List<URL> value = new ArrayList<URL>();
                    for (final String base : wikipediaBase) {
                        final String completeURL = String.format("%s/%s/%s", base, dirStructure, fileName);
                        value.add(new URL(completeURL));
                    }

                    return new IndependentPair<String, List<URL>>(data, value);
                }
            };
        }
    },
    /**
     * Parse urls and keys from a csv record.
     *
     * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
     */
    CSV {
        @Override
        public Parser getOptions() {
            return new CsvParser() {
                // Zero-based index of the field holding the key.
                @Option(name = "--key-field")
                int keyField;
                // Zero-based index of the field holding the URL.
                @Option(name = "--url-field")
                int urlField;

                @Override
                public int getKeyField() {
                    return keyField;
                }

                @Override
                public int getUrlField() {
                    return urlField;
                }
            };
        }
    },
    /**
     * Parse the FlickrCrawler csv file to get the medium url of the image.
     *
     * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
     *
     */
    FLICKR_CSV_MEDIUM {
        @Override
        public Parser getOptions() {
            return new CsvParser() {
                @Override
                public int getKeyField() {
                    return 2;
                }

                @Override
                public int getUrlField() {
                    return 5;
                }
            };
        }
    };

    /**
     * Create the parser for this input mode. A new instance is returned on
     * every call.
     *
     * @return the parser (and option holder) for this mode
     */
    @Override
    public abstract Parser getOptions();

    /**
     * Options for the {@link InputMode}
     *
     * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
     */
    public static abstract class Parser {
        /**
         * Parse a record into a key and list of potential URLs. In most cases
         * there will only be a single potential URL in the list. The downloader
         * will work through the list until it finds a working URL, or exhausts
         * its options.
         *
         * @param data
         *            the data record from the input file
         * @return the key and potential URLs
         * @throws Exception
         *             if an error occurs
         */
        public abstract IndependentPair<String, List<URL>> parse(String data) throws Exception;
    }

    /**
     * Base parser for comma-separated records. Subclasses say which
     * (zero-based) fields hold the key and the URL.
     */
    private static abstract class CsvParser extends Parser {
        /** Matches a comma that is followed by an even number of double quotes, i.e. one outside a quoted field. */
        static final String CSV_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))";

        /** Compiled once; {@link #parse(String)} is invoked for every input record. */
        private static final Pattern CSV_SEPARATOR = Pattern.compile(CSV_REGEX);

        /**
         * @return the zero-based index of the field containing the key
         */
        public abstract int getKeyField();

        /**
         * @return the zero-based index of the field containing the URL
         */
        public abstract int getUrlField();

        /**
         * Split the record into fields and build the key/URL pair from the
         * fields selected by {@link #getKeyField()} and {@link #getUrlField()}.
         *
         * @param data
         *            a single csv record
         * @return the key and a single-element list holding the URL
         * @throws Exception
         *             if the URL field is not a valid URL, or a selected field
         *             index lies outside the record
         */
        @Override
        public IndependentPair<String, List<URL>> parse(String data) throws Exception {
            final String[] parts = CSV_SEPARATOR.split(data);

            final String key = unescapeCSV(parts[getKeyField()]);
            final URL url = new URL(unescapeCSV(parts[getUrlField()]));

            final List<URL> value = new ArrayList<URL>();
            value.add(url);

            return new IndependentPair<String, List<URL>>(key, value);
        }

        /**
         * Remove the surrounding double quotes from a quoted csv field and
         * collapse doubled quotes inside it. Input that is null, shorter than
         * two characters, or not wrapped in double quotes is returned
         * unchanged.
         *
         * @param input
         *            the raw field text
         * @return the unescaped field text
         */
        private static String unescapeCSV(String input) {
            if (input == null || input.length() < 2) {
                return input;
            }

            final int last = input.length() - 1;
            if (input.charAt(0) != '"' || input.charAt(last) != '"') {
                return input;
            }

            // The replace is a no-op unless the field contains a doubled
            // quote, so no separate containment check is needed.
            return input.substring(1, last).replace("\"\"", "\"");
        }
    }
}