/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */ 030package org.openimaj.hadoop.tools.downloader; 031 032import java.io.IOException; 033import java.net.URI; 034 035import org.apache.hadoop.conf.Configuration; 036import org.apache.hadoop.fs.FileSystem; 037import org.apache.hadoop.fs.Path; 038import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper; 039import org.kohsuke.args4j.CmdLineException; 040import org.kohsuke.args4j.CmdLineParser; 041import org.kohsuke.args4j.Option; 042import org.kohsuke.args4j.ProxyOptionHandler; 043import org.openimaj.hadoop.sequencefile.SequenceFileUtility; 044import org.openimaj.hadoop.tools.downloader.InputMode.Parser; 045 046/** 047 * Command-line options for the downloader tool 048 * 049 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 050 */ 051public class HadoopDownloaderOptions { 052 private String[] args; 053 054 @Option(name = "--input", aliases = "-i", required = true, usage = "Input file or URL.", metaVar = "STRING") 055 private String input; 056 057 @Option(name = "--output", aliases = "-o", required = true, usage = "Output file or URL.", metaVar = "STRING") 058 private String output; 059 060 @Option( 061 name = "--remove", 062 aliases = "-rm", 063 required = false, 064 usage = "Remove the existing output location if it exists.", 065 metaVar = "BOOLEAN") 066 private boolean replace = false; 067 068 @Option( 069 name = "--num-reducers", 070 aliases = "-nr", 071 required = false, 072 usage = "Number of reducers. Controls the number of sequencefile parts created.") 073 private int nreducers = 0; 074 075 @Option( 076 name = "--num-threads", 077 aliases = "-nt", 078 required = false, 079 usage = "Number of mapper threads. 
If > 1, then a multithreaded mapper will be used.") 080 private int nThreads = 1; 081 082 @Option( 083 name = "--input-mode", 084 aliases = "-m", 085 required = false, 086 usage = "How should the URLs be processed to be downloaded.", 087 handler = ProxyOptionHandler.class) 088 private InputMode inputMode = InputMode.PLAIN; 089 private Parser inputModeOp; 090 091 @Option( 092 name = "--sleep", 093 aliases = "-s", 094 required = false, 095 usage = "Time in milliseconds to sleep after downloading a file.", 096 metaVar = "LONG") 097 private long sleep = 0; 098 099 @Option(name = "--follow-redirects", aliases = "-f", usage = "Follow URL redirections", required = false) 100 private boolean followRedirects = false; 101 102 @Option(name = "--log-failures", aliases = "-l", usage = "Log failed records to a file", required = false) 103 private boolean writeFailures = false; 104 105 /** 106 * Construct with the given arguments 107 * 108 * @param args 109 * the arguments 110 */ 111 public HadoopDownloaderOptions(String[] args) { 112 this.args = args; 113 } 114 115 /** 116 * Prepare the options 117 * 118 * @param initial 119 * true if initial setup is being performed; false if inside the 120 * mapper 121 */ 122 public void prepare(boolean initial) { 123 final CmdLineParser parser = new CmdLineParser(this); 124 try { 125 parser.parseArgument(args); 126 this.validate(initial); 127 } catch (final CmdLineException e) { 128 System.err.println(e.getMessage()); 129 System.err.println("Usage: hadoop -jar HadoopImageDownloader [options...] 
[files...]"); 130 parser.printUsage(System.err); 131 132 System.exit(1); 133 } 134 } 135 136 private void validate(boolean initial) { 137 if (replace && initial) { 138 try { 139 final URI outuri = SequenceFileUtility.convertToURI(output); 140 141 final FileSystem fs = SequenceFileUtility.getFileSystem(outuri, new Configuration()); 142 143 fs.delete(new Path(outuri.toString()), true); 144 } catch (final IOException e) { 145 146 } 147 } 148 } 149 150 /** 151 * Get the input file(s) containing the URLs 152 * 153 * @return the input paths 154 * @throws IOException 155 */ 156 public Path[] getInputPaths() throws IOException { 157 return SequenceFileUtility.getFilePaths(input, "part"); 158 } 159 160 /** 161 * @return the output file location 162 */ 163 public Path getOutputPath() { 164 return new Path(SequenceFileUtility.convertToURI(output).toString()); 165 } 166 167 /** 168 * @return the number of reducers 169 */ 170 public int getNumberOfReducers() { 171 return this.nreducers; 172 } 173 174 /** 175 * @return the {@link Parser} corresponding to the selected mode. 176 */ 177 public Parser getInputParser() { 178 return inputModeOp; 179 } 180 181 /** 182 * @return the time in milliseconds to sleep after downloading a file 183 */ 184 public long getSleep() { 185 return sleep; 186 } 187 188 /** 189 * @return true if redirects should be followed; false otherwise 190 */ 191 public boolean followRedirects() { 192 return followRedirects; 193 } 194 195 /** 196 * @return true if failed records should be logged to a file 197 */ 198 public boolean writeFailures() { 199 return writeFailures; 200 } 201 202 /** 203 * Get the number of threads to use in the mapper. If >1 a 204 * {@link MultithreadedMapper} will be used. 205 * 206 * @return number of threads to use 207 */ 208 public int getNumberOfThreads() { 209 return nThreads; 210 } 211}