001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.downloader;
031
032import java.io.IOException;
033import java.net.URI;
034
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.fs.FileSystem;
037import org.apache.hadoop.fs.Path;
038import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
039import org.kohsuke.args4j.CmdLineException;
040import org.kohsuke.args4j.CmdLineParser;
041import org.kohsuke.args4j.Option;
042import org.kohsuke.args4j.ProxyOptionHandler;
043import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
044import org.openimaj.hadoop.tools.downloader.InputMode.Parser;
045
046/**
047 * Command-line options for the downloader tool
048 *
049 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
050 */
051public class HadoopDownloaderOptions {
052        private String[] args;
053
054        @Option(name = "--input", aliases = "-i", required = true, usage = "Input file or URL.", metaVar = "STRING")
055        private String input;
056
057        @Option(name = "--output", aliases = "-o", required = true, usage = "Output file or URL.", metaVar = "STRING")
058        private String output;
059
060        @Option(
061                        name = "--remove",
062                        aliases = "-rm",
063                        required = false,
064                        usage = "Remove the existing output location if it exists.",
065                        metaVar = "BOOLEAN")
066        private boolean replace = false;
067
068        @Option(
069                        name = "--num-reducers",
070                        aliases = "-nr",
071                        required = false,
072                        usage = "Number of reducers. Controls the number of sequencefile parts created.")
073        private int nreducers = 0;
074
075        @Option(
076                        name = "--num-threads",
077                        aliases = "-nt",
078                        required = false,
079                        usage = "Number of mapper threads. If > 1, then a multithreaded mapper will be used.")
080        private int nThreads = 1;
081
082        @Option(
083                        name = "--input-mode",
084                        aliases = "-m",
085                        required = false,
086                        usage = "How should the URLs be processed to be downloaded.",
087                        handler = ProxyOptionHandler.class)
088        private InputMode inputMode = InputMode.PLAIN;
089        private Parser inputModeOp;
090
091        @Option(
092                        name = "--sleep",
093                        aliases = "-s",
094                        required = false,
095                        usage = "Time in milliseconds to sleep after downloading a file.",
096                        metaVar = "LONG")
097        private long sleep = 0;
098
099        @Option(name = "--follow-redirects", aliases = "-f", usage = "Follow URL redirections", required = false)
100        private boolean followRedirects = false;
101
102        @Option(name = "--log-failures", aliases = "-l", usage = "Log failed records to a file", required = false)
103        private boolean writeFailures = false;
104
105        /**
106         * Construct with the given arguments
107         *
108         * @param args
109         *            the arguments
110         */
111        public HadoopDownloaderOptions(String[] args) {
112                this.args = args;
113        }
114
115        /**
116         * Prepare the options
117         *
118         * @param initial
119         *            true if initial setup is being performed; false if inside the
120         *            mapper
121         */
122        public void prepare(boolean initial) {
123                final CmdLineParser parser = new CmdLineParser(this);
124                try {
125                        parser.parseArgument(args);
126                        this.validate(initial);
127                } catch (final CmdLineException e) {
128                        System.err.println(e.getMessage());
129                        System.err.println("Usage: hadoop -jar HadoopImageDownloader [options...] [files...]");
130                        parser.printUsage(System.err);
131
132                        System.exit(1);
133                }
134        }
135
136        private void validate(boolean initial) {
137                if (replace && initial) {
138                        try {
139                                final URI outuri = SequenceFileUtility.convertToURI(output);
140
141                                final FileSystem fs = SequenceFileUtility.getFileSystem(outuri, new Configuration());
142
143                                fs.delete(new Path(outuri.toString()), true);
144                        } catch (final IOException e) {
145
146                        }
147                }
148        }
149
150        /**
151         * Get the input file(s) containing the URLs
152         *
153         * @return the input paths
154         * @throws IOException
155         */
156        public Path[] getInputPaths() throws IOException {
157                return SequenceFileUtility.getFilePaths(input, "part");
158        }
159
160        /**
161         * @return the output file location
162         */
163        public Path getOutputPath() {
164                return new Path(SequenceFileUtility.convertToURI(output).toString());
165        }
166
167        /**
168         * @return the number of reducers
169         */
170        public int getNumberOfReducers() {
171                return this.nreducers;
172        }
173
174        /**
175         * @return the {@link Parser} corresponding to the selected mode.
176         */
177        public Parser getInputParser() {
178                return inputModeOp;
179        }
180
181        /**
182         * @return the time in milliseconds to sleep after downloading a file
183         */
184        public long getSleep() {
185                return sleep;
186        }
187
188        /**
189         * @return true if redirects should be followed; false otherwise
190         */
191        public boolean followRedirects() {
192                return followRedirects;
193        }
194
195        /**
196         * @return true if failed records should be logged to a file
197         */
198        public boolean writeFailures() {
199                return writeFailures;
200        }
201
202        /**
203         * Get the number of threads to use in the mapper. If >1 a
204         * {@link MultithreadedMapper} will be used.
205         *
206         * @return number of threads to use
207         */
208        public int getNumberOfThreads() {
209                return nThreads;
210        }
211}