001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.sequencefile;
031
032import java.io.IOException;
033import java.net.URI;
034import java.util.ArrayList;
035import java.util.LinkedHashMap;
036import java.util.List;
037import java.util.Map;
038import java.util.Map.Entry;
039import java.util.zip.ZipOutputStream;
040
041import org.apache.hadoop.fs.FileSystem;
042import org.apache.hadoop.fs.Path;
043import org.apache.hadoop.fs.PathFilter;
044import org.apache.hadoop.io.BytesWritable;
045import org.apache.hadoop.io.SequenceFile;
046import org.apache.hadoop.io.Text;
047import org.kohsuke.args4j.Argument;
048import org.kohsuke.args4j.CmdLineException;
049import org.kohsuke.args4j.CmdLineOptionsProvider;
050import org.kohsuke.args4j.CmdLineParser;
051import org.kohsuke.args4j.Option;
052import org.kohsuke.args4j.ProxyOptionHandler;
053import org.openimaj.hadoop.sequencefile.ExtractionState;
054import org.openimaj.hadoop.sequencefile.NamingStrategy;
055import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
056import org.openimaj.hadoop.sequencefile.SequenceFileUtility.KeyProvider;
057import org.openimaj.hadoop.sequencefile.TextBytesSequenceFileUtility;
058
059/**
060 * {@link SequenceFileTool} is a commandline tool for creating, extracting and
061 * inspecting Hadoop {@link SequenceFile}s.
062 *
063 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
064 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
065 */
066public class SequenceFileTool {
067        /**
068         * What to print when getting info
069         *
070         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
071         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
072         *
073         */
074        enum InfoModeOptions {
075                GUID, METADATA, NRECORDS, COMPRESSION_CODEC, COMPRESSION_TYPE;
076        }
077
078        /**
079         * Strategies for key naming
080         *
081         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
082         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
083         *
084         */
085        private enum KeyNameStrategy {
086                MD5UUID {
087                        @Override
088                        public KeyProvider<Text> getKeyProvider() {
089                                return new SequenceFileUtility.MD5UUIDKeyProvider();
090                        }
091                },
092                FILENAME {
093                        @Override
094                        public KeyProvider<Text> getKeyProvider() {
095                                return new SequenceFileUtility.FilenameKeyProvider();
096                        }
097                },
098                RELATIVEPATH {
099                        @Override
100                        public KeyProvider<Text> getKeyProvider() {
101                                return new SequenceFileUtility.RelativePathFilenameKeyProvider();
102                        }
103                },
104                ;
105                public abstract KeyProvider<Text> getKeyProvider();
106        }
107
108        private static abstract class ModeOp {
109                public abstract void execute() throws Exception;
110        }
111
112        private static class InfoMode extends ModeOp {
113                @Option(
114                                name = "--options",
115                                aliases = "-opts",
116                                required = false,
117                                usage = "Choose info type. Defaults to all.",
118                                multiValued = true)
119                private List<InfoModeOptions> options;
120
121                @Argument(required = true, usage = "Sequence file", metaVar = "input-path-or-uri")
122                private String inputPathOrUri;
123
124                @Override
125                public void execute() throws Exception {
126                        final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(inputPathOrUri,
127                                        true);
128
129                        if (options == null) {
130                                options = new ArrayList<InfoModeOptions>();
131                                for (final InfoModeOptions o : InfoModeOptions.values())
132                                        options.add(o);
133                        }
134
135                        if (options.contains(InfoModeOptions.GUID) && !options.contains(InfoModeOptions.METADATA)) {
136                                System.out.println("UUID: " + utility.getUUID());
137                        }
138
139                        if (options.contains(InfoModeOptions.METADATA)) {
140                                final Map<Text, Text> metadata = utility.getMetadata();
141
142                                System.out.println("Metadata:");
143                                for (final Entry<Text, Text> e : metadata.entrySet()) {
144                                        System.out.println(e.getKey() + ": " + e.getValue());
145                                }
146                        }
147
148                        if (options.contains(InfoModeOptions.NRECORDS)) {
149                                System.out.println("NRecords: " + utility.getNumberRecords());
150                        }
151
152                        if (options.contains(InfoModeOptions.COMPRESSION_CODEC)) {
153                                System.out.println("Compression codec: " + utility.getCompressionCodecClass());
154                        }
155
156                        if (options.contains(InfoModeOptions.COMPRESSION_TYPE)) {
157                                System.out.println("Compression type: " + utility.getCompressionType());
158                        }
159                }
160        }
161
162        private static class CreateMode extends ModeOp {
163                @Option(
164                                name = "--recursive",
165                                aliases = "-R",
166                                required = false,
167                                usage = "Recurse into directories inside input directories")
168                boolean recurse = false;
169
170                @Option(name = "--key-name-strategy", aliases = "-kns", required = false, usage = "Strategy for naming keys")
171                KeyNameStrategy strategy = KeyNameStrategy.FILENAME;
172
173                @Option(name = "--output", aliases = "-o", required = false, usage = "Output directory (path or uri).")
174                String outputPathOrUri = "./";
175
176                @Option(
177                                name = "--output-name",
178                                aliases = "-name",
179                                required = false,
180                                usage = "Output filename. Defaults to <uuid>.seq.")
181                String outputName;
182
183                @Option(
184                                name = "--write-map",
185                                aliases = "-wm",
186                                required = false,
187                                usage = "Write uuid -> filename map to a file. File is saved in output directory as <name>-map.txt.")
188                boolean writeFilename2IDMap = false;
189
190                @Option(name = "--print-map", aliases = "-pm", required = false, usage = "Print uuid -> filename map.")
191                boolean printFilename2IDMap = false;
192
193                @Option(
194                                name = "--filename-regex",
195                                aliases = "-fnr",
196                                required = false,
197                                usage = "Regular expressions that file names must match to be added.")
198                String filenameRegex = null;
199
200                @Argument(usage = "input files", multiValued = true, required = true, metaVar = "input-paths-or-uris")
201                List<String> inputs = null;
202
203                @Override
204                public void execute() throws Exception {
205                        if (outputName != null) {
206                                if (!outputPathOrUri.endsWith("/"))
207                                        outputPathOrUri += "/";
208                                outputPathOrUri += outputName;
209                        }
210
211                        final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(outputPathOrUri,
212                                        false);
213                        final Map<Path, Text> map = new LinkedHashMap<Path, Text>();
214
215                        for (final String input : inputs) {
216                                final URI uri = SequenceFileUtility.convertToURI(input);
217                                final FileSystem fs = utility.getFileSystem(uri);
218                                final Path path = utility.getPath(uri);
219
220                                PathFilter pathFilter = null;
221                                if (filenameRegex != null) {
222                                        pathFilter = new RegexPathFilter(filenameRegex);
223                                }
224
225                                map.putAll(utility.appendFiles(fs, path, recurse, pathFilter, strategy.getKeyProvider()));
226                        }
227
228                        if (writeFilename2IDMap) {
229                                utility.writePathMap(map);
230                        }
231
232                        if (printFilename2IDMap) {
233                                for (final Entry<Path, Text> e : map.entrySet()) {
234                                        System.out.println(e.getValue() + " " + e.getKey());
235                                }
236                        }
237
238                        utility.close();
239                        System.err.println("Created " + utility.getSequenceFilePath());
240                }
241        }
242
243        private static class ExtractMode extends ModeOp {
244                @Option(name = "--output", aliases = "-o", required = false, usage = "Output directory (path or uri).")
245                String outputPathOrUri;
246
247                @Option(
248                                name = "--key",
249                                aliases = "-k",
250                                required = false,
251                                usage = "Key of file to extract. By default if this is not provided, all files are extracted.")
252                String queryKey;
253
254                @Option(name = "--offset", required = false, usage = "Offset from which to start extract")
255                long offset;
256
257                @Option(
258                                name = "--name-policy",
259                                aliases = "-n",
260                                handler = ProxyOptionHandler.class,
261                                required = false,
262                                usage = "Select the naming policy of outputed files")
263                NamingStrategy np = NamingStrategy.KEY;
264
265                @Option(
266                                name = "--random-select",
267                                aliases = "-r",
268                                required = false,
269                                usage = "Randomly select a subset of input of this size")
270                int random = -1;
271
272                @Option(
273                                name = "--extract-max",
274                                aliases = "-max",
275                                required = false,
276                                usage = "Randomly select a subset of input of this size")
277                int max = -1;
278
279                @Option(
280                                name = "--auto-extension",
281                                aliases = "-ae",
282                                required = false,
283                                usage = "Automatically extract the filetype and append its appropriate extension")
284                boolean autoExtension = false;
285
286                @Argument(required = true, usage = "Sequence file", metaVar = "input-path-or-uri")
287                private String inputPathOrUri;
288
289                @Option(name = "-zip", required = false, usage = "Extract to zip")
290                private boolean zipMode = false;
291
292                @Override
293                public void execute() throws IOException {
294                        if (offset < 0)
295                                throw new IllegalArgumentException("Offset cannot be less than 0.");
296
297                        System.out.println("Getting file paths...");
298
299                        final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(inputPathOrUri, "part");
300                        final ExtractionState nps = new ExtractionState();
301                        nps.setMaxFileExtract(max);
302
303                        if (random >= 0) {
304                                System.out.println("Counting records");
305
306                                int totalRecords = 0;
307                                for (final Path path : sequenceFiles) {
308                                        System.out.println("... Counting from file: " + path);
309                                        final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(
310                                                        path.toUri(), true);
311                                        totalRecords += utility.getNumberRecords();
312                                }
313
314                                System.out.println("Selecting random subset of " + random + " from " + totalRecords);
315
316                                nps.setRandomSelection(random, totalRecords);
317                        }
318
319                        ZipOutputStream zos = null;
320                        if (zipMode) {
321                                zos = SequenceFileUtility.openZipOutputStream(outputPathOrUri);
322                        }
323
324                        for (final Path path : sequenceFiles) {
325                                System.out.println("Extracting from " + path.getName());
326
327                                final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(path.toUri(),
328                                                true);
329                                if (queryKey == null) {
330                                        if (zipMode) {
331                                                utility.exportDataToZip(zos, np, nps, autoExtension, offset);
332                                        } else {
333                                                utility.exportData(outputPathOrUri, np, nps, autoExtension, offset);
334                                        }
335                                } else {
336                                        if (zipMode) {
337                                                throw new UnsupportedOperationException("Not implemented yet");
338                                        } else {
339                                                if (!utility.findAndExport(new Text(queryKey), outputPathOrUri, offset)) {
340                                                        if (offset == 0)
341                                                                System.err.format("Key '%s' was not found in the file.\n", queryKey);
342                                                        else
343                                                                System.err.format("Key '%s' was not found in the file after offset %d.\n", queryKey,
344                                                                                offset);
345                                                }
346                                        }
347                                }
348
349                                if (nps.isFinished())
350                                        break;
351                        }
352
353                        if (zos != null)
354                                zos.close();
355                }
356        }
357
358        private static class ListMode extends ModeOp {
359                @Option(
360                                name = "--print-offsets",
361                                aliases = "-po",
362                                required = false,
363                                usage = "Also print the offset of each record")
364                boolean printOffsets = false;
365
366                @Option(
367                                name = "--options",
368                                aliases = "-opts",
369                                required = false,
370                                usage = "Choose options to include per record in order.",
371                                multiValued = true)
372                private final List<ListModeOptions> options = new ArrayList<ListModeOptions>();
373
374                @Option(
375                                name = "--deliminator",
376                                aliases = "-delim",
377                                required = false,
378                                usage = "Choose the per record options deliminator")
379                private final String delim = " ";
380
381                @Argument(required = true, usage = "Sequence file", metaVar = "input-path-or-uri")
382                private String inputPathOrUri;
383
384                @Override
385                public void execute() throws IOException {
386                        final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(inputPathOrUri, "part");
387
388                        for (final Path path : sequenceFiles) {
389                                System.err.println("Outputting from seqfile: " + path);
390                                final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(path.toUri(),
391                                                true);
392
393                                if (options == null) {
394                                        if (printOffsets) {
395                                                for (final Entry<Text, Long> e : utility.listKeysAndOffsets().entrySet())
396                                                        System.out.format("%10d %s\n", e.getValue(), e.getKey().toString());
397                                        } else {
398                                                for (final Text t : utility.listKeys())
399                                                        System.out.println(t.toString());
400                                        }
401                                } else {
402                                        utility.extract(ListModeOptions.listOptionsToExtractPolicy(options), System.out, delim);
403                                }
404                        }
405                }
406        }
407
408        /**
409         * Tool operation modes.
410         *
411         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
412         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
413         */
414        enum Mode implements CmdLineOptionsProvider {
415                INFO {
416                        @Override
417                        public Object getOptions() {
418                                return new InfoMode();
419                        }
420                },
421                CREATE {
422                        @Override
423                        public Object getOptions() {
424                                return new CreateMode();
425                        }
426                },
427                EXTRACT {
428                        @Override
429                        public Object getOptions() {
430                                return new ExtractMode();
431                        }
432                },
433                LIST {
434                        @Override
435                        public Object getOptions() {
436                                return new ListMode();
437                        }
438                };
439        }
440
441        @Option(
442                        name = "--mode",
443                        aliases = "-m",
444                        required = true,
445                        handler = ProxyOptionHandler.class,
446                        usage = "Operation mode")
447        private Mode mode;
448        private ModeOp modeOp;
449
450        /**
451         * Execute the tool in the mode set through the commandline options
452         *
453         * @throws Exception
454         *             if an error occurs
455         */
456        public void execute() throws Exception {
457                modeOp.execute();
458        }
459
460        /**
461         * Tool main method.
462         *
463         * @param args
464         *            the tool arguments
465         * @throws Exception
466         *             if an error occurs
467         */
468        public static void main(String[] args) throws Exception {
469                final SequenceFileTool options = new SequenceFileTool();
470                final CmdLineParser parser = new CmdLineParser(options);
471
472                try {
473                        parser.parseArgument(args);
474                } catch (final CmdLineException e) {
475                        System.err.println(e.getMessage());
476                        System.err.println("Usage: java -jar SequenceFileTool.jar [options...]");
477                        parser.printUsage(System.err);
478
479                        if (options.mode == null) {
480                                for (final Mode m : Mode.values()) {
481                                        System.err.println();
482                                        System.err.println(m + " options: ");
483                                        new CmdLineParser(m.getOptions()).printUsage(System.err);
484                                }
485                        }
486                        return;
487                }
488
489                options.execute();
490        }
491}