001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.hadoop.tools.sequencefile; 031 032import java.io.IOException; 033import java.net.URI; 034import java.util.ArrayList; 035import java.util.LinkedHashMap; 036import java.util.List; 037import java.util.Map; 038import java.util.Map.Entry; 039import java.util.zip.ZipOutputStream; 040 041import org.apache.hadoop.fs.FileSystem; 042import org.apache.hadoop.fs.Path; 043import org.apache.hadoop.fs.PathFilter; 044import org.apache.hadoop.io.BytesWritable; 045import org.apache.hadoop.io.SequenceFile; 046import org.apache.hadoop.io.Text; 047import org.kohsuke.args4j.Argument; 048import org.kohsuke.args4j.CmdLineException; 049import org.kohsuke.args4j.CmdLineOptionsProvider; 050import org.kohsuke.args4j.CmdLineParser; 051import org.kohsuke.args4j.Option; 052import org.kohsuke.args4j.ProxyOptionHandler; 053import org.openimaj.hadoop.sequencefile.ExtractionState; 054import org.openimaj.hadoop.sequencefile.NamingStrategy; 055import org.openimaj.hadoop.sequencefile.SequenceFileUtility; 056import org.openimaj.hadoop.sequencefile.SequenceFileUtility.KeyProvider; 057import org.openimaj.hadoop.sequencefile.TextBytesSequenceFileUtility; 058 059/** 060 * {@link SequenceFileTool} is a commandline tool for creating, extracting and 061 * inspecting Hadoop {@link SequenceFile}s. 062 * 063 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 064 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 065 */ 066public class SequenceFileTool { 067 /** 068 * What to print when getting info 069 * 070 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 071 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 072 * 073 */ 074 enum InfoModeOptions { 075 GUID, METADATA, NRECORDS, COMPRESSION_CODEC, COMPRESSION_TYPE; 076 } 077 078 /** 079 * Strategies for key naming 080 * 081 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 082 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 083 * 084 */ 085 private enum KeyNameStrategy { 086 MD5UUID { 087 @Override 088 public KeyProvider<Text> getKeyProvider() { 089 return new SequenceFileUtility.MD5UUIDKeyProvider(); 090 } 091 }, 092 FILENAME { 093 @Override 094 public KeyProvider<Text> getKeyProvider() { 095 return new SequenceFileUtility.FilenameKeyProvider(); 096 } 097 }, 098 RELATIVEPATH { 099 @Override 100 public KeyProvider<Text> getKeyProvider() { 101 return new SequenceFileUtility.RelativePathFilenameKeyProvider(); 102 } 103 }, 104 ; 105 public abstract KeyProvider<Text> getKeyProvider(); 106 } 107 108 private static abstract class ModeOp { 109 public abstract void execute() throws Exception; 110 } 111 112 private static class InfoMode extends ModeOp { 113 @Option( 114 name = "--options", 115 aliases = "-opts", 116 required = false, 117 usage = "Choose info type. Defaults to all.", 118 multiValued = true) 119 private List<InfoModeOptions> options; 120 121 @Argument(required = true, usage = "Sequence file", metaVar = "input-path-or-uri") 122 private String inputPathOrUri; 123 124 @Override 125 public void execute() throws Exception { 126 final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(inputPathOrUri, 127 true); 128 129 if (options == null) { 130 options = new ArrayList<InfoModeOptions>(); 131 for (final InfoModeOptions o : InfoModeOptions.values()) 132 options.add(o); 133 } 134 135 if (options.contains(InfoModeOptions.GUID) && !options.contains(InfoModeOptions.METADATA)) { 136 System.out.println("UUID: " + utility.getUUID()); 137 } 138 139 if (options.contains(InfoModeOptions.METADATA)) { 140 final Map<Text, Text> metadata = utility.getMetadata(); 141 142 System.out.println("Metadata:"); 143 for (final Entry<Text, Text> e : metadata.entrySet()) { 144 System.out.println(e.getKey() + ": " + e.getValue()); 145 } 146 } 147 148 if (options.contains(InfoModeOptions.NRECORDS)) { 149 System.out.println("NRecords: " + utility.getNumberRecords()); 150 } 151 152 if (options.contains(InfoModeOptions.COMPRESSION_CODEC)) { 153 System.out.println("Compression codec: " + utility.getCompressionCodecClass()); 154 } 155 156 if (options.contains(InfoModeOptions.COMPRESSION_TYPE)) { 157 System.out.println("Compression type: " + utility.getCompressionType()); 158 } 159 } 160 } 161 162 private static class CreateMode extends ModeOp { 163 @Option( 164 name = "--recursive", 165 aliases = "-R", 166 required = false, 167 usage = "Recurse into directories inside input directories") 168 boolean recurse = false; 169 170 @Option(name = "--key-name-strategy", aliases = "-kns", required = false, usage = "Strategy for naming keys") 171 KeyNameStrategy strategy = KeyNameStrategy.FILENAME; 172 173 @Option(name = "--output", aliases = "-o", required = false, usage = "Output directory (path or uri).") 174 String outputPathOrUri = "./"; 175 176 @Option( 177 name = "--output-name", 178 aliases = "-name", 179 required = false, 180 usage = "Output filename. Defaults to <uuid>.seq.") 181 String outputName; 182 183 @Option( 184 name = "--write-map", 185 aliases = "-wm", 186 required = false, 187 usage = "Write uuid -> filename map to a file. File is saved in output directory as <name>-map.txt.") 188 boolean writeFilename2IDMap = false; 189 190 @Option(name = "--print-map", aliases = "-pm", required = false, usage = "Print uuid -> filename map.") 191 boolean printFilename2IDMap = false; 192 193 @Option( 194 name = "--filename-regex", 195 aliases = "-fnr", 196 required = false, 197 usage = "Regular expressions that file names must match to be added.") 198 String filenameRegex = null; 199 200 @Argument(usage = "input files", multiValued = true, required = true, metaVar = "input-paths-or-uris") 201 List<String> inputs = null; 202 203 @Override 204 public void execute() throws Exception { 205 if (outputName != null) { 206 if (!outputPathOrUri.endsWith("/")) 207 outputPathOrUri += "/"; 208 outputPathOrUri += outputName; 209 } 210 211 final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(outputPathOrUri, 212 false); 213 final Map<Path, Text> map = new LinkedHashMap<Path, Text>(); 214 215 for (final String input : inputs) { 216 final URI uri = SequenceFileUtility.convertToURI(input); 217 final FileSystem fs = utility.getFileSystem(uri); 218 final Path path = utility.getPath(uri); 219 220 PathFilter pathFilter = null; 221 if (filenameRegex != null) { 222 pathFilter = new RegexPathFilter(filenameRegex); 223 } 224 225 map.putAll(utility.appendFiles(fs, path, recurse, pathFilter, strategy.getKeyProvider())); 226 } 227 228 if (writeFilename2IDMap) { 229 utility.writePathMap(map); 230 } 231 232 if (printFilename2IDMap) { 233 for (final Entry<Path, Text> e : map.entrySet()) { 234 System.out.println(e.getValue() + " " + e.getKey()); 235 } 236 } 237 238 utility.close(); 239 System.err.println("Created " + utility.getSequenceFilePath()); 240 } 241 } 242 243 private static class ExtractMode extends ModeOp { 244 @Option(name = "--output", aliases = "-o", required = false, usage = "Output directory (path or uri).") 245 String outputPathOrUri; 246 247 @Option( 248 name = "--key", 249 aliases = "-k", 250 required = false, 251 usage = "Key of file to extract. By default if this is not provided, all files are extracted.") 252 String queryKey; 253 254 @Option(name = "--offset", required = false, usage = "Offset from which to start extract") 255 long offset; 256 257 @Option( 258 name = "--name-policy", 259 aliases = "-n", 260 handler = ProxyOptionHandler.class, 261 required = false, 262 usage = "Select the naming policy of outputed files") 263 NamingStrategy np = NamingStrategy.KEY; 264 265 @Option( 266 name = "--random-select", 267 aliases = "-r", 268 required = false, 269 usage = "Randomly select a subset of input of this size") 270 int random = -1; 271 272 @Option( 273 name = "--extract-max", 274 aliases = "-max", 275 required = false, 276 usage = "Randomly select a subset of input of this size") 277 int max = -1; 278 279 @Option( 280 name = "--auto-extension", 281 aliases = "-ae", 282 required = false, 283 usage = "Automatically extract the filetype and append its appropriate extension") 284 boolean autoExtension = false; 285 286 @Argument(required = true, usage = "Sequence file", metaVar = "input-path-or-uri") 287 private String inputPathOrUri; 288 289 @Option(name = "-zip", required = false, usage = "Extract to zip") 290 private boolean zipMode = false; 291 292 @Override 293 public void execute() throws IOException { 294 if (offset < 0) 295 throw new IllegalArgumentException("Offset cannot be less than 0."); 296 297 System.out.println("Getting file paths..."); 298 299 final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(inputPathOrUri, "part"); 300 final ExtractionState nps = new ExtractionState(); 301 nps.setMaxFileExtract(max); 302 303 if (random >= 0) { 304 System.out.println("Counting records"); 305 306 int totalRecords = 0; 307 for (final Path path : sequenceFiles) { 308 System.out.println("... Counting from file: " + path); 309 final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility( 310 path.toUri(), true); 311 totalRecords += utility.getNumberRecords(); 312 } 313 314 System.out.println("Selecting random subset of " + random + " from " + totalRecords); 315 316 nps.setRandomSelection(random, totalRecords); 317 } 318 319 ZipOutputStream zos = null; 320 if (zipMode) { 321 zos = SequenceFileUtility.openZipOutputStream(outputPathOrUri); 322 } 323 324 for (final Path path : sequenceFiles) { 325 System.out.println("Extracting from " + path.getName()); 326 327 final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(path.toUri(), 328 true); 329 if (queryKey == null) { 330 if (zipMode) { 331 utility.exportDataToZip(zos, np, nps, autoExtension, offset); 332 } else { 333 utility.exportData(outputPathOrUri, np, nps, autoExtension, offset); 334 } 335 } else { 336 if (zipMode) { 337 throw new UnsupportedOperationException("Not implemented yet"); 338 } else { 339 if (!utility.findAndExport(new Text(queryKey), outputPathOrUri, offset)) { 340 if (offset == 0) 341 System.err.format("Key '%s' was not found in the file.\n", queryKey); 342 else 343 System.err.format("Key '%s' was not found in the file after offset %d.\n", queryKey, 344 offset); 345 } 346 } 347 } 348 349 if (nps.isFinished()) 350 break; 351 } 352 353 if (zos != null) 354 zos.close(); 355 } 356 } 357 358 private static class ListMode extends ModeOp { 359 @Option( 360 name = "--print-offsets", 361 aliases = "-po", 362 required = false, 363 usage = "Also print the offset of each record") 364 boolean printOffsets = false; 365 366 @Option( 367 name = "--options", 368 aliases = "-opts", 369 required = false, 370 usage = "Choose options to include per record in order.", 371 multiValued = true) 372 private final List<ListModeOptions> options = new ArrayList<ListModeOptions>(); 373 374 @Option( 375 name = "--deliminator", 376 aliases = "-delim", 377 required = false, 378 usage = "Choose the per record options deliminator") 379 private final String delim = " "; 380 381 @Argument(required = true, usage = "Sequence file", metaVar = "input-path-or-uri") 382 private String inputPathOrUri; 383 384 @Override 385 public void execute() throws IOException { 386 final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(inputPathOrUri, "part"); 387 388 for (final Path path : sequenceFiles) { 389 System.err.println("Outputting from seqfile: " + path); 390 final SequenceFileUtility<Text, BytesWritable> utility = new TextBytesSequenceFileUtility(path.toUri(), 391 true); 392 393 if (options == null) { 394 if (printOffsets) { 395 for (final Entry<Text, Long> e : utility.listKeysAndOffsets().entrySet()) 396 System.out.format("%10d %s\n", e.getValue(), e.getKey().toString()); 397 } else { 398 for (final Text t : utility.listKeys()) 399 System.out.println(t.toString()); 400 } 401 } else { 402 utility.extract(ListModeOptions.listOptionsToExtractPolicy(options), System.out, delim); 403 } 404 } 405 } 406 } 407 408 /** 409 * Tool operation modes. 410 * 411 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 412 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 413 */ 414 enum Mode implements CmdLineOptionsProvider { 415 INFO { 416 @Override 417 public Object getOptions() { 418 return new InfoMode(); 419 } 420 }, 421 CREATE { 422 @Override 423 public Object getOptions() { 424 return new CreateMode(); 425 } 426 }, 427 EXTRACT { 428 @Override 429 public Object getOptions() { 430 return new ExtractMode(); 431 } 432 }, 433 LIST { 434 @Override 435 public Object getOptions() { 436 return new ListMode(); 437 } 438 }; 439 } 440 441 @Option( 442 name = "--mode", 443 aliases = "-m", 444 required = true, 445 handler = ProxyOptionHandler.class, 446 usage = "Operation mode") 447 private Mode mode; 448 private ModeOp modeOp; 449 450 /** 451 * Execute the tool in the mode set through the commandline options 452 * 453 * @throws Exception 454 * if an error occurs 455 */ 456 public void execute() throws Exception { 457 modeOp.execute(); 458 } 459 460 /** 461 * Tool main method. 462 * 463 * @param args 464 * the tool arguments 465 * @throws Exception 466 * if an error occurs 467 */ 468 public static void main(String[] args) throws Exception { 469 final SequenceFileTool options = new SequenceFileTool(); 470 final CmdLineParser parser = new CmdLineParser(options); 471 472 try { 473 parser.parseArgument(args); 474 } catch (final CmdLineException e) { 475 System.err.println(e.getMessage()); 476 System.err.println("Usage: java -jar SequenceFileTool.jar [options...]"); 477 parser.printUsage(System.err); 478 479 if (options.mode == null) { 480 for (final Mode m : Mode.values()) { 481 System.err.println(); 482 System.err.println(m + " options: "); 483 new CmdLineParser(m.getOptions()).printUsage(System.err); 484 } 485 } 486 return; 487 } 488 489 options.execute(); 490 } 491}