001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.hadoop.tools.fastkmeans; 031 032import java.io.IOException; 033import java.util.ArrayList; 034import java.util.List; 035 036import org.apache.hadoop.conf.Configured; 037import org.apache.hadoop.fs.Path; 038import org.apache.hadoop.io.BytesWritable; 039import org.apache.hadoop.io.Text; 040import org.apache.hadoop.mapreduce.Job; 041import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 042import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 043import org.apache.hadoop.util.Tool; 044import org.apache.hadoop.util.ToolRunner; 045import org.openimaj.hadoop.sequencefile.SequenceFileUtility; 046 047 048public class SequenceFileByteImageFeatureSelector extends Configured implements Tool { 049 050 /** 051 * 052 */ 053 private static final long serialVersionUID = -5976796322589912944L; 054 private List<String> inputFilePaths; 055 private String outputFilePath; 056 private HadoopFastKMeansOptions options; 057 private int nRandomRows; 058 059 public SequenceFileByteImageFeatureSelector(List<String> inputs, String outputFilePath, HadoopFastKMeansOptions options) throws IOException, InterruptedException, ClassNotFoundException{ 060 this.inputFilePaths = inputs; 061 this.outputFilePath = outputFilePath; 062 this.options = options; 063 } 064 065 public String getFeatures(int k) throws Exception { 066 this.nRandomRows = k; 067 ToolRunner.run(this, options.original_args); 068 return this.outputFilePath; 069 } 070 071 @Override 072 public int run(String[] args) throws Exception { 073 074 // Create the output path 075 Path outpath = new Path(this.outputFilePath); 076 System.out.println("It is all going to: " + outpath); 077 078 079 List<Path> sequenceFiles = new ArrayList<Path>(); 080 for(String inputFilePath : this.inputFilePaths){ 081 Path[] foundPaths = SequenceFileUtility.getFilePaths(inputFilePath, "part"); 082 for(Path p : foundPaths){ 083 sequenceFiles.add(p); 084 } 085 } 086 087 088 Job job = new Job(this.getConf(), "featureselect"); 089 090 job.setJarByClass(SequenceFileByteImageFeatureSelector.class); 091 job.setOutputKeyClass(Text.class); 092 job.setOutputValueClass(BytesWritable.class); 093 094 job.setMapperClass(ImageFeatureSelect.Map.class); 095 if(this.nRandomRows==-1){ 096 job.setNumReduceTasks(0); 097 } 098 else{ 099 job.setNumReduceTasks(1); 100 job.setReducerClass(ImageFeatureSelect.Reduce.class); 101 } 102 103 job.setInputFormatClass(SequenceFileInputFormat.class); 104 job.setOutputFormatClass(SequenceFileOutputFormat.class); 105 106 job.getConfiguration().setStrings(ImageFeatureSelect.FILETYPE_KEY, new String[]{options.fileType}); 107 job.getConfiguration().setStrings(ImageFeatureSelect.NFEATURE_KEY, new String[]{"" + nRandomRows}); 108 109 SequenceFileInputFormat.setInputPaths(job, sequenceFiles.toArray(new Path[sequenceFiles.size()])); 110 SequenceFileOutputFormat.setOutputPath(job, outpath); 111 SequenceFileOutputFormat.setCompressOutput(job, false); 112 113 job.waitForCompletion(true); 114 return 0; 115 } 116}