1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 package org.openimaj.hadoop.tools.fastkmeans;
31
32 import java.io.IOException;
33
34 import org.apache.hadoop.conf.Configured;
35 import org.apache.hadoop.fs.Path;
36 import org.apache.hadoop.io.BytesWritable;
37 import org.apache.hadoop.io.IntWritable;
38 import org.apache.hadoop.mapred.JobConf;
39 import org.apache.hadoop.mapreduce.Job;
40 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
41 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
42 import org.apache.hadoop.util.Tool;
43 import org.apache.hadoop.util.ToolRunner;
44 import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
45
46
47 public class SequenceFileByteFeatureSelector extends Configured implements Tool {
48
49
50
51
52 private static final long serialVersionUID = -5976796322589912944L;
53 private String inputFilePath;
54 private String outputFilePath;
55 private int nRandomRows;
56 private HadoopFastKMeansOptions options;
57
58 public SequenceFileByteFeatureSelector(String inputFilePath, String outputFilePath, HadoopFastKMeansOptions options) throws IOException, InterruptedException, ClassNotFoundException{
59 this.inputFilePath = inputFilePath;
60 this.outputFilePath = outputFilePath;
61 this.options = options;
62 }
63
64 public String getRandomFeatures(int k) throws Exception {
65 this.nRandomRows = k;
66 ToolRunner.run(this, options.original_args);
67 return this.outputFilePath;
68 }
69
70 @Override
71 public int run(String[] args) throws Exception {
72
73 Path outpath = new Path(SequenceFileUtility.convertToURI(this.outputFilePath).toString());
74 System.out.println("It is all going to: " + outpath);
75
76 Path[] sequenceFiles = SequenceFileUtility.getFilePaths(inputFilePath, "part");
77
78 Job job = new Job(this.getConf(), "featureselect");
79 job.setNumReduceTasks(1);
80 job.setJarByClass(SequenceFileByteImageFeatureSelector.class);
81 job.setOutputKeyClass(IntWritable.class);
82 job.setOutputValueClass(BytesWritable.class);
83
84 job.setMapperClass(FeatureSelect.Map.class);
85 job.setReducerClass(FeatureSelect.Reduce.class);
86
87 job.setInputFormatClass(SequenceFileInputFormat.class);
88 job.setOutputFormatClass(SequenceFileOutputFormat.class);
89
90 job.getConfiguration().setStrings(FeatureSelect.FILETYPE_KEY, new String[]{options.fileType});
91 job.getConfiguration().setStrings(FeatureSelect.NFEATURE_KEY, new String[]{"" + this.nRandomRows});
92
93 ((JobConf)job.getConfiguration()).setNumTasksToExecutePerJvm(-1);
94
95 SequenceFileInputFormat.setInputPaths(job, sequenceFiles);
96 SequenceFileOutputFormat.setOutputPath(job, outpath);
97 SequenceFileOutputFormat.setCompressOutput(job, false);
98 job.waitForCompletion(true);
99 return 0;
100 }
101
102 }