1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 package org.openimaj.hadoop.tools.fastkmeans;
31
32 import java.io.IOException;
33 import java.net.URI;
34 import java.util.List;
35
36 import org.apache.hadoop.conf.Configuration;
37 import org.apache.hadoop.fs.FileSystem;
38 import org.apache.hadoop.fs.LocalFileSystem;
39 import org.apache.hadoop.fs.Path;
40 import org.kohsuke.args4j.CmdLineException;
41 import org.kohsuke.args4j.CmdLineParser;
42 import org.kohsuke.args4j.Option;
43 import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
44 import org.openimaj.ml.clustering.kmeans.ByteKMeansInit;
45 import org.openimaj.tools.clusterquantiser.FileType;
46
47 public class HadoopFastKMeansOptions {
48 @Option(
49 name = "--threads",
50 aliases = "-j",
51 required = false,
52 usage = "Use NUMBER threads for quantization.",
53 metaVar = "NUMBER")
54 public int concurrency = Runtime.getRuntime().availableProcessors();
55
56 public ByteKMeansInit init = new ByteKMeansInit.RANDOM();
57
58 @Option(name = "--input", aliases = "-i", required = true, usage = "set the input sequencefile", multiValued = true)
59 public List<String> inputs;
60
61 @Option(
62 name = "--output",
63 aliases = "-o",
64 required = true,
65 usage = "set the cluster output directory. The final cluster will go into output/final")
66 public String output;
67
68 @Option(name = "--number-of-clusters", aliases = "-k", required = false, usage = "Number of clusters.")
69 public int k = 100;
70
71 @Option(name = "--file-type", aliases = "-t", required = false, usage = "Specify the type of file to be read.")
72 public String fileType = FileType.BINARY_KEYPOINT.toString();
73
74 @Option(name = "--nsamples", aliases = "-s", required = false, usage = "How many samples should be selected")
75 public int nsamples = -1;
76
77 @Option(name = "--exact-mode", aliases = "-e", required = false, usage = "Compare the features in exact mode")
78 public boolean exact = false;
79
80 @Option(
81 name = "--force-delete",
82 aliases = "-rm",
83 required = false,
84 usage = "If it exists, remove the output directory before starting")
85 public boolean forceRM = false;
86
87 @Option(
88 name = "--number-of-iterations",
89 aliases = "-iters",
90 required = false,
91 usage = "How many times should the Kmeans iterate")
92 public int iter = 3;
93
94 @Option(name = "--samples-only", aliases = "-so", required = false, usage = "Extract samples only.")
95 public boolean samplesOnly = false;
96
97 @Option(
98 name = "--check-sample-equality",
99 aliases = "-cse",
100 required = false,
101 usage = "Extract samples but only check which features are identical (euclidian sense).")
102 public boolean checkSampleEquality = false;
103
104 @Option(
105 name = "--check-sample-equality-threshold",
106 aliases = "-cset",
107 required = false,
108 usage = "The threshold for sample equality.")
109 public int checkSampleEqualityThreshold = 0;
110
111 private boolean beforeMaps;
112
113 public String[] args;
114 public String[] original_args;
115
116 public HadoopFastKMeansOptions(String[] args) {
117 this(args, false);
118 }
119
120 public HadoopFastKMeansOptions(String[] args, boolean beforeMaps) {
121 this.beforeMaps = beforeMaps;
122 this.args = args;
123 }
124
125 public HadoopFastKMeansOptions(String[] args, String[] original_args, boolean b) {
126 this.args = args;
127 this.original_args = original_args;
128 this.beforeMaps = b;
129 }
130
131 public static FileSystem getFileSystem(URI uri) throws IOException {
132 final Configuration config = new Configuration();
133 FileSystem fs = FileSystem.get(uri, config);
134 if (fs instanceof LocalFileSystem)
135 fs = ((LocalFileSystem) fs).getRaw();
136 return fs;
137 }
138
139 public void prepare() {
140 final CmdLineParser parser = new CmdLineParser(this);
141 try {
142 parser.parseArgument(args);
143 this.validate();
144 } catch (final CmdLineException e) {
145 System.err.println(e.getMessage());
146 System.err.println("Usage: java -jar HadoopFastKMeans.jar [options...] [files...]");
147 parser.printUsage(System.err);
148 System.err.print(HadoopFastKMeans.EXTRA_USAGE_INFO);
149
150 System.exit(1);
151 }
152
153 }
154
155 private void validate() {
156 System.out.println("forcerm " + this.forceRM + " beforemaps " + this.beforeMaps);
157 if (this.forceRM && this.beforeMaps) {
158 System.out.println("Attempting to delete: " + this.output);
159 try {
160 final URI outuri = SequenceFileUtility.convertToURI(this.output);
161 final FileSystem fs = getFileSystem(outuri);
162 fs.delete(new Path(outuri.toString()), true);
163
164 } catch (final IOException e) {
165 System.out.println("Error deleting!!");
166 e.printStackTrace();
167 }
168 }
169 }
170
171 }