View Javadoc

1   /**
2    * Copyright (c) 2011, The University of Southampton and the individual contributors.
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without modification,
6    * are permitted provided that the following conditions are met:
7    *
8    *   * 	Redistributions of source code must retain the above copyright notice,
9    * 	this list of conditions and the following disclaimer.
10   *
11   *   *	Redistributions in binary form must reproduce the above copyright notice,
12   * 	this list of conditions and the following disclaimer in the documentation
13   * 	and/or other materials provided with the distribution.
14   *
15   *   *	Neither the name of the University of Southampton nor the names of its
16   * 	contributors may be used to endorse or promote products derived from this
17   * 	software without specific prior written permission.
18   *
19   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22   * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29   */
30  package org.openimaj.hadoop.tools.fastkmeans;
31  
32  import java.io.IOException;
33  import java.net.URI;
34  import java.util.List;
35  
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FileSystem;
38  import org.apache.hadoop.fs.LocalFileSystem;
39  import org.apache.hadoop.fs.Path;
40  import org.kohsuke.args4j.CmdLineException;
41  import org.kohsuke.args4j.CmdLineParser;
42  import org.kohsuke.args4j.Option;
43  import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
44  import org.openimaj.ml.clustering.kmeans.ByteKMeansInit;
45  import org.openimaj.tools.clusterquantiser.FileType;
46  
47  public class HadoopFastKMeansOptions {
48  	@Option(
49  			name = "--threads",
50  			aliases = "-j",
51  			required = false,
52  			usage = "Use NUMBER threads for quantization.",
53  			metaVar = "NUMBER")
54  	public int concurrency = Runtime.getRuntime().availableProcessors();
55  
56  	public ByteKMeansInit init = new ByteKMeansInit.RANDOM();
57  
58  	@Option(name = "--input", aliases = "-i", required = true, usage = "set the input sequencefile", multiValued = true)
59  	public List<String> inputs;
60  
61  	@Option(
62  			name = "--output",
63  			aliases = "-o",
64  			required = true,
65  			usage = "set the cluster output directory. The final cluster will go into output/final")
66  	public String output;
67  
68  	@Option(name = "--number-of-clusters", aliases = "-k", required = false, usage = "Number of clusters.")
69  	public int k = 100;
70  
71  	@Option(name = "--file-type", aliases = "-t", required = false, usage = "Specify the type of file to be read.")
72  	public String fileType = FileType.BINARY_KEYPOINT.toString();
73  
74  	@Option(name = "--nsamples", aliases = "-s", required = false, usage = "How many samples should be selected")
75  	public int nsamples = -1;
76  
77  	@Option(name = "--exact-mode", aliases = "-e", required = false, usage = "Compare the features in exact mode")
78  	public boolean exact = false;
79  
80  	@Option(
81  			name = "--force-delete",
82  			aliases = "-rm",
83  			required = false,
84  			usage = "If it exists, remove the output directory before starting")
85  	public boolean forceRM = false;
86  
87  	@Option(
88  			name = "--number-of-iterations",
89  			aliases = "-iters",
90  			required = false,
91  			usage = "How many times should the Kmeans iterate")
92  	public int iter = 3;
93  
94  	@Option(name = "--samples-only", aliases = "-so", required = false, usage = "Extract samples only.")
95  	public boolean samplesOnly = false;
96  
97  	@Option(
98  			name = "--check-sample-equality",
99  			aliases = "-cse",
100 			required = false,
101 			usage = "Extract samples but only check which features are identical (euclidian sense).")
102 	public boolean checkSampleEquality = false;
103 
104 	@Option(
105 			name = "--check-sample-equality-threshold",
106 			aliases = "-cset",
107 			required = false,
108 			usage = "The threshold for sample equality.")
109 	public int checkSampleEqualityThreshold = 0;
110 
111 	private boolean beforeMaps;
112 
113 	public String[] args;
114 	public String[] original_args;
115 
116 	public HadoopFastKMeansOptions(String[] args) {
117 		this(args, false);
118 	}
119 
120 	public HadoopFastKMeansOptions(String[] args, boolean beforeMaps) {
121 		this.beforeMaps = beforeMaps;
122 		this.args = args;
123 	}
124 
125 	public HadoopFastKMeansOptions(String[] args, String[] original_args, boolean b) {
126 		this.args = args;
127 		this.original_args = original_args;
128 		this.beforeMaps = b;
129 	}
130 
131 	public static FileSystem getFileSystem(URI uri) throws IOException {
132 		final Configuration config = new Configuration();
133 		FileSystem fs = FileSystem.get(uri, config);
134 		if (fs instanceof LocalFileSystem)
135 			fs = ((LocalFileSystem) fs).getRaw();
136 		return fs;
137 	}
138 
139 	public void prepare() {
140 		final CmdLineParser parser = new CmdLineParser(this);
141 		try {
142 			parser.parseArgument(args);
143 			this.validate();
144 		} catch (final CmdLineException e) {
145 			System.err.println(e.getMessage());
146 			System.err.println("Usage: java -jar HadoopFastKMeans.jar [options...] [files...]");
147 			parser.printUsage(System.err);
148 			System.err.print(HadoopFastKMeans.EXTRA_USAGE_INFO);
149 
150 			System.exit(1);
151 		}
152 
153 	}
154 
155 	private void validate() {
156 		System.out.println("forcerm " + this.forceRM + " beforemaps " + this.beforeMaps);
157 		if (this.forceRM && this.beforeMaps) {
158 			System.out.println("Attempting to delete: " + this.output);
159 			try {
160 				final URI outuri = SequenceFileUtility.convertToURI(this.output);
161 				final FileSystem fs = getFileSystem(outuri);
162 				fs.delete(new Path(outuri.toString()), true);
163 
164 			} catch (final IOException e) {
165 				System.out.println("Error deleting!!");
166 				e.printStackTrace();
167 			}
168 		}
169 	}
170 
171 }