001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.hadoop.tools.fastkmeans; 031 032import java.io.IOException; 033import java.net.URI; 034import java.util.List; 035 036import org.apache.hadoop.conf.Configuration; 037import org.apache.hadoop.fs.FileSystem; 038import org.apache.hadoop.fs.LocalFileSystem; 039import org.apache.hadoop.fs.Path; 040import org.kohsuke.args4j.CmdLineException; 041import org.kohsuke.args4j.CmdLineParser; 042import org.kohsuke.args4j.Option; 043import org.openimaj.hadoop.sequencefile.SequenceFileUtility; 044import org.openimaj.ml.clustering.kmeans.ByteKMeansInit; 045import org.openimaj.tools.clusterquantiser.FileType; 046 047public class HadoopFastKMeansOptions { 048 @Option( 049 name = "--threads", 050 aliases = "-j", 051 required = false, 052 usage = "Use NUMBER threads for quantization.", 053 metaVar = "NUMBER") 054 public int concurrency = Runtime.getRuntime().availableProcessors(); 055 056 public ByteKMeansInit init = new ByteKMeansInit.RANDOM(); 057 058 @Option(name = "--input", aliases = "-i", required = true, usage = "set the input sequencefile", multiValued = true) 059 public List<String> inputs; 060 061 @Option( 062 name = "--output", 063 aliases = "-o", 064 required = true, 065 usage = "set the cluster output directory. The final cluster will go into output/final") 066 public String output; 067 068 @Option(name = "--number-of-clusters", aliases = "-k", required = false, usage = "Number of clusters.") 069 public int k = 100; 070 071 @Option(name = "--file-type", aliases = "-t", required = false, usage = "Specify the type of file to be read.") 072 public String fileType = FileType.BINARY_KEYPOINT.toString(); 073 074 @Option(name = "--nsamples", aliases = "-s", required = false, usage = "How many samples should be selected") 075 public int nsamples = -1; 076 077 @Option(name = "--exact-mode", aliases = "-e", required = false, usage = "Compare the features in exact mode") 078 public boolean exact = false; 079 080 @Option( 081 name = "--force-delete", 082 aliases = "-rm", 083 required = false, 084 usage = "If it exists, remove the output directory before starting") 085 public boolean forceRM = false; 086 087 @Option( 088 name = "--number-of-iterations", 089 aliases = "-iters", 090 required = false, 091 usage = "How many times should the Kmeans iterate") 092 public int iter = 3; 093 094 @Option(name = "--samples-only", aliases = "-so", required = false, usage = "Extract samples only.") 095 public boolean samplesOnly = false; 096 097 @Option( 098 name = "--check-sample-equality", 099 aliases = "-cse", 100 required = false, 101 usage = "Extract samples but only check which features are identical (euclidian sense).") 102 public boolean checkSampleEquality = false; 103 104 @Option( 105 name = "--check-sample-equality-threshold", 106 aliases = "-cset", 107 required = false, 108 usage = "The threshold for sample equality.") 109 public int checkSampleEqualityThreshold = 0; 110 111 private boolean beforeMaps; 112 113 public String[] args; 114 public String[] original_args; 115 116 public HadoopFastKMeansOptions(String[] args) { 117 this(args, false); 118 } 119 120 public HadoopFastKMeansOptions(String[] args, boolean beforeMaps) { 121 this.beforeMaps = beforeMaps; 122 this.args = args; 123 } 124 125 public HadoopFastKMeansOptions(String[] args, String[] original_args, boolean b) { 126 this.args = args; 127 this.original_args = original_args; 128 this.beforeMaps = b; 129 } 130 131 public static FileSystem getFileSystem(URI uri) throws IOException { 132 final Configuration config = new Configuration(); 133 FileSystem fs = FileSystem.get(uri, config); 134 if (fs instanceof LocalFileSystem) 135 fs = ((LocalFileSystem) fs).getRaw(); 136 return fs; 137 } 138 139 public void prepare() { 140 final CmdLineParser parser = new CmdLineParser(this); 141 try { 142 parser.parseArgument(args); 143 this.validate(); 144 } catch (final CmdLineException e) { 145 System.err.println(e.getMessage()); 146 System.err.println("Usage: java -jar HadoopFastKMeans.jar [options...] [files...]"); 147 parser.printUsage(System.err); 148 System.err.print(HadoopFastKMeans.EXTRA_USAGE_INFO); 149 150 System.exit(1); 151 } 152 153 } 154 155 private void validate() { 156 System.out.println("forcerm " + this.forceRM + " beforemaps " + this.beforeMaps); 157 if (this.forceRM && this.beforeMaps) { 158 System.out.println("Attempting to delete: " + this.output); 159 try { 160 final URI outuri = SequenceFileUtility.convertToURI(this.output); 161 final FileSystem fs = getFileSystem(outuri); 162 fs.delete(new Path(outuri.toString()), true); 163 164 } catch (final IOException e) { 165 System.out.println("Error deleting!!"); 166 e.printStackTrace(); 167 } 168 } 169 } 170 171}