001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.ml.clustering.kmeans; 031 032import java.util.concurrent.ExecutorService; 033 034import org.openimaj.knn.NearestNeighbours; 035import org.openimaj.knn.NearestNeighboursFactory; 036import org.openimaj.util.parallel.GlobalExecutorPool; 037 038/** 039 * Configuration for the K-Means algorithm. 040 * 041 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 042 * 043 * @param <NN> 044 * The type of {@link NearestNeighbours} to use 045 * @param <DATA> 046 * The type of data 047 */ 048public class KMeansConfiguration<NN extends NearestNeighbours<DATA, ?, ?>, DATA> implements Cloneable { 049 /** 050 * The default number of samples per parallel assignment instance. 051 */ 052 public static final int DEFAULT_BLOCK_SIZE = 50000; 053 054 /** 055 * The default number of iterations. 056 */ 057 public static final int DEFAULT_NUMBER_ITERATIONS = 30; 058 059 /** 060 * The number of clusters 061 */ 062 protected int K; 063 064 /** 065 * The factory for producing the {@link NearestNeighbours} objects used in 066 * assignment. 067 */ 068 protected NearestNeighboursFactory<? extends NN, DATA> factory; 069 070 /** 071 * The size of processing blocks for each thread 072 */ 073 protected int blockSize; 074 075 /** 076 * The max number of iterations 077 */ 078 protected int niters; 079 080 /** 081 * The threadpool for parallel processing 082 */ 083 protected ExecutorService threadpool; 084 085 /** 086 * Create configuration for data that will create <code>K</code> clusters. 087 * The algorithm will run for a maximum of 088 * {@link #DEFAULT_NUMBER_ITERATIONS} iterations, and make use of all 089 * available processors, processing with blocks of 090 * {@link #DEFAULT_BLOCK_SIZE} vectors. 091 * <p> 092 * The specified {@link NearestNeighboursFactory} determines the actual type 093 * of k-means that will be performed; it could be exact nearest-neighbours, 094 * or it could be an approximate method, for example based on an ensemble of 095 * kd-trees. 096 * 097 * @param K 098 * number of clusters to be found 099 * @param nnFactory 100 * the factory for producing the {@link NearestNeighbours}. 101 */ 102 public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory) { 103 this(K, nnFactory, DEFAULT_NUMBER_ITERATIONS, DEFAULT_BLOCK_SIZE, GlobalExecutorPool.getPool()); 104 } 105 106 /** 107 * Create configuration for data that will create <code>K</code> clusters. 108 * The algorithm will run for a maximum of the given number of iterations, 109 * and will make use of all available processors, processing with blocks of 110 * {@link #DEFAULT_BLOCK_SIZE} vectors. 111 * <p> 112 * The specified {@link NearestNeighboursFactory} determines the actual type 113 * of k-means that will be performed; it could be exact nearest-neighbours, 114 * or it could be an approximate method, for example based on an ensemble of 115 * kd-trees. 116 * 117 * @param K 118 * number of clusters to be found 119 * @param nnFactory 120 * the factory for producing the {@link NearestNeighbours}. 121 * @param niters 122 * number of iterations 123 */ 124 public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory, int niters) { 125 this(K, nnFactory, niters, DEFAULT_BLOCK_SIZE, GlobalExecutorPool.getPool()); 126 } 127 128 /** 129 * Create configuration for data that will create <code>K</code> clusters. 130 * The algorithm will run for a maximum of the given number of iterations, 131 * and will make use of the provided threadpool, processing with blocks of 132 * {@link #DEFAULT_BLOCK_SIZE} vectors. 133 * <p> 134 * The specified {@link NearestNeighboursFactory} determines the actual type 135 * of k-means that will be performed; it could be exact nearest-neighbours, 136 * or it could be an approximate method, for example based on an ensemble of 137 * kd-trees. 138 * 139 * @param K 140 * number of clusters to be found 141 * @param nnFactory 142 * the factory for producing the {@link NearestNeighbours}. 143 * @param threadpool 144 * threadpool to use for parallel processing 145 * @param niters 146 * number of training iterations 147 */ 148 public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory, int niters, 149 ExecutorService threadpool) 150 { 151 this(K, nnFactory, niters, DEFAULT_BLOCK_SIZE, threadpool); 152 } 153 154 /** 155 * Create configuration for data with <code>M</code> dimensions that will 156 * create <code>K</code> clusters. The algorithm will run for a maximum of 157 * the given number of iterations, and will make use of 158 * <code>nThreads</code> processors, processing with blocks of 159 * {@link #DEFAULT_BLOCK_SIZE} vectors. 160 * <p> 161 * The specified {@link NearestNeighboursFactory} determines the actual type 162 * of k-means that will be performed; it could be exact nearest-neighbours, 163 * or it could be an approximate method, for example based on an ensemble of 164 * kd-trees. 165 * 166 * @param K 167 * number of clusters to be found 168 * @param nnFactory 169 * the factory for producing the {@link NearestNeighbours}. 170 * @param threadpool 171 * threadpool to use for parallel processing 172 * @param blockSize 173 * number of samples per parallel thread 174 * @param niters 175 * number of training iterations 176 */ 177 public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory, int niters, 178 int blockSize, ExecutorService threadpool) 179 { 180 this.K = K; 181 this.factory = nnFactory; 182 this.niters = niters; 183 this.blockSize = blockSize; 184 this.threadpool = (threadpool == null ? GlobalExecutorPool.getPool() : threadpool); 185 } 186 187 /** 188 * A completely default configuration used primarily as a convenience 189 * function for reading. The number of dimensions, number of clusters and 190 * nearest-neighbours factory must be set before the configuration is used. 191 */ 192 public KMeansConfiguration() { 193 this(0, null); 194 } 195 196 /* 197 * (non-Javadoc) 198 * 199 * @see java.lang.Object#clone() 200 */ 201 @SuppressWarnings("unchecked") 202 @Override 203 public KMeansConfiguration<NN, DATA> clone() { 204 try { 205 return (KMeansConfiguration<NN, DATA>) super.clone(); 206 } catch (final CloneNotSupportedException e) { 207 throw new RuntimeException(e); 208 } 209 } 210 211 /** 212 * Get the number of clusters 213 * 214 * @return the number of clusters 215 */ 216 public int getK() { 217 return K; 218 } 219 220 /** 221 * Set the number of clusters 222 * 223 * @param k 224 * the number of clusters 225 */ 226 public void setK(int k) { 227 K = k; 228 } 229 230 /** 231 * Get the number of clusters 232 * 233 * @return the number of clusters 234 */ 235 public int numClusters() { 236 return K; 237 } 238 239 /** 240 * Set the number of clusters 241 * 242 * @param k 243 * the number of clusters 244 */ 245 public void setNumClusters(int k) { 246 K = k; 247 } 248 249 /** 250 * Get the number of samples processed in a batch by a thread. This needs to 251 * be small enough that that the memory isn't exhausted, but big enough for 252 * the thread to have enough data to work for a while. 253 * 254 * @return the the number of samples processed in a batch by a thread 255 */ 256 public int getBlockSize() { 257 return blockSize; 258 } 259 260 /** 261 * Set the number of samples processed in a batch by a thread. This needs to 262 * be small enough that that the memory isn't exhausted, but big enough for 263 * the thread to have enough data to work for a while. 264 * 265 * @param blockSize 266 * the number of samples processed in a batch by a thread 267 */ 268 public void setBlockSize(int blockSize) { 269 this.blockSize = blockSize; 270 } 271 272 /** 273 * Get the maximum allowed number of iterations. 274 * 275 * @return the maximum allowed number of iterations. 276 */ 277 public int getMaxIterations() { 278 return niters; 279 } 280 281 /** 282 * Set the maximum allowed number of iterations. 283 * 284 * @param niters 285 * the maximum allowed number of iterations. 286 */ 287 public void setMaxIterations(int niters) { 288 this.niters = niters; 289 } 290 291 /** 292 * Get the factory that produces the {@link NearestNeighbours} during 293 * clustering. 294 * 295 * @return the factory 296 */ 297 public NearestNeighboursFactory<? extends NN, DATA> getNearestNeighbourFactory() { 298 return factory; 299 } 300 301 /** 302 * Set the factory that produces the {@link NearestNeighbours} during 303 * clustering. 304 * 305 * @param factory 306 * the factory to set 307 */ 308 public void setNearestNeighbourFactory(NearestNeighboursFactory<? extends NN, DATA> factory) { 309 this.factory = factory; 310 } 311}