001/* 002 AUTOMATICALLY GENERATED BY jTemp FROM 003 /Users/jsh2/Work/openimaj/target/checkout/machine-learning/clustering/src/main/jtemp/org/openimaj/ml/clustering/kmeans/#T#KMeans.jtemp 004*/ 005/** 006 * Copyright (c) 2011, The University of Southampton and the individual contributors. 007 * All rights reserved. 008 * 009 * Redistribution and use in source and binary forms, with or without modification, 010 * are permitted provided that the following conditions are met: 011 * 012 * * Redistributions of source code must retain the above copyright notice, 013 * this list of conditions and the following disclaimer. 014 * 015 * * Redistributions in binary form must reproduce the above copyright notice, 016 * this list of conditions and the following disclaimer in the documentation 017 * and/or other materials provided with the distribution. 018 * 019 * * Neither the name of the University of Southampton nor the names of its 020 * contributors may be used to endorse or promote products derived from this 021 * software without specific prior written permission. 022 * 023 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 024 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 025 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 026 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 027 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 028 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 029 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 030 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 032 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

package org.openimaj.ml.clustering.kmeans;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;

import org.openimaj.data.DataSource;
import org.openimaj.data.LongArrayBackedDataSource;
import org.openimaj.ml.clustering.IndexClusters;
import org.openimaj.ml.clustering.SpatialClusterer;
import org.openimaj.ml.clustering.assignment.HardAssigner;
import org.openimaj.ml.clustering.assignment.hard.KDTreeLongEuclideanAssigner;
import org.openimaj.ml.clustering.assignment.hard.ExactLongAssigner;
import org.openimaj.ml.clustering.LongCentroidsResult;
import org.openimaj.knn.LongNearestNeighbours;
import org.openimaj.knn.LongNearestNeighboursExact;
import org.openimaj.knn.LongNearestNeighboursProvider;
import org.openimaj.knn.NearestNeighboursFactory;
import org.openimaj.knn.approximate.LongNearestNeighboursKDTree;
import org.openimaj.util.pair.IntDoublePair;

/**
 * Fast, parallel implementation of the K-Means algorithm with support for
 * bigger-than-memory data. Various flavors of K-Means are supported through the
 * selection of different subclasses of {@link LongNearestNeighbours}; for
 * example, approximate K-Means can be achieved using a
 * {@link LongNearestNeighboursKDTree} whilst exact K-Means can be achieved
 * using a {@link LongNearestNeighboursExact}. The specific choice of
 * nearest-neighbour object is controlled through the
 * {@link NearestNeighboursFactory} provided to the {@link KMeansConfiguration}
 * used to construct instances of this class. The choice of
 * {@link LongNearestNeighbours} affects the speed of clustering; using
 * approximate nearest-neighbours algorithms for the K-Means can produce
 * comparable results to the exact KMeans algorithm in much shorter time.
072 * The choice and configuration of {@link LongNearestNeighbours} can also 073 * control the type of distance function being used in the clustering. 074 * <p> 075 * The algorithm is implemented as follows: Clustering is initiated using a 076 * {@link LongKMeansInit} and is iterative. In each round, batches of 077 * samples are assigned to centroids in parallel. The centroid assignment is 078 * performed using the pre-configured {@link LongNearestNeighbours} instances 079 * created from the {@link KMeansConfiguration}. Once all samples are assigned 080 * new centroids are calculated and the next round started. Data point pushing 081 * is performed using the same techniques as center point assignment. 082 * <p> 083 * This implementation is able to deal with larger-than-memory datasets by 084 * streaming the samples from disk using an appropriate {@link DataSource}. The 085 * only requirement is that there is enough memory to hold all the centroids 086 * plus working memory for the batches of samples being assigned. 
087 * 088 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 089 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 090 */ 091 public class LongKMeans implements SpatialClusterer<LongCentroidsResult, long[]> { 092 private static class CentroidAssignmentJob implements Callable<Boolean> { 093 private final DataSource<long[]> ds; 094 private final int startRow; 095 private final int stopRow; 096 private final LongNearestNeighbours nno; 097 private final double [][] centroids_accum; 098 private final int [] counts; 099 100 public CentroidAssignmentJob(DataSource<long[]> ds, int startRow, int stopRow, LongNearestNeighbours nno, double [][] centroids_accum, int [] counts) { 101 this.ds = ds; 102 this.startRow = startRow; 103 this.stopRow = stopRow; 104 this.nno = nno; 105 this.centroids_accum = centroids_accum; 106 this.counts = counts; 107 } 108 109 @Override 110 public Boolean call() { 111 try { 112 int D = nno.numDimensions(); 113 114 long [][] points = new long[stopRow-startRow][D]; 115 ds.getData(startRow, stopRow, points); 116 117 int [] argmins = new int[points.length]; 118 double [] mins = new double[points.length]; 119 120 nno.searchNN(points, argmins, mins); 121 122 synchronized(centroids_accum){ 123 for (int i=0; i < points.length; ++i) { 124 int k = argmins[i]; 125 for (int d=0; d < D; ++d) { 126 centroids_accum[k][d] += points[i][d]; 127 } 128 counts[k] += 1; 129 } 130 } 131 } catch(Exception e) { 132 e.printStackTrace(); 133 } 134 return true; 135 } 136 } 137 138 /** 139 * Result object for LongKMeans, extending LongCentroidsResult and LongNearestNeighboursProvider, 140 * as well as giving access to state information from the operation of the K-Means algorithm 141 * (i.e. number of iterations, and convergence state). 
142 * 143 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 144 */ 145 public static class Result extends LongCentroidsResult implements LongNearestNeighboursProvider { 146 protected LongNearestNeighbours nn; 147 protected int iterations; 148 protected int changedCentroidCount; 149 150 @Override 151 public HardAssigner<long[], double[], IntDoublePair> defaultHardAssigner() { 152 if (nn instanceof LongNearestNeighboursExact) 153 return new ExactLongAssigner(this, ((LongNearestNeighboursExact)nn).distanceComparator()); 154 155 return new KDTreeLongEuclideanAssigner(this); 156 } 157 158 @Override 159 public LongNearestNeighbours getNearestNeighbours() { 160 return nn; 161 } 162 163 /** 164 * Get the number of K-Means iterations that produced this result. 165 * @return the number of iterations 166 */ 167 public int numIterations() { 168 return iterations; 169 } 170 171 /** 172 * Get the number of changed centroids in the last iteration. This is 173 * an indicator of convergence as over time this should reduce to 0. 174 * @return the number of changed centroids 175 */ 176 public int numChangedCentroids() { 177 return changedCentroidCount; 178 } 179 } 180 181 private LongKMeansInit init = new LongKMeansInit.RANDOM(); 182 private KMeansConfiguration<LongNearestNeighbours, long[]> conf; 183 private Random rng = new Random(); 184 185 /** 186 * Construct the clusterer with the the given configuration. 187 * 188 * @param conf The configuration. 189 */ 190 public LongKMeans(KMeansConfiguration<LongNearestNeighbours, long[]> conf) { 191 this.conf = conf; 192 } 193 194 /** 195 * A completely default {@link LongKMeans} used primarily as a convenience function for reading. 
196 */ 197 protected LongKMeans() { 198 this(new KMeansConfiguration<LongNearestNeighbours, long[]>()); 199 } 200 201 /** 202 * Get the current initialisation algorithm 203 * 204 * @return the init algorithm being used 205 */ 206 public LongKMeansInit getInit() { 207 return init; 208 } 209 210 /** 211 * Set the current initialisation algorithm 212 * 213 * @param init the init algorithm to be used 214 */ 215 public void setInit(LongKMeansInit init) { 216 this.init = init; 217 } 218 219 /** 220 * Set the seed for the internal random number generator. 221 * 222 * @param seed the random seed for init random sample selection, no seed if seed < -1 223 */ 224 public void seed(long seed) { 225 if(seed < 0) 226 this.rng = new Random(); 227 else 228 this.rng = new Random(seed); 229 } 230 231 @Override 232 public Result cluster(long[][] data) { 233 DataSource<long[]> ds = new LongArrayBackedDataSource(data, rng); 234 235 try { 236 Result result = cluster(ds, conf.K); 237 result.nn = conf.factory.create(result.centroids); 238 239 return result; 240 } catch (Exception e) { 241 throw new RuntimeException(e); 242 } 243 } 244 245 @Override 246 public int[][] performClustering(long[][] data) { 247 LongCentroidsResult clusters = this.cluster(data); 248 return new IndexClusters(clusters.defaultHardAssigner().assign(data)).clusters(); 249 } 250 251 /** 252 * Initiate clustering with the given data and number of clusters. 253 * Internally this method constructs the array to hold the centroids 254 * and calls {@link #cluster(DataSource, long [][])}. 
255 * 256 * @param data data source to cluster with 257 * @param K number of clusters to find 258 * @return cluster centroids 259 */ 260 protected Result cluster(DataSource<long[]> data, int K) throws Exception { 261 int D = data.numDimensions(); 262 263 Result result = new Result(); 264 result.centroids = new long[K][D]; 265 266 init.initKMeans(data, result.centroids); 267 268 cluster(data, result); 269 270 return result; 271 } 272 273 /** 274 * Main clustering algorithm. A number of threads as specified are 275 * started each containing an assignment job and a reference to 276 * the same set of LongNearestNeighbours object (i.e. Exact or KDTree). 277 * Each thread is added to a job pool and started in parallel. 278 * A single accumulator is shared between all threads and locked on update. 279 * <br/> 280 * This methods expects that the initial centroids have already been set in 281 * the <code>result</code> object and as such <strong>ignores</strong> the 282 * init object. <strong>In normal operation you should call one of the other <code>cluster</code> 283 * cluster methods instead of this one.</strong> However, if you wish to resume clustering 284 * iterations from a result that you've already generated this is the method 285 * to use. 286 * 287 * @param data the data to be clustered 288 * @param result the results object to be populated 289 * @throws InterruptedException if interrupted while waiting, in 290 * which case unfinished tasks are cancelled. 291 */ 292 public void cluster(long[][] data, Result result) throws InterruptedException { 293 DataSource<long[]> ds = new LongArrayBackedDataSource(data, rng); 294 295 cluster(ds, result); 296 } 297 298 /** 299 * Main clustering algorithm. A number of threads as specified are 300 * started each containing an assignment job and a reference to 301 * the same set of LongNearestNeighbours object (i.e. Exact or KDTree). 302 * Each thread is added to a job pool and started in parallel. 
303 * A single accumulator is shared between all threads and locked on update. 304 * <br/> 305 * This methods expects that the initial centroids have already been set in 306 * the <code>result</code> object and as such <strong>ignores</strong> the 307 * init object. In normal operation you should call one of the other <code>cluster</code> 308 * cluster methods instead of this one. However, if you wish to resume clustering 309 * iterations from a result that you've already generated this is the method 310 * to use. 311 * 312 * @param data the data to be clustered 313 * @param result the results object to be populated 314 * @throws InterruptedException if interrupted while waiting, in 315 * which case unfinished tasks are cancelled. 316 */ 317 public void cluster(DataSource<long[]> data, Result result) throws InterruptedException { 318 final long[][] centroids = result.centroids; 319 final int K = centroids.length; 320 final int D = centroids[0].length; 321 final int N = data.size(); 322 double [][] centroids_accum = new double[K][D]; 323 int [] new_counts = new int[K]; 324 325 ExecutorService service = conf.threadpool; 326 327 for (int i=0; i<conf.niters; i++) { 328 result.iterations++; 329 330 for (int j=0; j<K; j++) 331 Arrays.fill(centroids_accum[j], 0); 332 Arrays.fill(new_counts, 0); 333 334 LongNearestNeighbours nno = conf.factory.create(centroids); 335 336 List<CentroidAssignmentJob> jobs = new ArrayList<CentroidAssignmentJob>(); 337 for (int bl = 0; bl < N; bl += conf.blockSize) { 338 int br = Math.min(bl + conf.blockSize, N); 339 jobs.add(new CentroidAssignmentJob(data, bl, br, nno, centroids_accum, new_counts)); 340 } 341 342 service.invokeAll(jobs); 343 344 result.changedCentroidCount = 0; 345 for (int k=0; k < K; ++k) { 346 double ssd = 0; 347 if (new_counts[k] == 0) { 348 // If there's an empty cluster we replace it with a random point. 
349 new_counts[k] = 1; 350 351 long [][] rnd = new long[][] {centroids[k]}; 352 data.getRandomRows(rnd); 353 result.changedCentroidCount++; 354 } else { 355 for (int d=0; d < D; ++d) { 356 long newValue = (long)((double)roundDouble((double)centroids_accum[k][d] / (double)new_counts[k])); 357 358 // we're going to accumulate the SSD of the old vs new centroids 359 // as a way of determining if this centroid has changed 360 double diff = newValue - centroids[k][d]; 361 ssd += diff*diff; 362 363 //update to new centroid 364 centroids[k][d] = newValue; 365 } 366 367 if (ssd != 0) 368 result.changedCentroidCount++; 369 } 370 } 371 372 if (result.changedCentroidCount == 0) 373 break; // convergence 374 } 375 } 376 377 protected float roundFloat(double value) { return (float) value; } 378 protected double roundDouble(double value) { return value; } 379 protected long roundLong(double value) { return (long)Math.round(value); } 380 protected int roundInt(double value) { return (int)Math.round(value); } 381 382 @Override 383 public Result cluster(DataSource<long[]> ds) { 384 try { 385 Result result = cluster(ds, conf.K); 386 result.nn = conf.factory.create(result.centroids); 387 388 return result; 389 } catch (Exception e) { 390 throw new RuntimeException(e); 391 } 392 } 393 394 /** 395 * Get the configuration 396 * 397 * @return the configuration 398 */ 399 public KMeansConfiguration<LongNearestNeighbours, long[]> getConfiguration() { 400 return conf; 401 } 402 403 /** 404 * Set the configuration 405 * 406 * @param conf 407 * the configuration to set 408 */ 409 public void setConfiguration(KMeansConfiguration<LongNearestNeighbours, long[]> conf) { 410 this.conf = conf; 411 } 412 413 /** 414 * Convenience method to quickly create an exact {@link LongKMeans}. All 415 * parameters other than the number of clusters are set 416 * at their defaults, but can be manipulated through the configuration 417 * returned by {@link #getConfiguration()}. 
418 * <p> 419 * Euclidean distance is used to measure the distance between points. 420 * 421 * @param K 422 * the number of clusters 423 * @return a {@link LongKMeans} instance configured for exact k-means 424 */ 425 public static LongKMeans createExact(int K) { 426 final KMeansConfiguration<LongNearestNeighbours, long[]> conf = 427 new KMeansConfiguration<LongNearestNeighbours, long[]>(K, new LongNearestNeighboursExact.Factory()); 428 429 return new LongKMeans(conf); 430 } 431 432 /** 433 * Convenience method to quickly create an exact {@link LongKMeans}. All 434 * parameters other than the number of clusters and number 435 * of iterations are set at their defaults, but can be manipulated through 436 * the configuration returned by {@link #getConfiguration()}. 437 * <p> 438 * Euclidean distance is used to measure the distance between points. 439 * 440 * @param K 441 * the number of clusters 442 * @param niters 443 * maximum number of iterations 444 * @return a {@link LongKMeans} instance configured for exact k-means 445 */ 446 public static LongKMeans createExact(int K, int niters) { 447 final KMeansConfiguration<LongNearestNeighbours, long[]> conf = 448 new KMeansConfiguration<LongNearestNeighbours, long[]>(K, new LongNearestNeighboursExact.Factory(), niters); 449 450 return new LongKMeans(conf); 451 } 452 453 /** 454 * Convenience method to quickly create an approximate {@link LongKMeans} 455 * using an ensemble of KD-Trees to perform nearest-neighbour lookup. All 456 * parameters other than the number of clusters are set 457 * at their defaults, but can be manipulated through the configuration 458 * returned by {@link #getConfiguration()}. 459 * <p> 460 * Euclidean distance is used to measure the distance between points. 
461 * 462 * @param K 463 * the number of clusters 464 * @return a {@link LongKMeans} instance configured for approximate k-means 465 * using an ensemble of KD-Trees 466 */ 467 public static LongKMeans createKDTreeEnsemble(int K) { 468 final KMeansConfiguration<LongNearestNeighbours, long[]> conf = 469 new KMeansConfiguration<LongNearestNeighbours, long[]>(K, new LongNearestNeighboursKDTree.Factory()); 470 471 return new LongKMeans(conf); 472 } 473 474 @Override 475 public String toString() { 476 return String.format("%s: {K=%d, NN=%s}", this.getClass().getSimpleName(), this.conf.K, this.conf.getNearestNeighbourFactory().getClass().getSimpleName()); 477 } 478}