001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.ml.clustering.kmeans; 031 032import java.util.ArrayList; 033import java.util.List; 034import java.util.Random; 035 036import org.openimaj.data.DataSource; 037import org.openimaj.data.DoubleArrayBackedDataSource; 038import org.openimaj.ml.clustering.IndexClusters; 039import org.openimaj.ml.clustering.SpatialClusterer; 040import org.openimaj.util.function.Operation; 041import org.openimaj.util.parallel.Parallel; 042import org.openimaj.util.parallel.Parallel.IntRange; 043 044/** 045 * Multithreaded (optionally) damped spherical k-means with support for 046 * bigger-than-memory data. 047 * <p> 048 * Spherical K-Means uses the inner product as the similarity metric, and is 049 * constrained to finding centroids that lie on the surface of the unit 050 * hypersphere (i.e. their length is 1). More formally, it solves: 051 * <p> 052 * min_{D,s}(sum_i(||Ds^(i) - x^(i)||_2^2)) 053 * <p> 054 * s.t. ||s^(i)||_0 <= 1, for all i and ||D^(j)||_2 = 1, for all i 055 * <p> 056 * where D is a dictionary of centroids (with unit length) and s is an indicator 057 * vector that is all zeros, except for a non-zero value in the position 058 * corresponding top the assigned centroid. 059 * <p> 060 * The optional damping operation includes the previous centroid position in the 061 * update computation, ensuring smoother convergence. 062 * <p> 063 * This implementation performs initialisation by randomly sampling centroids 064 * from a Gaussian distribution, and then normalising to unit length. Any 065 * centroids that become empty during the iterations are replaced by a new 066 * random centroid generated in the same manner. 067 * <p> 068 * This implementation is able to deal with larger-than-memory datasets by 069 * streaming the samples from disk using an appropriate {@link DataSource}. The 070 * only requirement is that there is enough memory to hold all the centroids 071 * plus working memory for the batches of samples being assigned. 072 * 073 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 074 * 075 */ 076public class SphericalKMeans implements SpatialClusterer<SphericalKMeansResult, double[]> { 077 /** 078 * Object storing the result of the previous iteration of spherical kmeans. 079 * The object should be considered to be immutable, and read only. The 080 * kmeans implementation will reuse the same instance for each iteration. 081 * 082 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 083 */ 084 public static class IterationResult { 085 /** 086 * The iteration number, starting from 0 087 */ 088 public int iteration; 089 /** 090 * The change in fitness from the previous iteration 091 */ 092 public double delta; 093 /** 094 * The current results 095 */ 096 public SphericalKMeansResult result; 097 } 098 099 protected final Random rng = new Random(); 100 protected final boolean damped; 101 protected final int maxIters; 102 protected final int k; 103 protected final double terminationEps = 0.1; 104 protected List<Operation<IterationResult>> iterationListeners = new ArrayList<Operation<IterationResult>>(0); 105 106 /** 107 * Construct with the given parameters. Uses damped updates and terminates 108 * if the change in fit between iterations is less than 0.1. 109 * 110 * @param k 111 * number of clusters 112 * @param maxIters 113 * maximum number of iterations 114 */ 115 public SphericalKMeans(int k, int maxIters) { 116 this(k, maxIters, true); 117 } 118 119 /** 120 * Construct with the given parameters. Terminates if the change in fit 121 * between iterations is less than 0.1. 122 * 123 * @param k 124 * number of clusters 125 * @param maxIters 126 * maximum number of iterations 127 * @param damped 128 * use damped updates 129 */ 130 public SphericalKMeans(int k, int maxIters, boolean damped) { 131 this.k = k; 132 this.maxIters = maxIters; 133 this.damped = damped; 134 } 135 136 /** 137 * Construct with the given parameters. Uses damped updates and terminates 138 * if the change in fit between iterations is less than 0.1 or 10 iterations 139 * is reached. 140 * 141 * @param k 142 * number of clusters 143 */ 144 public SphericalKMeans(int k) { 145 this(k, 10); 146 } 147 148 private void makeRandomCentroid(double[] ds) { 149 double sumsq = 0; 150 for (int i = 0; i < ds.length; i++) { 151 ds[i] = rng.nextGaussian(); 152 sumsq += (ds[i] * ds[i]); 153 } 154 155 sumsq = 1 / Math.sqrt(sumsq); 156 for (int i = 0; i < ds.length; i++) { 157 ds[i] *= sumsq; 158 } 159 } 160 161 double performIteration(final DataSource<double[]> data, final SphericalKMeansResult result) { 162 final int[] clusterSizes = new int[result.centroids.length]; 163 final double[][] newCentroids = new double[result.centroids.length][result.centroids[0].length]; 164 165 final double[] delta = { 0 }; 166 167 // perform the assignments 168 Parallel.forRange(0, data.size(), 1, new Operation<Parallel.IntRange>() { 169 @Override 170 public void perform(IntRange range) { 171 for (int i = range.start; i < range.stop; i++) { 172 final double[] vector = data.getData(i); 173 double assignmentWeight = Double.MIN_VALUE; 174 for (int j = 0; j < result.centroids.length; j++) { 175 final double[] centroid = result.centroids[j]; 176 177 double dp = 0; 178 for (int k = 0; k < centroid.length; k++) { 179 dp += centroid[k] * vector[k]; 180 } 181 182 if (dp > assignmentWeight) { 183 assignmentWeight = dp; 184 result.assignments[i] = j; 185 } 186 } 187 188 // aggregate the assignments to the relevant cluster 189 synchronized (newCentroids) { 190 clusterSizes[result.assignments[i]]++; 191 delta[0] += assignmentWeight; 192 for (int k = 0; k < newCentroids[0].length; k++) { 193 newCentroids[result.assignments[i]][k] += vector[k]; 194 } 195 } 196 } 197 } 198 }); 199 200 // update the centroids 201 Parallel.forRange(0, result.centroids.length, 1, new Operation<Parallel.IntRange>() { 202 @Override 203 public void perform(IntRange range) { 204 for (int j = range.start; j < range.stop; j++) { 205 if (clusterSizes[j] == 0) { 206 // reinit to random vector 207 makeRandomCentroid(result.centroids[j]); 208 } else { 209 final double[] centroid = result.centroids[j]; 210 final double[] ncentroid = newCentroids[j]; 211 212 double norm = 0; 213 if (damped) { 214 for (int k = 0; k < centroid.length; k++) { 215 centroid[k] += ncentroid[k]; 216 norm += centroid[k] * centroid[k]; 217 } 218 } else { 219 for (int k = 0; k < centroid.length; k++) { 220 centroid[k] = ncentroid[k]; 221 norm += centroid[k] * centroid[k]; 222 } 223 } 224 norm = 1.0 / Math.sqrt(norm); 225 226 for (int k = 0; k < ncentroid.length; k++) { 227 centroid[k] *= norm; 228 } 229 } 230 } 231 } 232 }); 233 234 return delta[0]; 235 } 236 237 @Override 238 public int[][] performClustering(double[][] data) { 239 return new IndexClusters(cluster(data).assignments).clusters(); 240 } 241 242 @Override 243 public SphericalKMeansResult cluster(double[][] data) { 244 return cluster(new DoubleArrayBackedDataSource(data)); 245 } 246 247 @Override 248 public SphericalKMeansResult cluster(DataSource<double[]> data) { 249 final IterationResult ir = new IterationResult(); 250 ir.result = new SphericalKMeansResult(); 251 ir.result.centroids = new double[k][data.numDimensions()]; 252 ir.result.assignments = new int[data.size()]; 253 254 for (int j = 0; j < ir.result.centroids.length; j++) { 255 makeRandomCentroid(ir.result.centroids[j]); 256 } 257 258 double last = 0; 259 for (ir.iteration = 0; ir.iteration < maxIters; ir.iteration++) { 260 for (final Operation<IterationResult> l : iterationListeners) 261 l.perform(ir); 262 263 final double d = performIteration(data, ir.result); 264 ir.delta = d - last; 265 266 if (ir.delta < terminationEps) 267 break; 268 last = d; 269 } 270 271 return ir.result; 272 } 273 274 /** 275 * Add a listener that will be called before every iteration. 276 * 277 * @param op 278 * the listener 279 */ 280 public void addIterationListener(Operation<IterationResult> op) { 281 iterationListeners.add(op); 282 } 283}