001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.ml.clustering.kmeans;
031
032import java.util.ArrayList;
033import java.util.List;
034import java.util.Random;
035
036import org.openimaj.data.DataSource;
037import org.openimaj.data.DoubleArrayBackedDataSource;
038import org.openimaj.ml.clustering.IndexClusters;
039import org.openimaj.ml.clustering.SpatialClusterer;
040import org.openimaj.util.function.Operation;
041import org.openimaj.util.parallel.Parallel;
042import org.openimaj.util.parallel.Parallel.IntRange;
043
044/**
045 * Multithreaded (optionally) damped spherical k-means with support for
046 * bigger-than-memory data.
047 * <p>
048 * Spherical K-Means uses the inner product as the similarity metric, and is
049 * constrained to finding centroids that lie on the surface of the unit
050 * hypersphere (i.e. their length is 1). More formally, it solves:
051 * <p>
052 * min_{D,s}(sum_i(||Ds^(i) - x^(i)||_2^2))
053 * <p>
054 * s.t. ||s^(i)||_0 <= 1, for all i and ||D^(j)||_2 = 1, for all i
055 * <p>
056 * where D is a dictionary of centroids (with unit length) and s is an indicator
057 * vector that is all zeros, except for a non-zero value in the position
058 * corresponding top the assigned centroid.
059 * <p>
060 * The optional damping operation includes the previous centroid position in the
061 * update computation, ensuring smoother convergence.
062 * <p>
063 * This implementation performs initialisation by randomly sampling centroids
064 * from a Gaussian distribution, and then normalising to unit length. Any
065 * centroids that become empty during the iterations are replaced by a new
066 * random centroid generated in the same manner.
067 * <p>
068 * This implementation is able to deal with larger-than-memory datasets by
069 * streaming the samples from disk using an appropriate {@link DataSource}. The
070 * only requirement is that there is enough memory to hold all the centroids
071 * plus working memory for the batches of samples being assigned.
072 *
073 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
074 *
075 */
076public class SphericalKMeans implements SpatialClusterer<SphericalKMeansResult, double[]> {
077        /**
078         * Object storing the result of the previous iteration of spherical kmeans.
079         * The object should be considered to be immutable, and read only. The
080         * kmeans implementation will reuse the same instance for each iteration.
081         *
082         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
083         */
084        public static class IterationResult {
085                /**
086                 * The iteration number, starting from 0
087                 */
088                public int iteration;
089                /**
090                 * The change in fitness from the previous iteration
091                 */
092                public double delta;
093                /**
094                 * The current results
095                 */
096                public SphericalKMeansResult result;
097        }
098
099        protected final Random rng = new Random();
100        protected final boolean damped;
101        protected final int maxIters;
102        protected final int k;
103        protected final double terminationEps = 0.1;
104        protected List<Operation<IterationResult>> iterationListeners = new ArrayList<Operation<IterationResult>>(0);
105
106        /**
107         * Construct with the given parameters. Uses damped updates and terminates
108         * if the change in fit between iterations is less than 0.1.
109         *
110         * @param k
111         *            number of clusters
112         * @param maxIters
113         *            maximum number of iterations
114         */
115        public SphericalKMeans(int k, int maxIters) {
116                this(k, maxIters, true);
117        }
118
119        /**
120         * Construct with the given parameters. Terminates if the change in fit
121         * between iterations is less than 0.1.
122         *
123         * @param k
124         *            number of clusters
125         * @param maxIters
126         *            maximum number of iterations
127         * @param damped
128         *            use damped updates
129         */
130        public SphericalKMeans(int k, int maxIters, boolean damped) {
131                this.k = k;
132                this.maxIters = maxIters;
133                this.damped = damped;
134        }
135
136        /**
137         * Construct with the given parameters. Uses damped updates and terminates
138         * if the change in fit between iterations is less than 0.1 or 10 iterations
139         * is reached.
140         *
141         * @param k
142         *            number of clusters
143         */
144        public SphericalKMeans(int k) {
145                this(k, 10);
146        }
147
148        private void makeRandomCentroid(double[] ds) {
149                double sumsq = 0;
150                for (int i = 0; i < ds.length; i++) {
151                        ds[i] = rng.nextGaussian();
152                        sumsq += (ds[i] * ds[i]);
153                }
154
155                sumsq = 1 / Math.sqrt(sumsq);
156                for (int i = 0; i < ds.length; i++) {
157                        ds[i] *= sumsq;
158                }
159        }
160
161        double performIteration(final DataSource<double[]> data, final SphericalKMeansResult result) {
162                final int[] clusterSizes = new int[result.centroids.length];
163                final double[][] newCentroids = new double[result.centroids.length][result.centroids[0].length];
164
165                final double[] delta = { 0 };
166
167                // perform the assignments
168                Parallel.forRange(0, data.size(), 1, new Operation<Parallel.IntRange>() {
169                        @Override
170                        public void perform(IntRange range) {
171                                for (int i = range.start; i < range.stop; i++) {
172                                        final double[] vector = data.getData(i);
173                                        double assignmentWeight = Double.MIN_VALUE;
174                                        for (int j = 0; j < result.centroids.length; j++) {
175                                                final double[] centroid = result.centroids[j];
176
177                                                double dp = 0;
178                                                for (int k = 0; k < centroid.length; k++) {
179                                                        dp += centroid[k] * vector[k];
180                                                }
181
182                                                if (dp > assignmentWeight) {
183                                                        assignmentWeight = dp;
184                                                        result.assignments[i] = j;
185                                                }
186                                        }
187
188                                        // aggregate the assignments to the relevant cluster
189                                        synchronized (newCentroids) {
190                                                clusterSizes[result.assignments[i]]++;
191                                                delta[0] += assignmentWeight;
192                                                for (int k = 0; k < newCentroids[0].length; k++) {
193                                                        newCentroids[result.assignments[i]][k] += vector[k];
194                                                }
195                                        }
196                                }
197                        }
198                });
199
200                // update the centroids
201                Parallel.forRange(0, result.centroids.length, 1, new Operation<Parallel.IntRange>() {
202                        @Override
203                        public void perform(IntRange range) {
204                                for (int j = range.start; j < range.stop; j++) {
205                                        if (clusterSizes[j] == 0) {
206                                                // reinit to random vector
207                                                makeRandomCentroid(result.centroids[j]);
208                                        } else {
209                                                final double[] centroid = result.centroids[j];
210                                                final double[] ncentroid = newCentroids[j];
211
212                                                double norm = 0;
213                                                if (damped) {
214                                                        for (int k = 0; k < centroid.length; k++) {
215                                                                centroid[k] += ncentroid[k];
216                                                                norm += centroid[k] * centroid[k];
217                                                        }
218                                                } else {
219                                                        for (int k = 0; k < centroid.length; k++) {
220                                                                centroid[k] = ncentroid[k];
221                                                                norm += centroid[k] * centroid[k];
222                                                        }
223                                                }
224                                                norm = 1.0 / Math.sqrt(norm);
225
226                                                for (int k = 0; k < ncentroid.length; k++) {
227                                                        centroid[k] *= norm;
228                                                }
229                                        }
230                                }
231                        }
232                });
233
234                return delta[0];
235        }
236
237        @Override
238        public int[][] performClustering(double[][] data) {
239                return new IndexClusters(cluster(data).assignments).clusters();
240        }
241
242        @Override
243        public SphericalKMeansResult cluster(double[][] data) {
244                return cluster(new DoubleArrayBackedDataSource(data));
245        }
246
247        @Override
248        public SphericalKMeansResult cluster(DataSource<double[]> data) {
249                final IterationResult ir = new IterationResult();
250                ir.result = new SphericalKMeansResult();
251                ir.result.centroids = new double[k][data.numDimensions()];
252                ir.result.assignments = new int[data.size()];
253
254                for (int j = 0; j < ir.result.centroids.length; j++) {
255                        makeRandomCentroid(ir.result.centroids[j]);
256                }
257
258                double last = 0;
259                for (ir.iteration = 0; ir.iteration < maxIters; ir.iteration++) {
260                        for (final Operation<IterationResult> l : iterationListeners)
261                                l.perform(ir);
262
263                        final double d = performIteration(data, ir.result);
264                        ir.delta = d - last;
265
266                        if (ir.delta < terminationEps)
267                                break;
268                        last = d;
269                }
270
271                return ir.result;
272        }
273
274        /**
275         * Add a listener that will be called before every iteration.
276         *
277         * @param op
278         *            the listener
279         */
280        public void addIterationListener(Operation<IterationResult> op) {
281                iterationListeners.add(op);
282        }
283}