001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.ml.clustering.kmeans;
031
032import java.util.concurrent.ExecutorService;
033
034import org.openimaj.knn.NearestNeighbours;
035import org.openimaj.knn.NearestNeighboursFactory;
036import org.openimaj.util.parallel.GlobalExecutorPool;
037
038/**
039 * Configuration for the K-Means algorithm.
040 * 
041 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
042 * 
043 * @param <NN>
044 *            The type of {@link NearestNeighbours} to use
045 * @param <DATA>
046 *            The type of data
047 */
048public class KMeansConfiguration<NN extends NearestNeighbours<DATA, ?, ?>, DATA> implements Cloneable {
049        /**
050         * The default number of samples per parallel assignment instance.
051         */
052        public static final int DEFAULT_BLOCK_SIZE = 50000;
053
054        /**
055         * The default number of iterations.
056         */
057        public static final int DEFAULT_NUMBER_ITERATIONS = 30;
058
059        /**
060         * The number of clusters
061         */
062        protected int K;
063
064        /**
065         * The factory for producing the {@link NearestNeighbours} objects used in
066         * assignment.
067         */
068        protected NearestNeighboursFactory<? extends NN, DATA> factory;
069
070        /**
071         * The size of processing blocks for each thread
072         */
073        protected int blockSize;
074
075        /**
076         * The max number of iterations
077         */
078        protected int niters;
079
080        /**
081         * The threadpool for parallel processing
082         */
083        protected ExecutorService threadpool;
084
085        /**
086         * Create configuration for data that will create <code>K</code> clusters.
087         * The algorithm will run for a maximum of
088         * {@link #DEFAULT_NUMBER_ITERATIONS} iterations, and make use of all
089         * available processors, processing with blocks of
090         * {@link #DEFAULT_BLOCK_SIZE} vectors.
091         * <p>
092         * The specified {@link NearestNeighboursFactory} determines the actual type
093         * of k-means that will be performed; it could be exact nearest-neighbours,
094         * or it could be an approximate method, for example based on an ensemble of
095         * kd-trees.
096         * 
097         * @param K
098         *            number of clusters to be found
099         * @param nnFactory
100         *            the factory for producing the {@link NearestNeighbours}.
101         */
102        public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory) {
103                this(K, nnFactory, DEFAULT_NUMBER_ITERATIONS, DEFAULT_BLOCK_SIZE, GlobalExecutorPool.getPool());
104        }
105
106        /**
107         * Create configuration for data that will create <code>K</code> clusters.
108         * The algorithm will run for a maximum of the given number of iterations,
109         * and will make use of all available processors, processing with blocks of
110         * {@link #DEFAULT_BLOCK_SIZE} vectors.
111         * <p>
112         * The specified {@link NearestNeighboursFactory} determines the actual type
113         * of k-means that will be performed; it could be exact nearest-neighbours,
114         * or it could be an approximate method, for example based on an ensemble of
115         * kd-trees.
116         * 
117         * @param K
118         *            number of clusters to be found
119         * @param nnFactory
120         *            the factory for producing the {@link NearestNeighbours}.
121         * @param niters
122         *            number of iterations
123         */
124        public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory, int niters) {
125                this(K, nnFactory, niters, DEFAULT_BLOCK_SIZE, GlobalExecutorPool.getPool());
126        }
127
128        /**
129         * Create configuration for data that will create <code>K</code> clusters.
130         * The algorithm will run for a maximum of the given number of iterations,
131         * and will make use of the provided threadpool, processing with blocks of
132         * {@link #DEFAULT_BLOCK_SIZE} vectors.
133         * <p>
134         * The specified {@link NearestNeighboursFactory} determines the actual type
135         * of k-means that will be performed; it could be exact nearest-neighbours,
136         * or it could be an approximate method, for example based on an ensemble of
137         * kd-trees.
138         * 
139         * @param K
140         *            number of clusters to be found
141         * @param nnFactory
142         *            the factory for producing the {@link NearestNeighbours}.
143         * @param threadpool
144         *            threadpool to use for parallel processing
145         * @param niters
146         *            number of training iterations
147         */
148        public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory, int niters,
149                        ExecutorService threadpool)
150        {
151                this(K, nnFactory, niters, DEFAULT_BLOCK_SIZE, threadpool);
152        }
153
154        /**
155         * Create configuration for data with <code>M</code> dimensions that will
156         * create <code>K</code> clusters. The algorithm will run for a maximum of
157         * the given number of iterations, and will make use of
158         * <code>nThreads</code> processors, processing with blocks of
159         * {@link #DEFAULT_BLOCK_SIZE} vectors.
160         * <p>
161         * The specified {@link NearestNeighboursFactory} determines the actual type
162         * of k-means that will be performed; it could be exact nearest-neighbours,
163         * or it could be an approximate method, for example based on an ensemble of
164         * kd-trees.
165         * 
166         * @param K
167         *            number of clusters to be found
168         * @param nnFactory
169         *            the factory for producing the {@link NearestNeighbours}.
170         * @param threadpool
171         *            threadpool to use for parallel processing
172         * @param blockSize
173         *            number of samples per parallel thread
174         * @param niters
175         *            number of training iterations
176         */
177        public KMeansConfiguration(int K, NearestNeighboursFactory<? extends NN, DATA> nnFactory, int niters,
178                        int blockSize, ExecutorService threadpool)
179        {
180                this.K = K;
181                this.factory = nnFactory;
182                this.niters = niters;
183                this.blockSize = blockSize;
184                this.threadpool = (threadpool == null ? GlobalExecutorPool.getPool() : threadpool);
185        }
186
187        /**
188         * A completely default configuration used primarily as a convenience
189         * function for reading. The number of dimensions, number of clusters and
190         * nearest-neighbours factory must be set before the configuration is used.
191         */
192        public KMeansConfiguration() {
193                this(0, null);
194        }
195
196        /*
197         * (non-Javadoc)
198         * 
199         * @see java.lang.Object#clone()
200         */
201        @SuppressWarnings("unchecked")
202        @Override
203        public KMeansConfiguration<NN, DATA> clone() {
204                try {
205                        return (KMeansConfiguration<NN, DATA>) super.clone();
206                } catch (final CloneNotSupportedException e) {
207                        throw new RuntimeException(e);
208                }
209        }
210
211        /**
212         * Get the number of clusters
213         * 
214         * @return the number of clusters
215         */
216        public int getK() {
217                return K;
218        }
219
220        /**
221         * Set the number of clusters
222         * 
223         * @param k
224         *            the number of clusters
225         */
226        public void setK(int k) {
227                K = k;
228        }
229
230        /**
231         * Get the number of clusters
232         * 
233         * @return the number of clusters
234         */
235        public int numClusters() {
236                return K;
237        }
238
239        /**
240         * Set the number of clusters
241         * 
242         * @param k
243         *            the number of clusters
244         */
245        public void setNumClusters(int k) {
246                K = k;
247        }
248
249        /**
250         * Get the number of samples processed in a batch by a thread. This needs to
251         * be small enough that that the memory isn't exhausted, but big enough for
252         * the thread to have enough data to work for a while.
253         * 
254         * @return the the number of samples processed in a batch by a thread
255         */
256        public int getBlockSize() {
257                return blockSize;
258        }
259
260        /**
261         * Set the number of samples processed in a batch by a thread. This needs to
262         * be small enough that that the memory isn't exhausted, but big enough for
263         * the thread to have enough data to work for a while.
264         * 
265         * @param blockSize
266         *            the number of samples processed in a batch by a thread
267         */
268        public void setBlockSize(int blockSize) {
269                this.blockSize = blockSize;
270        }
271
272        /**
273         * Get the maximum allowed number of iterations.
274         * 
275         * @return the maximum allowed number of iterations.
276         */
277        public int getMaxIterations() {
278                return niters;
279        }
280
281        /**
282         * Set the maximum allowed number of iterations.
283         * 
284         * @param niters
285         *            the maximum allowed number of iterations.
286         */
287        public void setMaxIterations(int niters) {
288                this.niters = niters;
289        }
290
291        /**
292         * Get the factory that produces the {@link NearestNeighbours} during
293         * clustering.
294         * 
295         * @return the factory
296         */
297        public NearestNeighboursFactory<? extends NN, DATA> getNearestNeighbourFactory() {
298                return factory;
299        }
300
301        /**
302         * Set the factory that produces the {@link NearestNeighbours} during
303         * clustering.
304         * 
305         * @param factory
306         *            the factory to set
307         */
308        public void setNearestNeighbourFactory(NearestNeighboursFactory<? extends NN, DATA> factory) {
309                this.factory = factory;
310        }
311}