001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.ml.clustering.rac;
031
032import java.util.SortedMap;
033import java.util.TreeMap;
034
035import org.apache.commons.math.FunctionEvaluationException;
036import org.apache.commons.math.analysis.UnivariateRealFunction;
037
038/**
039 * Similar to {@link IntRAC} but explicitly specify the limit the number of
040 * clusters. Attempts to replace clusters which are "closer" with those that are
041 * "further".
042 * <p>
043 * As with the {@link IntRAC} this is a combined clusterer, clusters and
044 * assigner. See the {@link IntRAC} documentation for more details.
045 * 
046 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
047 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
048 */
049public class ClusterLimitedIntRAC extends IntRAC {
050        static class ClusterMinimisationFunction implements UnivariateRealFunction {
051                private int[][] distances;
052                private int[][] samples;
053                private int nClusters;
054
055                public ClusterMinimisationFunction(int[][] samples, int[][] distances, int nClusters) {
056                        this.distances = distances;
057                        this.samples = samples;
058                        this.nClusters = nClusters;
059                }
060
061                @Override
062                public double value(double radius) throws FunctionEvaluationException {
063                        final ClusterLimitedIntRAC r = new ClusterLimitedIntRAC(radius);
064                        r.train(samples, distances);
065                        final int diff = this.nClusters - r.numClusters();
066                        return diff;
067                }
068
069        }
070
071        private int expectedClusters;
072        private SortedMap<Float, Integer> thresholdOvershots;
073
074        /**
075         * Sets the expected number of clusters to 100 and radius to 128.
076         */
077        public ClusterLimitedIntRAC() {
078                super();
079                this.expectedClusters = 100;
080                thresholdOvershots = new TreeMap<Float, Integer>();
081        }
082
083        /**
084         * Set the number of clusters to 100.
085         * 
086         * @param radiusSquared
087         */
088        public ClusterLimitedIntRAC(double radiusSquared) {
089                super(radiusSquared);
090                this.expectedClusters = 100;
091                thresholdOvershots = new TreeMap<Float, Integer>();
092        }
093
094        /**
095         * Attempt to learn the threshold and uses this as an expected number of
096         * clusters.
097         * 
098         * @param bKeys
099         * @param subSamples
100         * @param nClusters
101         */
102        public ClusterLimitedIntRAC(int[][] bKeys, int subSamples, int nClusters) {
103                super(bKeys, subSamples, nClusters);
104                this.expectedClusters = (int) ((((float) nClusters) / subSamples) * bKeys.length);
105        }
106
107        @Override
108        public ClusterLimitedIntRAC cluster(int[][] data) {
109                int foundLength = this.nDims;
110
111                for (final int[] entry : data) {
112                        if (foundLength == -1)
113                                foundLength = entry.length;
114
115                        // all the data entries must be the same length otherwise this
116                        // doesn't make sense
117                        if (foundLength != entry.length) {
118                                throw new RuntimeException();
119                        }
120
121                        boolean found = false;
122                        float minDiff = 0;
123
124                        for (final int[] existing : this.codebook) {
125                                float distance = distanceEuclidianSquared(entry, existing);
126                                if (distance < threshold) {
127                                        found = true;
128                                        break;
129                                }
130                                distance = (float) (distance - threshold);
131                                if (minDiff == 0 || distance < minDiff)
132                                        minDiff = distance;
133                        }
134
135                        if (!found) {
136                                if (this.numClusters() >= this.expectedClusters) {
137                                        // Remove the current smallest distance with this centroid
138                                        final Float smallestDistance = this.thresholdOvershots.firstKey();
139                                        if (smallestDistance < minDiff) {
140
141                                                final Integer index = this.thresholdOvershots.get(smallestDistance);
142                                                this.codebook.remove((int) index);
143                                                this.codebook.add(index, entry);
144                                                this.thresholdOvershots.remove(smallestDistance);
145                                                this.thresholdOvershots.put(minDiff, this.numClusters() - 1);
146                                                // System.out.println("I have replaced a less significant distance, new least significant distance is "
147                                                // + this.thresholdOvershots.firstKey());
148                                        }
149                                } else {
150                                        this.codebook.add(entry);
151                                        if (this.numClusters() % 1000 == 0) {
152                                                System.out.println("Codebook increased to size " + this.numClusters());
153                                                System.out.println("with nSamples = " + this.totalSamples);
154                                        }
155                                        this.thresholdOvershots.put(minDiff, this.numClusters() - 1);
156                                }
157
158                        }
159                        this.totalSamples += 1;
160                }
161                this.nDims = foundLength;
162
163                return this;
164        }
165}