001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.ml.clustering.rac; 031 032import java.util.SortedMap; 033import java.util.TreeMap; 034 035import org.apache.commons.math.FunctionEvaluationException; 036import org.apache.commons.math.analysis.UnivariateRealFunction; 037 038/** 039 * Similar to {@link IntRAC} but explicitly specify the limit the number of 040 * clusters. Attempts to replace clusters which are "closer" with those that are 041 * "further". 042 * <p> 043 * As with the {@link IntRAC} this is a combined clusterer, clusters and 044 * assigner. See the {@link IntRAC} documentation for more details. 045 * 046 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 047 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 048 */ 049public class ClusterLimitedIntRAC extends IntRAC { 050 static class ClusterMinimisationFunction implements UnivariateRealFunction { 051 private int[][] distances; 052 private int[][] samples; 053 private int nClusters; 054 055 public ClusterMinimisationFunction(int[][] samples, int[][] distances, int nClusters) { 056 this.distances = distances; 057 this.samples = samples; 058 this.nClusters = nClusters; 059 } 060 061 @Override 062 public double value(double radius) throws FunctionEvaluationException { 063 final ClusterLimitedIntRAC r = new ClusterLimitedIntRAC(radius); 064 r.train(samples, distances); 065 final int diff = this.nClusters - r.numClusters(); 066 return diff; 067 } 068 069 } 070 071 private int expectedClusters; 072 private SortedMap<Float, Integer> thresholdOvershots; 073 074 /** 075 * Sets the expected number of clusters to 100 and radius to 128. 076 */ 077 public ClusterLimitedIntRAC() { 078 super(); 079 this.expectedClusters = 100; 080 thresholdOvershots = new TreeMap<Float, Integer>(); 081 } 082 083 /** 084 * Set the number of clusters to 100. 085 * 086 * @param radiusSquared 087 */ 088 public ClusterLimitedIntRAC(double radiusSquared) { 089 super(radiusSquared); 090 this.expectedClusters = 100; 091 thresholdOvershots = new TreeMap<Float, Integer>(); 092 } 093 094 /** 095 * Attempt to learn the threshold and uses this as an expected number of 096 * clusters. 097 * 098 * @param bKeys 099 * @param subSamples 100 * @param nClusters 101 */ 102 public ClusterLimitedIntRAC(int[][] bKeys, int subSamples, int nClusters) { 103 super(bKeys, subSamples, nClusters); 104 this.expectedClusters = (int) ((((float) nClusters) / subSamples) * bKeys.length); 105 } 106 107 @Override 108 public ClusterLimitedIntRAC cluster(int[][] data) { 109 int foundLength = this.nDims; 110 111 for (final int[] entry : data) { 112 if (foundLength == -1) 113 foundLength = entry.length; 114 115 // all the data entries must be the same length otherwise this 116 // doesn't make sense 117 if (foundLength != entry.length) { 118 throw new RuntimeException(); 119 } 120 121 boolean found = false; 122 float minDiff = 0; 123 124 for (final int[] existing : this.codebook) { 125 float distance = distanceEuclidianSquared(entry, existing); 126 if (distance < threshold) { 127 found = true; 128 break; 129 } 130 distance = (float) (distance - threshold); 131 if (minDiff == 0 || distance < minDiff) 132 minDiff = distance; 133 } 134 135 if (!found) { 136 if (this.numClusters() >= this.expectedClusters) { 137 // Remove the current smallest distance with this centroid 138 final Float smallestDistance = this.thresholdOvershots.firstKey(); 139 if (smallestDistance < minDiff) { 140 141 final Integer index = this.thresholdOvershots.get(smallestDistance); 142 this.codebook.remove((int) index); 143 this.codebook.add(index, entry); 144 this.thresholdOvershots.remove(smallestDistance); 145 this.thresholdOvershots.put(minDiff, this.numClusters() - 1); 146 // System.out.println("I have replaced a less significant distance, new least significant distance is " 147 // + this.thresholdOvershots.firstKey()); 148 } 149 } else { 150 this.codebook.add(entry); 151 if (this.numClusters() % 1000 == 0) { 152 System.out.println("Codebook increased to size " + this.numClusters()); 153 System.out.println("with nSamples = " + this.totalSamples); 154 } 155 this.thresholdOvershots.put(minDiff, this.numClusters() - 1); 156 } 157 158 } 159 this.totalSamples += 1; 160 } 161 this.nDims = foundLength; 162 163 return this; 164 } 165}