001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.ml.clustering.kdtree; 031 032import java.util.ArrayList; 033import java.util.HashSet; 034import java.util.List; 035import java.util.Set; 036 037import org.apache.log4j.Logger; 038 039/** 040 * Load clusters from http://people.cs.nctu.edu.tw/~rsliang/dbscan/testdatagen.html 041 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 042 * 043 */ 044public class ClusterTestDataLoader{ 045 /** 046 * Test details 047 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 048 * 049 */ 050 public static class TestStats{ 051 /** 052 * EPS variable 053 */ 054 public double eps; 055 /** 056 * minpts variable 057 */ 058 public int minpts; 059 /** 060 * nclusters variable 061 */ 062 public int ncluster; 063 /** 064 * noutliers variable 065 */ 066 public int noutliers; 067 /** 068 * mineps variable 069 */ 070 public double mineps; 071 } 072 private int percluster = -1; 073 private boolean outliers = true; 074 075 076 /** 077 * 078 */ 079 public ClusterTestDataLoader() { 080 this.percluster = -1; 081 } 082 083 /** 084 * @param percluster 085 * @param outliers 086 * 087 */ 088 public ClusterTestDataLoader(int percluster, boolean outliers) { 089 this.percluster = percluster; 090 this.outliers = outliers; 091 } 092 093 private Logger logger = Logger.getLogger(ClusterTestDataLoader.class); 094 private TestStats testStats; 095 private int[][] testClusters; 096 private double[][] testData; 097 /** 098 * @param data 099 * @return read {@link TestStats} 100 */ 101 private TestStats readTestStats(String[] data) { 102 ClusterTestDataLoader.TestStats ret = new TestStats(); 103 int i = 0; 104 ret.eps = Double.parseDouble(data[i++].split("=")[1].trim()); 105 ret.minpts = Integer.parseInt(data[i++].split("=")[1].trim()); 106 ret.ncluster = Integer.parseInt(data[i++].split("=")[1].trim()); 107 ret.noutliers = Integer.parseInt(data[i++].split("=")[1].trim()); 108 ret.mineps = Double.parseDouble(data[i++].split("=")[1].trim()); 109 return ret; 110 } 111 112 113 /** 114 * @param data 115 * @return read the correct clusters 116 */ 117 private int[][] readTestClusters(String[] data) { 118 int i = 0; 119 for (;data[i].length()!=0; i++); 120 for (i=i+1;data[i].length()!=0; i++); 121 List<int[]> clusters = new ArrayList<int[]>(); 122 int count = 0; 123 for (i=i+1;i<data.length; i++){ 124 int[] readIntDataLine = readIntDataLine(data[i]); 125 clusters.add(readIntDataLine); 126 count += readIntDataLine.length; 127 } 128 logger .debug(String.format("Loading %d items in %d clusters\n",count,clusters.size())); 129 return clusters.toArray(new int[clusters.size()][]); 130 } 131 132 133 /** 134 * @param string 135 * @return read 136 */ 137 public int[] readIntDataLine(String string) { 138 String[] split = string.split(","); 139 int[] arr = new int[split.length-1]; 140 int i = 0; 141 142 for (String s : split) { 143 if(s.contains("<"))continue; // skip the first, it is the cluster index 144 s = s.replace(">", "").trim(); 145 arr[i++] = Integer.parseInt(s)-1; 146 147 } 148 return arr; 149 } 150 /** 151 * @param data 152 * @return read the test data 153 */ 154 private double[][] readTestData(String[] data) { 155 156 int i = 0; 157 for (;data[i].length()!=0; i++); 158 List<double[]> dataL = new ArrayList<double[]>(); 159 int start = i+1; 160 for (i=start;data[i].length()!=0; i++){ 161 dataL.add(readDataLine(data[i])); 162 } 163 logger.debug(String.format("Loading %d data items\n",dataL.size())); 164 return dataL.toArray(new double[dataL.size()][]); 165 } 166 private Set<Integer> existing(int[][] correct) { 167 Set<Integer> exist = new HashSet<Integer>(); 168 for (int[] is : correct) { 169 for (int i : is) { 170 exist.add(i); 171 } 172 } 173 return exist; 174 } 175 176 private double[] readDataLine(String string) { 177 String[] split = string.split(" "); 178 double[] arr = new double[]{ 179 Double.parseDouble(split[1]), 180 Double.parseDouble(split[2]) 181 }; 182 return arr; 183 } 184 185 public void prepare(String[] data) { 186 this.testStats = this.readTestStats(data); 187 this.testClusters = this.readTestClusters(data); 188 this.testData = this.readTestData(data); 189 correctClusters(); 190 } 191 192 private void correctClusters() { 193 194 if(this.percluster != -1){ 195 double[][] correctedData = null; 196 int[][] correctedClusters = new int[this.testClusters.length][this.percluster]; 197 int seen ; 198 if(this.outliers){ 199 seen = this.testStats.noutliers; 200 correctedData= new double[this.percluster * this.testClusters.length + seen][]; 201 for (int i = 0; i < seen; i++) { 202 correctedData[i] = this.testData[i]; 203 } 204 205 } 206 else{ 207 seen = 0; 208 correctedData = new double[this.percluster * this.testClusters.length][]; 209 } 210 for (int i = 0; i < this.testClusters.length; i++) { 211 int[] clust = this.testClusters[i]; 212 for (int j = 0; j < this.percluster; j++) { 213 int d = clust[j]; 214 correctedData[seen] = this.testData[d]; 215 correctedClusters[i][j] = seen; 216 seen++; 217 } 218 } 219 220 this.testClusters = correctedClusters; 221 this.testData = correctedData; 222 } 223 } 224 225 public TestStats getTestStats() { 226 return this.testStats; 227 } 228 229 public double[][] getTestData() { 230 return this.testData; 231 } 232 233 public int[][] getTestClusters() { 234 return this.testClusters; 235 } 236}