View Javadoc

1   /**
2    * Copyright (c) 2011, The University of Southampton and the individual contributors.
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without modification,
6    * are permitted provided that the following conditions are met:
7    *
8    *   * 	Redistributions of source code must retain the above copyright notice,
9    * 	this list of conditions and the following disclaimer.
10   *
11   *   *	Redistributions in binary form must reproduce the above copyright notice,
12   * 	this list of conditions and the following disclaimer in the documentation
13   * 	and/or other materials provided with the distribution.
14   *
15   *   *	Neither the name of the University of Southampton nor the names of its
16   * 	contributors may be used to endorse or promote products derived from this
17   * 	software without specific prior written permission.
18   *
19   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22   * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29   */
30  package org.openimaj.ml.clustering.kdtree;
31  
32  import java.util.ArrayList;
33  import java.util.HashSet;
34  import java.util.List;
35  import java.util.Set;
36  
37  import org.apache.log4j.Logger;
38  
39  /**
40   * Parses clustering test data in the format generated by http://people.cs.nctu.edu.tw/~rsliang/dbscan/testdatagen.html
41   * @author Sina Samangooei (ss@ecs.soton.ac.uk)
42   *
43   */
44  public class ClusterTestDataLoader{
45  	/**
46  	 * Test details
47  	 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
48  	 *
49  	 */
50  	public static class TestStats{
51  		/**
52  		 * EPS variable
53  		 */
54  		public double eps;
55  		/**
56  		 * minpts variable
57  		 */
58  		public int minpts;
59  		/**
60  		 * nclusters variable
61  		 */
62  		public int ncluster;
63  		/**
64  		 * noutliers variable
65  		 */
66  		public int noutliers;
67  		/**
68  		 * mineps variable
69  		 */
70  		public double mineps;
71  	}
72  	private int percluster = -1;
73  	private boolean outliers = true;
74  	
75  	
76  	/**
77  	 * 
78  	 */
79  	public ClusterTestDataLoader() {
80  		this.percluster = -1;
81  	}
82  	
83  	/**
84  	 * @param percluster 
85  	 * @param outliers 
86  	 * 
87  	 */
88  	public ClusterTestDataLoader(int percluster, boolean outliers) {
89  		this.percluster = percluster;
90  		this.outliers = outliers;
91  	}
92  
93  	private Logger logger = Logger.getLogger(ClusterTestDataLoader.class);
94  	private TestStats testStats;
95  	private int[][] testClusters;
96  	private double[][] testData;
97  	/**
98  	 * @param data
99  	 * @return read {@link TestStats}
100 	 */
101 	private TestStats readTestStats(String[] data) {
102 		ClusterTestDataLoader.TestStats ret = new TestStats();
103 		int i = 0;
104 		ret.eps = Double.parseDouble(data[i++].split("=")[1].trim());
105 		ret.minpts = Integer.parseInt(data[i++].split("=")[1].trim());
106 		ret.ncluster = Integer.parseInt(data[i++].split("=")[1].trim());
107 		ret.noutliers = Integer.parseInt(data[i++].split("=")[1].trim());
108 		ret.mineps = Double.parseDouble(data[i++].split("=")[1].trim());
109 		return ret;
110 	}
111 
112 
113 	/**
114 	 * @param data
115 	 * @return read the correct clusters
116 	 */
117 	private int[][] readTestClusters(String[] data) {
118 		int i = 0;
119 		for (;data[i].length()!=0; i++);
120 		for (i=i+1;data[i].length()!=0; i++);
121 		List<int[]> clusters = new ArrayList<int[]>();
122 		int count = 0;
123 		for (i=i+1;i<data.length; i++){
124 			int[] readIntDataLine = readIntDataLine(data[i]);
125 			clusters.add(readIntDataLine);
126 			count += readIntDataLine.length;
127 		}
128 		logger .debug(String.format("Loading %d items in %d clusters\n",count,clusters.size()));
129 		return clusters.toArray(new int[clusters.size()][]);
130 	}
131 	
132 
133 	/**
134 	 * @param string
135 	 * @return read
136 	 */
137 	public int[] readIntDataLine(String string) {
138 		String[] split = string.split(",");
139 		int[] arr = new int[split.length-1];
140 		int i = 0;
141 
142 		for (String s : split) {
143 			if(s.contains("<"))continue; // skip the first, it is the cluster index
144 			s = s.replace(">", "").trim();
145 			arr[i++] = Integer.parseInt(s)-1;
146 
147 		}
148 		return arr;
149 	}
150 	/**
151 	 * @param data
152 	 * @return read the test data
153 	 */
154 	private double[][] readTestData(String[] data) {
155 		
156 		int i = 0;
157 		for (;data[i].length()!=0; i++);
158 		List<double[]> dataL = new ArrayList<double[]>();
159 		int start = i+1;
160 		for (i=start;data[i].length()!=0; i++){
161 			dataL.add(readDataLine(data[i]));
162 		}
163 		logger.debug(String.format("Loading %d data items\n",dataL.size()));
164 		return dataL.toArray(new double[dataL.size()][]);
165 	}
166 	private Set<Integer> existing(int[][] correct) {
167 		Set<Integer> exist = new HashSet<Integer>();
168 		for (int[] is : correct) {
169 			for (int i : is) {
170 				exist.add(i);
171 			}
172 		}
173 		return exist;
174 	}
175 
176 	private double[] readDataLine(String string) {
177 		String[] split = string.split(" ");
178 		double[] arr = new double[]{
179 				Double.parseDouble(split[1]),
180 				Double.parseDouble(split[2])
181 		};
182 		return arr;
183 	}
184 
185 	public void prepare(String[] data) {
186 		this.testStats = this.readTestStats(data);
187 		this.testClusters = this.readTestClusters(data);
188 		this.testData = this.readTestData(data);
189 		correctClusters();
190 	}
191 
192 	private void correctClusters() {
193 		
194 		if(this.percluster != -1){
195 			double[][] correctedData = null;
196 			int[][] correctedClusters = new int[this.testClusters.length][this.percluster];	
197 			int seen ;
198 			if(this.outliers){
199 				seen = this.testStats.noutliers;
200 				correctedData= new double[this.percluster * this.testClusters.length + seen][];
201 				for (int i = 0; i < seen; i++) {
202 					correctedData[i] = this.testData[i];
203 				}
204 				
205 			}
206 			else{
207 				seen = 0;
208 				correctedData = new double[this.percluster * this.testClusters.length][];
209 			}
210 			for (int i = 0; i < this.testClusters.length; i++) {
211 				int[] clust = this.testClusters[i];
212 				for (int j = 0; j < this.percluster; j++) {
213 					int d = clust[j];
214 					correctedData[seen] = this.testData[d];
215 					correctedClusters[i][j] = seen;
216 					seen++;
217 				}
218 			}
219 			
220 			this.testClusters = correctedClusters;
221 			this.testData = correctedData;
222 		}
223 	}
224 
225 	public TestStats getTestStats() {
226 		return this.testStats;
227 	}
228 
229 	public double[][] getTestData() {
230 		return this.testData;
231 	}
232 
233 	public int[][] getTestClusters() {
234 		return this.testClusters;
235 	}
236 }