001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.examples.ml.clustering.kmeans; 031 032import java.io.Closeable; 033import java.io.DataOutputStream; 034import java.io.File; 035import java.io.FileOutputStream; 036import java.io.IOException; 037import java.io.RandomAccessFile; 038 039import org.openimaj.data.AbstractDataSource; 040import org.openimaj.data.DataSource; 041import org.openimaj.data.RandomData; 042import org.openimaj.ml.clustering.kmeans.HierarchicalByteKMeans; 043import org.openimaj.ml.clustering.kmeans.HierarchicalByteKMeansResult; 044 045/** 046 * Example showing how to use OpenIMAJ to cluster data that won't fit in memory 047 * using a {@link DataSource} that reads data from disk. Hierarchical KMeans 048 * clustering is demonstrated, but exact and approximate K-Means can also be 049 * used. 050 * 051 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 052 */ 053public class BigDataClusterExample { 054 /** 055 * Main method for the example. 056 * 057 * @param args 058 * Ignored. 059 * @throws IOException 060 */ 061 public static void main(String[] args) throws IOException { 062 // Set up the variables needed to define the clustering operation 063 final int dimensionality = 100; 064 final int numItems = 10000; 065 final int clustersPerNode = 4; 066 final int depth = 2; 067 068 // Create the clusterer; there are specific types for all kinds of data 069 // (we're using byte data here). 070 final HierarchicalByteKMeans kmeans = new HierarchicalByteKMeans(dimensionality, clustersPerNode, depth); 071 072 // Generate a file with some random data 073 System.out.println("Generating Data"); 074 final File dataFile = createDataFile(dimensionality, numItems); 075 076 // Create a datasource for the data 077 System.out.println("Creating DataSource"); 078 final ExampleDatasource ds = new ExampleDatasource(dataFile); 079 080 // Perform the clustering 081 System.out.println("Clustering"); 082 final HierarchicalByteKMeansResult cluster = kmeans.cluster(ds); 083 084 // As we're done with the datasource, we should close it 085 ds.close(); 086 087 // Now the cluster is created you can do things with it... 088 // See HierarchicalKMeansExample for some examples. 089 System.out.println("Done"); 090 System.out.println(cluster); 091 } 092 093 /** 094 * An example datasource backed by a file of the format created by 095 * {@link #createDataFile}. Note that the {@link #getData(int)} and 096 * {@link #getData(int, int, byte[][])} are synchronized to ensure that 097 * multiple threads don't interfere with the underlying 098 * {@link RandomAccessFile}. 099 * 100 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 101 * 102 */ 103 static class ExampleDatasource extends AbstractDataSource<byte[]> implements Closeable { 104 /** 105 * The header of the file is 2 4-bytes integers 106 */ 107 private static final int HEADER_BYTES = 8; 108 109 private RandomAccessFile raf; 110 private final int numItems; 111 private final int dimensionality; 112 113 public ExampleDatasource(File file) throws IOException { 114 // open the file and read the header 115 raf = new RandomAccessFile(file, "r"); 116 numItems = raf.readInt(); 117 dimensionality = raf.readInt(); 118 } 119 120 @Override 121 public synchronized void getData(int startRow, int stopRow, byte[][] data) { 122 try { 123 // seek to the location of the start-row and read the data 124 raf.seek(HEADER_BYTES + startRow * dimensionality); 125 for (int i = 0; i < stopRow - startRow; i++) 126 raf.read(data[i], 0, dimensionality); 127 128 } catch (final IOException e) { 129 throw new RuntimeException(e); 130 } 131 } 132 133 @Override 134 public synchronized byte[] getData(int row) { 135 try { 136 // allocate data 137 final byte[] data = new byte[dimensionality]; 138 139 // seek to the row and read 140 raf.seek(HEADER_BYTES + row * dimensionality); 141 raf.read(data); 142 return data; 143 } catch (final IOException e) { 144 throw new RuntimeException(e); 145 } 146 } 147 148 @Override 149 public int numDimensions() { 150 return dimensionality; 151 } 152 153 @Override 154 public int size() { 155 return numItems; 156 } 157 158 @Override 159 public void close() throws IOException { 160 raf.close(); 161 } 162 163 @Override 164 public byte[][] createTemporaryArray(int size) { 165 return new byte[size][dimensionality]; 166 } 167 } 168 169 /** 170 * Write some randomly generated vectors to a temporary file. 171 * <p> 172 * The file format is simple: there is a two integer header representing the 173 * number of vectors and dimensionality. The remainder of the file is the 174 * vector data, one vector at a time, with each vector encoded as 175 * <code>dimensionality</code> bytes. 176 * 177 * @param dimensionality 178 * length of the vectors 179 * @param numItems 180 * number of vectors 181 * @return the file that was created 182 * @throws IOException 183 * if an error occurs 184 */ 185 static File createDataFile(int dimensionality, int numItems) throws IOException { 186 final File file = File.createTempFile("clusteringExampleData", ".txt"); 187 file.deleteOnExit(); 188 189 FileOutputStream fos = null; 190 DataOutputStream dos = null; 191 try { 192 fos = new FileOutputStream(file); 193 dos = new DataOutputStream(fos); 194 195 dos.writeInt(numItems); 196 dos.writeInt(dimensionality); 197 198 for (int i = 0; i < numItems; i++) { 199 final byte[] vector = RandomData.getRandomByteArray(dimensionality, Byte.MIN_VALUE, Byte.MAX_VALUE); 200 201 for (int j = 0; j < dimensionality; j++) { 202 dos.writeByte(vector[j]); 203 } 204 } 205 } finally { 206 if (dos != null) 207 dos.close(); 208 if (fos != null) 209 fos.close(); 210 } 211 212 return file; 213 } 214}