001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.examples.ml.clustering.kmeans;
031
032import java.io.Closeable;
033import java.io.DataOutputStream;
034import java.io.File;
035import java.io.FileOutputStream;
036import java.io.IOException;
037import java.io.RandomAccessFile;
038
039import org.openimaj.data.AbstractDataSource;
040import org.openimaj.data.DataSource;
041import org.openimaj.data.RandomData;
042import org.openimaj.ml.clustering.kmeans.HierarchicalByteKMeans;
043import org.openimaj.ml.clustering.kmeans.HierarchicalByteKMeansResult;
044
045/**
046 * Example showing how to use OpenIMAJ to cluster data that won't fit in memory
047 * using a {@link DataSource} that reads data from disk. Hierarchical KMeans
048 * clustering is demonstrated, but exact and approximate K-Means can also be
049 * used.
050 *
051 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
052 */
053public class BigDataClusterExample {
054        /**
055         * Main method for the example.
056         *
057         * @param args
058         *            Ignored.
059         * @throws IOException
060         */
061        public static void main(String[] args) throws IOException {
062                // Set up the variables needed to define the clustering operation
063                final int dimensionality = 100;
064                final int numItems = 10000;
065                final int clustersPerNode = 4;
066                final int depth = 2;
067
068                // Create the clusterer; there are specific types for all kinds of data
069                // (we're using byte data here).
070                final HierarchicalByteKMeans kmeans = new HierarchicalByteKMeans(dimensionality, clustersPerNode, depth);
071
072                // Generate a file with some random data
073                System.out.println("Generating Data");
074                final File dataFile = createDataFile(dimensionality, numItems);
075
076                // Create a datasource for the data
077                System.out.println("Creating DataSource");
078                final ExampleDatasource ds = new ExampleDatasource(dataFile);
079
080                // Perform the clustering
081                System.out.println("Clustering");
082                final HierarchicalByteKMeansResult cluster = kmeans.cluster(ds);
083
084                // As we're done with the datasource, we should close it
085                ds.close();
086
087                // Now the cluster is created you can do things with it...
088                // See HierarchicalKMeansExample for some examples.
089                System.out.println("Done");
090                System.out.println(cluster);
091        }
092
093        /**
094         * An example datasource backed by a file of the format created by
095         * {@link #createDataFile}. Note that the {@link #getData(int)} and
096         * {@link #getData(int, int, byte[][])} are synchronized to ensure that
097         * multiple threads don't interfere with the underlying
098         * {@link RandomAccessFile}.
099         *
100         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
101         *
102         */
103        static class ExampleDatasource extends AbstractDataSource<byte[]> implements Closeable {
104                /**
105                 * The header of the file is 2 4-bytes integers
106                 */
107                private static final int HEADER_BYTES = 8;
108
109                private RandomAccessFile raf;
110                private final int numItems;
111                private final int dimensionality;
112
113                public ExampleDatasource(File file) throws IOException {
114                        // open the file and read the header
115                        raf = new RandomAccessFile(file, "r");
116                        numItems = raf.readInt();
117                        dimensionality = raf.readInt();
118                }
119
120                @Override
121                public synchronized void getData(int startRow, int stopRow, byte[][] data) {
122                        try {
123                                // seek to the location of the start-row and read the data
124                                raf.seek(HEADER_BYTES + startRow * dimensionality);
125                                for (int i = 0; i < stopRow - startRow; i++)
126                                        raf.read(data[i], 0, dimensionality);
127
128                        } catch (final IOException e) {
129                                throw new RuntimeException(e);
130                        }
131                }
132
133                @Override
134                public synchronized byte[] getData(int row) {
135                        try {
136                                // allocate data
137                                final byte[] data = new byte[dimensionality];
138
139                                // seek to the row and read
140                                raf.seek(HEADER_BYTES + row * dimensionality);
141                                raf.read(data);
142                                return data;
143                        } catch (final IOException e) {
144                                throw new RuntimeException(e);
145                        }
146                }
147
148                @Override
149                public int numDimensions() {
150                        return dimensionality;
151                }
152
153                @Override
154                public int size() {
155                        return numItems;
156                }
157
158                @Override
159                public void close() throws IOException {
160                        raf.close();
161                }
162
163                @Override
164                public byte[][] createTemporaryArray(int size) {
165                        return new byte[size][dimensionality];
166                }
167        }
168
169        /**
170         * Write some randomly generated vectors to a temporary file.
171         * <p>
172         * The file format is simple: there is a two integer header representing the
173         * number of vectors and dimensionality. The remainder of the file is the
174         * vector data, one vector at a time, with each vector encoded as
175         * <code>dimensionality</code> bytes.
176         *
177         * @param dimensionality
178         *            length of the vectors
179         * @param numItems
180         *            number of vectors
181         * @return the file that was created
182         * @throws IOException
183         *             if an error occurs
184         */
185        static File createDataFile(int dimensionality, int numItems) throws IOException {
186                final File file = File.createTempFile("clusteringExampleData", ".txt");
187                file.deleteOnExit();
188
189                FileOutputStream fos = null;
190                DataOutputStream dos = null;
191                try {
192                        fos = new FileOutputStream(file);
193                        dos = new DataOutputStream(fos);
194
195                        dos.writeInt(numItems);
196                        dos.writeInt(dimensionality);
197
198                        for (int i = 0; i < numItems; i++) {
199                                final byte[] vector = RandomData.getRandomByteArray(dimensionality, Byte.MIN_VALUE, Byte.MAX_VALUE);
200
201                                for (int j = 0; j < dimensionality; j++) {
202                                        dos.writeByte(vector[j]);
203                                }
204                        }
205                } finally {
206                        if (dos != null)
207                                dos.close();
208                        if (fos != null)
209                                fos.close();
210                }
211
212                return file;
213        }
214}