001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.workinprogress;
031
032import java.io.File;
033import java.io.IOException;
034import java.util.ArrayList;
035import java.util.List;
036
037import org.openimaj.feature.SparseFloatFV;
038import org.openimaj.feature.SparseFloatFVComparison;
039import org.openimaj.io.FileUtils;
040import org.openimaj.ml.clustering.FeatureVectorCentroidsResult;
041import org.openimaj.ml.clustering.IndexClusters;
042import org.openimaj.ml.clustering.assignment.hard.ExactFeatureVectorAssigner;
043import org.openimaj.ml.clustering.kmeans.FeatureVectorKMeans;
044import org.openimaj.util.array.ArrayUtils;
045
046import cern.colt.Arrays;
047
048public class Cluster {
049        public static void main(String[] args) throws Exception {
050                final File dir = new File("/Users/jon/Work/lmlk/trunk/bbc/subtitle-analyser/data_to_cluster");
051                final List<String> vocab = new ArrayList<String>();
052                final List<String> names = new ArrayList<String>();
053                final List<SparseFloatFV> features = new ArrayList<SparseFloatFV>();
054
055                System.err.println("Loading data");
056                for (final File f : dir.listFiles()) {
057                        if (f.getName().startsWith("TR")) {
058                                final SparseFloatFV fv = loadVector(f, vocab);
059
060                                names.add(f.getName());
061                                features.add(fv);
062                        }
063                }
064
065                System.err.println("Setting lengths");
066                for (final SparseFloatFV fv : features)
067                        fv.values.setLength(vocab.size());
068
069                final FeatureVectorKMeans<SparseFloatFV> fkm = FeatureVectorKMeans.createExact(120,
070                                SparseFloatFVComparison.CORRELATION, 100);
071                fkm.getConfiguration().setBlockSize(500);
072
073                final SparseFloatFV[] data = features.toArray(new SparseFloatFV[features.size()]);
074                final FeatureVectorCentroidsResult<SparseFloatFV> clusters = fkm.cluster(data);
075
076                final ExactFeatureVectorAssigner<SparseFloatFV> eoa = new ExactFeatureVectorAssigner<SparseFloatFV>(clusters,
077                                SparseFloatFVComparison.CORRELATION);
078                final int[][] assignments = new IndexClusters(eoa.assign(data)).clusters();
079
080                System.out.print("[");
081                for (int i = 0; i < assignments.length; i++) {
082                        System.out.print("{");
083
084                        System.out.print("\"name\":\"cluster" + i + "\",");
085
086                        final int[] a = assignments[i];
087                        final String[] items = new String[a.length];
088                        for (int j = 0; j < a.length; j++)
089                                items[j] = "\"" + names.get(a[j]) + "\"";
090                        System.out.print("\"items\":" + Arrays.toString(items) + ",");
091
092                        final double[] centroid = clusters.centroids[i].asDoubleVector();
093                        final int[] indexes = ArrayUtils.indexSort(centroid);
094                        System.out.print("\"labels\":[");
095                        for (int j = 0; j < 25; j++) {
096                                final int idx = indexes[indexes.length - 1 - j];
097                                final String tag = vocab.get(idx);
098                                final double score = centroid[idx];
099                                System.out.print("{\"tag\":\"" + tag + "\",\"weight\":" + score + "}");
100
101                                final double nextscore = centroid[indexes[indexes.length - 1 - (j + 1)]];
102
103                                if (nextscore == 0)
104                                        break;
105
106                                if (j < 25 - 1)
107                                        System.out.print(",");
108                        }
109
110                        System.out.print("]}");
111
112                        if (i < assignments.length - 1)
113                                System.out.print(",\n");
114                }
115
116                System.out.print("]");
117        }
118
119        private static SparseFloatFV loadVector(File f, List<String> vocab) throws IOException {
120                final String str = FileUtils.readall(f);
121
122                final String[] terms = str.split(",\\s*");
123                final SparseFloatFV fv = new SparseFloatFV(vocab.size());
124                for (String term : terms) {
125                        term = term.trim();
126                        if (term.length() < 1)
127                                continue;
128
129                        int idx = vocab.indexOf(term);
130                        if (idx == -1) {
131                                idx = vocab.size();
132                                vocab.add(term);
133                                fv.values.setLength(idx + 1);
134                                fv.values.set(idx, 1);
135                        } else {
136                                fv.values.increment(idx, 1);
137                        }
138                }
139
140                return fv;
141        }
142}