View Javadoc

1   /**
2    * Copyright (c) 2011, The University of Southampton and the individual contributors.
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without modification,
6    * are permitted provided that the following conditions are met:
7    *
8    *   * 	Redistributions of source code must retain the above copyright notice,
9    * 	this list of conditions and the following disclaimer.
10   *
11   *   *	Redistributions in binary form must reproduce the above copyright notice,
12   * 	this list of conditions and the following disclaimer in the documentation
13   * 	and/or other materials provided with the distribution.
14   *
15   *   *	Neither the name of the University of Southampton nor the names of its
16   * 	contributors may be used to endorse or promote products derived from this
17   * 	software without specific prior written permission.
18   *
19   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22   * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29   */
30  package org.openimaj.workinprogress;
31  
32  import java.io.File;
33  import java.io.IOException;
34  import java.util.ArrayList;
35  import java.util.List;
36  
37  import org.openimaj.feature.SparseFloatFV;
38  import org.openimaj.feature.SparseFloatFVComparison;
39  import org.openimaj.io.FileUtils;
40  import org.openimaj.ml.clustering.FeatureVectorCentroidsResult;
41  import org.openimaj.ml.clustering.IndexClusters;
42  import org.openimaj.ml.clustering.assignment.hard.ExactFeatureVectorAssigner;
43  import org.openimaj.ml.clustering.kmeans.FeatureVectorKMeans;
44  import org.openimaj.util.array.ArrayUtils;
45  
46  import cern.colt.Arrays;
47  
48  public class Cluster {
49  	public static void main(String[] args) throws Exception {
50  		final File dir = new File("/Users/jon/Work/lmlk/trunk/bbc/subtitle-analyser/data_to_cluster");
51  		final List<String> vocab = new ArrayList<String>();
52  		final List<String> names = new ArrayList<String>();
53  		final List<SparseFloatFV> features = new ArrayList<SparseFloatFV>();
54  
55  		System.err.println("Loading data");
56  		for (final File f : dir.listFiles()) {
57  			if (f.getName().startsWith("TR")) {
58  				final SparseFloatFV fv = loadVector(f, vocab);
59  
60  				names.add(f.getName());
61  				features.add(fv);
62  			}
63  		}
64  
65  		System.err.println("Setting lengths");
66  		for (final SparseFloatFV fv : features)
67  			fv.values.setLength(vocab.size());
68  
69  		final FeatureVectorKMeans<SparseFloatFV> fkm = FeatureVectorKMeans.createExact(120,
70  				SparseFloatFVComparison.CORRELATION, 100);
71  		fkm.getConfiguration().setBlockSize(500);
72  
73  		final SparseFloatFV[] data = features.toArray(new SparseFloatFV[features.size()]);
74  		final FeatureVectorCentroidsResult<SparseFloatFV> clusters = fkm.cluster(data);
75  
76  		final ExactFeatureVectorAssigner<SparseFloatFV> eoa = new ExactFeatureVectorAssigner<SparseFloatFV>(clusters,
77  				SparseFloatFVComparison.CORRELATION);
78  		final int[][] assignments = new IndexClusters(eoa.assign(data)).clusters();
79  
80  		System.out.print("[");
81  		for (int i = 0; i < assignments.length; i++) {
82  			System.out.print("{");
83  
84  			System.out.print("\"name\":\"cluster" + i + "\",");
85  
86  			final int[] a = assignments[i];
87  			final String[] items = new String[a.length];
88  			for (int j = 0; j < a.length; j++)
89  				items[j] = "\"" + names.get(a[j]) + "\"";
90  			System.out.print("\"items\":" + Arrays.toString(items) + ",");
91  
92  			final double[] centroid = clusters.centroids[i].asDoubleVector();
93  			final int[] indexes = ArrayUtils.indexSort(centroid);
94  			System.out.print("\"labels\":[");
95  			for (int j = 0; j < 25; j++) {
96  				final int idx = indexes[indexes.length - 1 - j];
97  				final String tag = vocab.get(idx);
98  				final double score = centroid[idx];
99  				System.out.print("{\"tag\":\"" + tag + "\",\"weight\":" + score + "}");
100 
101 				final double nextscore = centroid[indexes[indexes.length - 1 - (j + 1)]];
102 
103 				if (nextscore == 0)
104 					break;
105 
106 				if (j < 25 - 1)
107 					System.out.print(",");
108 			}
109 
110 			System.out.print("]}");
111 
112 			if (i < assignments.length - 1)
113 				System.out.print(",\n");
114 		}
115 
116 		System.out.print("]");
117 	}
118 
119 	private static SparseFloatFV loadVector(File f, List<String> vocab) throws IOException {
120 		final String str = FileUtils.readall(f);
121 
122 		final String[] terms = str.split(",\\s*");
123 		final SparseFloatFV fv = new SparseFloatFV(vocab.size());
124 		for (String term : terms) {
125 			term = term.trim();
126 			if (term.length() < 1)
127 				continue;
128 
129 			int idx = vocab.indexOf(term);
130 			if (idx == -1) {
131 				idx = vocab.size();
132 				vocab.add(term);
133 				fv.values.setLength(idx + 1);
134 				fv.values.set(idx, 1);
135 			} else {
136 				fv.values.increment(idx, 1);
137 			}
138 		}
139 
140 		return fv;
141 	}
142 }