1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 package org.openimaj.workinprogress;
31
32 import java.io.File;
33 import java.io.IOException;
34 import java.util.ArrayList;
35 import java.util.List;
36
37 import org.openimaj.feature.SparseFloatFV;
38 import org.openimaj.feature.SparseFloatFVComparison;
39 import org.openimaj.io.FileUtils;
40 import org.openimaj.ml.clustering.FeatureVectorCentroidsResult;
41 import org.openimaj.ml.clustering.IndexClusters;
42 import org.openimaj.ml.clustering.assignment.hard.ExactFeatureVectorAssigner;
43 import org.openimaj.ml.clustering.kmeans.FeatureVectorKMeans;
44 import org.openimaj.util.array.ArrayUtils;
45
46 import cern.colt.Arrays;
47
48 public class Cluster {
49 public static void main(String[] args) throws Exception {
50 final File dir = new File("/Users/jon/Work/lmlk/trunk/bbc/subtitle-analyser/data_to_cluster");
51 final List<String> vocab = new ArrayList<String>();
52 final List<String> names = new ArrayList<String>();
53 final List<SparseFloatFV> features = new ArrayList<SparseFloatFV>();
54
55 System.err.println("Loading data");
56 for (final File f : dir.listFiles()) {
57 if (f.getName().startsWith("TR")) {
58 final SparseFloatFV fv = loadVector(f, vocab);
59
60 names.add(f.getName());
61 features.add(fv);
62 }
63 }
64
65 System.err.println("Setting lengths");
66 for (final SparseFloatFV fv : features)
67 fv.values.setLength(vocab.size());
68
69 final FeatureVectorKMeans<SparseFloatFV> fkm = FeatureVectorKMeans.createExact(120,
70 SparseFloatFVComparison.CORRELATION, 100);
71 fkm.getConfiguration().setBlockSize(500);
72
73 final SparseFloatFV[] data = features.toArray(new SparseFloatFV[features.size()]);
74 final FeatureVectorCentroidsResult<SparseFloatFV> clusters = fkm.cluster(data);
75
76 final ExactFeatureVectorAssigner<SparseFloatFV> eoa = new ExactFeatureVectorAssigner<SparseFloatFV>(clusters,
77 SparseFloatFVComparison.CORRELATION);
78 final int[][] assignments = new IndexClusters(eoa.assign(data)).clusters();
79
80 System.out.print("[");
81 for (int i = 0; i < assignments.length; i++) {
82 System.out.print("{");
83
84 System.out.print("\"name\":\"cluster" + i + "\",");
85
86 final int[] a = assignments[i];
87 final String[] items = new String[a.length];
88 for (int j = 0; j < a.length; j++)
89 items[j] = "\"" + names.get(a[j]) + "\"";
90 System.out.print("\"items\":" + Arrays.toString(items) + ",");
91
92 final double[] centroid = clusters.centroids[i].asDoubleVector();
93 final int[] indexes = ArrayUtils.indexSort(centroid);
94 System.out.print("\"labels\":[");
95 for (int j = 0; j < 25; j++) {
96 final int idx = indexes[indexes.length - 1 - j];
97 final String tag = vocab.get(idx);
98 final double score = centroid[idx];
99 System.out.print("{\"tag\":\"" + tag + "\",\"weight\":" + score + "}");
100
101 final double nextscore = centroid[indexes[indexes.length - 1 - (j + 1)]];
102
103 if (nextscore == 0)
104 break;
105
106 if (j < 25 - 1)
107 System.out.print(",");
108 }
109
110 System.out.print("]}");
111
112 if (i < assignments.length - 1)
113 System.out.print(",\n");
114 }
115
116 System.out.print("]");
117 }
118
119 private static SparseFloatFV loadVector(File f, List<String> vocab) throws IOException {
120 final String str = FileUtils.readall(f);
121
122 final String[] terms = str.split(",\\s*");
123 final SparseFloatFV fv = new SparseFloatFV(vocab.size());
124 for (String term : terms) {
125 term = term.trim();
126 if (term.length() < 1)
127 continue;
128
129 int idx = vocab.indexOf(term);
130 if (idx == -1) {
131 idx = vocab.size();
132 vocab.add(term);
133 fv.values.setLength(idx + 1);
134 fv.values.set(idx, 1);
135 } else {
136 fv.values.increment(idx, 1);
137 }
138 }
139
140 return fv;
141 }
142 }