001/* 002 AUTOMATICALLY GENERATED BY jTemp FROM 003 /Users/jsh2/Work/openimaj/target/checkout/machine-learning/nearest-neighbour/src/main/jtemp/org/openimaj/knn/pq/#T#ProductQuantiser.jtemp 004*/ 005/** 006 * Copyright (c) 2011, The University of Southampton and the individual contributors. 007 * All rights reserved. 008 * 009 * Redistribution and use in source and binary forms, with or without modification, 010 * are permitted provided that the following conditions are met: 011 * 012 * * Redistributions of source code must retain the above copyright notice, 013 * this list of conditions and the following disclaimer. 014 * 015 * * Redistributions in binary form must reproduce the above copyright notice, 016 * this list of conditions and the following disclaimer in the documentation 017 * and/or other materials provided with the distribution. 018 * 019 * * Neither the name of the University of Southampton nor the names of its 020 * contributors may be used to endorse or promote products derived from this 021 * software without specific prior written permission. 022 * 023 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 024 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 025 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 026 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 027 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 028 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 029 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 030 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 032 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 033 */ 034 035 036package org.openimaj.knn.pq; 037 038import java.util.Arrays; 039 040import org.openimaj.citation.annotation.Reference; 041import org.openimaj.citation.annotation.ReferenceType; 042import org.openimaj.knn.IntNearestNeighboursExact; 043import org.openimaj.knn.NearestNeighbours; 044 045/** 046 * Implementation of a Product Quantiser for vectors/arrays of ints. Product 047 * Quantisers quantise data into a very large number of clusters (large enough 048 * that the centroids could not possibly fit into memory - i.e. 2^64 centroids). 049 * The Product Quantiser can be used to create compressed representations of 050 * high-dimensional vectors, and also as a means to perform efficient 051 * nearest-neighbour search over large collections of vectors (which have been 052 * effectively compressed using the product quantiser). 053 * <p> 054 * This is achieved by breaking down the input vectors into non-overlapping 055 * sub-vectors, and applying quantisation to these sub-vectors individually. The 056 * number of bins (cluster centroids) for the sub-vectors is small (up to 256 in 057 * this implementation), but when combined over all sub-vectors, the number of 058 * bins is much larger as it accounts for all combinations of bins across 059 * sub-vectors. As only a small set of centroids needs to be held for the 060 * sub-vectors, the memory requirements are quite modest. The output of the 061 * quantisation action in this implementation is an array of bytes corresponding 062 * to the index of the matching centroid for each sub-vector (index numbers are 063 * offset by -128 so that 256 centroids indexes can fit in a single byte). The 064 * bit-pattern of this byte array could be interpreted as a numeric value of 065 * global cluster index, however in practice this is not useful. 066 * <p> 067 * Typically the product quantiser is "trained" so that it adapts to the data 068 * that is is being applied too. The standard approach to this is to use 069 * K-Means, however, this is not required. Insofar as this implementation is 070 * concerned, any set of compatible {@link NearestNeighbours} implementations 071 * can be provided to the constructor. Each of the {@link NearestNeighbours} 072 * could even potentially have a different number of dimensions (corresponding 073 * to the sub-vector lengths). 074 * <p> 075 * In the standard case, where you just want to use K-Means to train the Product 076 * Quantiser, a set of utility methods can be found in the 077 * org.openimaj.knn.pq.IntProductQuantiserUtilities class which can be found in 078 * the clustering sub-project (due to the dependence on the K-Means algorithm). 079 * 080 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 081 * 082 */ 083 @Reference( 084 type = ReferenceType.Article, 085 author = { "Jegou, Herve", "Douze, Matthijs", "Schmid, Cordelia" }, 086 title = "Product Quantization for Nearest Neighbor Search", 087 year = "2011", 088 journal = "IEEE Trans. Pattern Anal. Mach. Intell.", 089 pages = { "117", "", "128" }, 090 url = "http://dx.doi.org/10.1109/TPAMI.2010.57", 091 month = "January", 092 number = "1", 093 publisher = "IEEE Computer Society", 094 volume = "33", 095 customData = { 096 "issn", "0162-8828", 097 "numpages", "12", 098 "doi", "10.1109/TPAMI.2010.57", 099 "acmid", "1916695", 100 "address", "Washington, DC, USA", 101 "keywords", "High-dimensional indexing, High-dimensional indexing, image indexing, very large databases, approximate search., approximate search., image indexing, very large databases" 102 }) 103public class IntProductQuantiser { 104 protected IntNearestNeighboursExact[] assigners; 105 protected int ndims; 106 107 /** 108 * Construct a {@link IntProductQuantiser} with the given 109 * nearest-neighbour assigners. The number of dimensions of the assigners 110 * determines how long each sub-vector is. There is a one-to-one mapping 111 * between in the order of assigners and sub-vectors. 112 * 113 * @param assigners 114 * the nearest-neighbour assigners. 115 */ 116 public IntProductQuantiser(IntNearestNeighboursExact[] assigners) { 117 this.assigners = assigners; 118 119 for (final IntNearestNeighboursExact nn : assigners) 120 ndims += nn.numDimensions(); 121 } 122 123 /** 124 * Quantise the given data using this Product Quantiser. The output is an 125 * array of bytes corresponding to the index of the matching centroid for 126 * each sub-vector (index numbers are offset by -128 so that 256 centroids 127 * indexes can fit in a single byte). 128 * 129 * @param data 130 * the data to quantise 131 * @return the quantised data. 132 */ 133 public byte[] quantise(int[] data) { 134 final byte[] quantised = new byte[assigners.length]; 135 136 final int[] idx = { 0 }; 137 final float[] dst = { 0 }; 138 final int[][] qus = new int[1][0]; 139 140 for (int i = 0, from = 0; i < assigners.length; i++) { 141 final int to = assigners[i].numDimensions(); 142 143 qus[0] = Arrays.copyOfRange(data, from, from + to); 144 assigners[i].searchNN(qus, idx, dst); 145 quantised[i] = (byte) (idx[0] - 128); 146 147 from += to; 148 } 149 150 return quantised; 151 } 152 153 /** 154 * Decompress the quantised data by replacing each encoded index with the actual centroid subvector. 155 * 156 * @param qdata the quantised data 157 * 158 * @return the (approximate) decompressed feature 159 */ 160 public int[] decompress(byte[] qdata) { 161 final int[] data = new int[ndims]; 162 163 for (int i = 0, from = 0; i < assigners.length; i++) { 164 final int len = assigners[i].numDimensions(); 165 int index = (int)qdata[i] + 128; 166 167 System.arraycopy(this.assigners[i].getPoints()[index], 0, data, from, len); 168 169 from += len; 170 } 171 172 return data; 173 } 174}