/*
	AUTOMATICALLY GENERATED BY jTemp FROM
	/Users/jsh2/Work/openimaj/target/checkout/machine-learning/nearest-neighbour/src/main/jtemp/org/openimaj/knn/pq/#T#ProductQuantiser.jtemp
*/
/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   *  Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   *  Redistributions in binary form must reproduce the above copyright notice,
 *      this list of conditions and the following disclaimer in the documentation
 *      and/or other materials provided with the distribution.
 *
 *   *  Neither the name of the University of Southampton nor the names of its
 *      contributors may be used to endorse or promote products derived from this
 *      software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.openimaj.knn.pq;

import java.util.Arrays;

import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.knn.ByteNearestNeighboursExact;
import org.openimaj.knn.NearestNeighbours;

/**
 * Implementation of a Product Quantiser for vectors/arrays of bytes. Product
 * Quantisers quantise data into a very large number of clusters (large enough
 * that the centroids could not possibly fit into memory - e.g. 2^64 centroids).
 * The Product Quantiser can be used to create compressed representations of
 * high-dimensional vectors, and also as a means to perform efficient
 * nearest-neighbour search over large collections of vectors (which have been
 * effectively compressed using the product quantiser).
 * <p>
 * This is achieved by breaking down the input vectors into non-overlapping
 * sub-vectors, and applying quantisation to these sub-vectors individually. The
 * number of bins (cluster centroids) for the sub-vectors is small (up to 256 in
 * this implementation), but when combined over all sub-vectors, the number of
 * bins is much larger as it accounts for all combinations of bins across
 * sub-vectors. As only a small set of centroids needs to be held for the
 * sub-vectors, the memory requirements are quite modest. The output of the
 * quantisation action in this implementation is an array of bytes corresponding
 * to the index of the matching centroid for each sub-vector (index numbers are
 * offset by -128 so that 256 centroid indexes can fit in a single byte). The
 * bit-pattern of this byte array could be interpreted as a numeric value of the
 * global cluster index; however, in practice this is not useful.
 * <p>
 * Typically the product quantiser is "trained" so that it adapts to the data
 * to which it is being applied. The standard approach to this is to use
 * K-Means; however, this is not required. Insofar as this implementation is
 * concerned, any set of compatible {@link NearestNeighbours} implementations
 * can be provided to the constructor. Each of the {@link NearestNeighbours}
 * could even potentially have a different number of dimensions (corresponding
 * to the sub-vector lengths).
 * <p>
 * In the standard case, where you just want to use K-Means to train the Product
 * Quantiser, a set of utility methods can be found in the
 * org.openimaj.knn.pq.ByteProductQuantiserUtilities class, which lives in the
 * clustering sub-project (due to the dependence on the K-Means algorithm).
 *
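 * <p>
 * A minimal, hypothetical usage sketch (assuming two pre-trained codebooks of
 * up to 256 four-dimensional centroids each; in practice the codebooks would be
 * learnt with K-Means as described above):
 *
 * <pre>
 * {@code
 * byte[][] sub0 = new byte[256][4]; // codebook for the first 4 dimensions
 * byte[][] sub1 = new byte[256][4]; // codebook for the last 4 dimensions
 * // ... fill the codebooks with learnt centroids ...
 *
 * ByteProductQuantiser pq = new ByteProductQuantiser(new ByteNearestNeighboursExact[] {
 * 		new ByteNearestNeighboursExact(sub0), new ByteNearestNeighboursExact(sub1) });
 *
 * byte[] codes = pq.quantise(new byte[8]); // one centroid index per sub-vector
 * byte[] approx = pq.decompress(codes); // approximate reconstruction of the input
 * }
 * </pre>
 *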
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 *
 */
@Reference(
		type = ReferenceType.Article,
		author = { "Jegou, Herve", "Douze, Matthijs", "Schmid, Cordelia" },
		title = "Product Quantization for Nearest Neighbor Search",
		year = "2011",
		journal = "IEEE Trans. Pattern Anal. Mach. Intell.",
		pages = { "117", "", "128" },
		url = "http://dx.doi.org/10.1109/TPAMI.2010.57",
		month = "January",
		number = "1",
		publisher = "IEEE Computer Society",
		volume = "33",
		customData = {
				"issn", "0162-8828",
				"numpages", "12",
				"doi", "10.1109/TPAMI.2010.57",
				"acmid", "1916695",
				"address", "Washington, DC, USA",
				"keywords", "High-dimensional indexing, image indexing, very large databases, approximate search."
		})
public class ByteProductQuantiser {
	protected ByteNearestNeighboursExact[] assigners;
	protected int ndims;

	/**
	 * Construct a {@link ByteProductQuantiser} with the given
	 * nearest-neighbour assigners. The number of dimensions of the assigners
	 * determines how long each sub-vector is. There is a one-to-one mapping
	 * between the order of the assigners and the order of the sub-vectors.
	 *
	 * @param assigners
	 *            the nearest-neighbour assigners.
	 */
	public ByteProductQuantiser(ByteNearestNeighboursExact[] assigners) {
		this.assigners = assigners;

		for (final ByteNearestNeighboursExact nn : assigners)
			ndims += nn.numDimensions();
	}

	/**
	 * Quantise the given data using this Product Quantiser. The output is an
	 * array of bytes corresponding to the index of the matching centroid for
	 * each sub-vector (index numbers are offset by -128 so that 256 centroid
	 * indexes can fit in a single byte).
	 *
	 * @param data
	 *            the data to quantise
	 * @return the quantised data.
	 */
	public byte[] quantise(byte[] data) {
		final byte[] quantised = new byte[assigners.length];

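		// single-element working arrays, reused for the nearest-neighbour search
		// performed against each sub-vector's assigner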
		final int[] idx = { 0 };
		final float[] dst = { 0 };
		final byte[][] qus = new byte[1][0];

		for (int i = 0, from = 0; i < assigners.length; i++) {
			final int to = assigners[i].numDimensions();

			qus[0] = Arrays.copyOfRange(data, from, from + to);
			assigners[i].searchNN(qus, idx, dst);
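			// offset the centroid index by -128 so that indexes 0..255 fit in a signed byte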
			quantised[i] = (byte) (idx[0] - 128);

			from += to;
		}

		return quantised;
	}

	/**
	 * Decompress the quantised data by replacing each encoded index with the
	 * actual centroid sub-vector.
	 *
	 * @param qdata
	 *            the quantised data
	 * @return the (approximate) decompressed feature
	 */
	public byte[] decompress(byte[] qdata) {
		final byte[] data = new byte[ndims];

		for (int i = 0, from = 0; i < assigners.length; i++) {
			final int len = assigners[i].numDimensions();
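			// undo the -128 offset to recover the centroid index (0..255)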
			final int index = qdata[i] + 128;

			System.arraycopy(this.assigners[i].getPoints()[index], 0, data, from, len);

			from += len;
		}

		return data;
	}
}