001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.fastkmeans;
031
032import java.io.IOException;
033import java.util.ArrayList;
034
035import org.apache.hadoop.io.BytesWritable;
036import org.apache.hadoop.io.Text;
037import org.openimaj.hadoop.sequencefile.ExtractionState;
038import org.openimaj.hadoop.sequencefile.KeyValueDump;
039import org.openimaj.hadoop.sequencefile.NamingStrategy;
040
041
042public class SampleEqualityChecker {
043        
044        static class ByteArrayDump extends KeyValueDump<Text,BytesWritable>{
045                int index = 0;
046                int randomGens = 0;
047                ArrayList<byte[]> centroids;
048                
049                ByteArrayDump(){
050                        centroids = new ArrayList<byte[]>();
051                }
052                @Override
053                public void dumpValue(Text key, BytesWritable val) {
054                        byte [] bytes = new byte[val.getLength()]; 
055                        System.arraycopy(val.getBytes(), 0, bytes, 0, bytes.length);
056                        centroids.add(bytes);
057                }
058                
059        }
060        public static void checkSampleEquality(String selected,HadoopFastKMeansOptions options) throws IOException {
061                ByteArrayDump neededdump = new ByteArrayDump();
062                TextBytesSequenceMemoryUtility utility = new TextBytesSequenceMemoryUtility(selected, true);
063                utility.exportData(NamingStrategy.KEY, new ExtractionState(), 0, neededdump);
064                System.out.println("Finished loading all byte arrays");
065                int total = 0;
066                long done = 0;
067                for(int i = 0; i < neededdump.centroids.size(); i++){
068                        byte[] a = neededdump.centroids.get(i);
069                        for(int j = i+1; j < neededdump.centroids.size(); j++){
070                                
071                                done++;
072                                if(distanceUnderThreshold(a,neededdump.centroids.get(j),options.checkSampleEqualityThreshold) ){
073                                        total++;
074                                }
075                        }
076                        System.out.print("\r" + done + "/" + ((long)(neededdump.centroids.size()) * (long)(neededdump.centroids.size()))/2l + " total: " + total);
077                }
078                System.out.println();
079                System.out.println("There were " + total + " identical samples");
080        }
081        private static boolean distanceUnderThreshold(byte[] a, byte[] b, int threshold) {
082                int totalDistance = 0;
083                for(int i = 0; i < a.length; i++){
084                        int diff = ((int)a[i]) - ((int)b[i]);
085                        totalDistance += diff * diff;
086                        if(totalDistance > threshold){
087//                                      System.out.println("Total distance is: " + totalDistance);
088                                return false;
089                        }
090                }
091                return true;
092        }
093
094}