001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.util.hash;
031
032/**
033 * Use MurmurHash (http://murmurhash.googlepages.com/) to generate a random hash
034 * for a string.
035 * 
036 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
037 * 
038 */
039public class StringMurmurHashFunction implements HashFunction<String> {
040        private int seed = -1;
041
042        /**
043         * Construct a new {@link StringMurmurHashFunction} with the default seed.
044         */
045        public StringMurmurHashFunction() {
046        }
047
048        /**
049         * Construct a new {@link StringMurmurHashFunction} with the given seed.
050         * 
051         * @param seed
052         *            the seed
053         */
054        public StringMurmurHashFunction(int seed) {
055                this.seed = seed;
056        }
057
058        @Override
059        public int computeHashCode(String data) {
060                return murmurhash(data.getBytes(), seed);
061        }
062
063        /**
064         * Compute the Murmur hash (http://murmurhash.googlepages.com/) of the given
065         * bytes.
066         * 
067         * @see #murmurhash(byte[], int, int)
068         * 
069         * @param data
070         *            the data to hash
071         * @return the murmur hash
072         */
073        public static int murmurhash(byte[] data) {
074                return murmurhash(data, data.length, -1);
075        }
076
077        /**
078         * Compute the Murmur hash (http://murmurhash.googlepages.com/) of the given
079         * bytes.
080         * 
081         * @see #murmurhash(byte[], int, int)
082         * 
083         * @param data
084         *            the data to hash
085         * @param seed
086         *            the random seed
087         * @return the murmur hash
088         */
089        public static int murmurhash(byte[] data, int seed) {
090                return murmurhash(data, data.length, seed);
091        }
092
093        /**
094         * Compute the Murmur hash (http://murmurhash.googlepages.com/) of the given
095         * bytes. Based on the implementation from <a
096         * href="https://github.com/clearspring/stream-lib">
097         * https://github.com/clearspring/stream-lib</a>.
098         * 
099         * @param data
100         *            the data to hash
101         * @param length
102         *            the length of the data to hash
103         * @param seed
104         *            the random seed
105         * @return the murmur hash
106         */
107        public static int murmurhash(byte[] data, int length, int seed) {
108                final int m = 0x5bd1e995;
109                final int r = 24;
110
111                int h = seed ^ length;
112
113                final int len_4 = length >> 2;
114
115                for (int i = 0; i < len_4; i++) {
116                        final int i_4 = i << 2;
117                        int k = data[i_4 + 3];
118                        k = k << 8;
119                        k = k | (data[i_4 + 2] & 0xff);
120                        k = k << 8;
121                        k = k | (data[i_4 + 1] & 0xff);
122                        k = k << 8;
123                        k = k | (data[i_4 + 0] & 0xff);
124                        k *= m;
125                        k ^= k >>> r;
126                        k *= m;
127                        h *= m;
128                        h ^= k;
129                }
130
131                // avoid calculating modulo
132                final int len_m = len_4 << 2;
133                final int left = length - len_m;
134
135                if (left != 0) {
136                        if (left >= 3) {
137                                h ^= data[length - 3] << 16;
138                        }
139                        if (left >= 2) {
140                                h ^= data[length - 2] << 8;
141                        }
142                        if (left >= 1) {
143                                h ^= data[length - 1];
144                        }
145
146                        h *= m;
147                }
148
149                h ^= h >>> 13;
150                h *= m;
151                h ^= h >>> 15;
152
153                return h;
154        }
155}