001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.demos.sandbox.vlad;
031
032import gnu.trove.list.array.TLongArrayList;
033
034import java.io.BufferedInputStream;
035import java.io.BufferedOutputStream;
036import java.io.BufferedReader;
037import java.io.DataInputStream;
038import java.io.DataOutputStream;
039import java.io.EOFException;
040import java.io.File;
041import java.io.FileInputStream;
042import java.io.FileOutputStream;
043import java.io.FileReader;
044import java.io.IOException;
045import java.net.URI;
046import java.util.Map.Entry;
047
048import org.apache.hadoop.io.BytesWritable;
049import org.apache.hadoop.io.Text;
050import org.openimaj.hadoop.sequencefile.TextBytesSequenceFileUtility;
051import org.openimaj.image.indexing.vlad.VLADIndexerData;
052import org.openimaj.io.IOUtils;
053import org.openimaj.knn.pq.IncrementalFloatADCNearestNeighbours;
054
055public class FlickrIndexer {
056        public static void convertCSV() throws IOException {
057                final String CSV_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))";
058                final File csvFile = new File("/Volumes/Raid/FlickrCrawls/AllGeo16/images.csv");
059                final File output = new File("/Users/jsh2/Desktop/flickr46m-id2lat-lng.map");
060
061                DataOutputStream dos = null;
062                BufferedReader br = null;
063                try {
064                        br = new BufferedReader(new FileReader(csvFile));
065                        dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)));
066
067                        String line;
068                        int i = 0;
069                        while ((line = br.readLine()) != null) {
070                                try {
071                                        final String[] parts = line.split(CSV_REGEX);
072
073                                        // final int farmID = Integer.parseInt(parts[0].trim());
074                                        // final int serverID = Integer.parseInt(parts[1].trim());
075                                        final long imageID = Long.parseLong(parts[2].trim());
076                                        // final String secret = parts[3].trim();
077                                        // final String url = parts[5].trim();
078                                        final float lat = Float.parseFloat(parts[15].trim());
079                                        final float lon = Float.parseFloat(parts[16].trim());
080
081                                        dos.writeLong(imageID);
082                                        dos.writeFloat(lat);
083                                        dos.writeFloat(lon);
084                                        // dos.writeInt(farmID);
085                                        // dos.writeInt(serverID);
086                                        // dos.writeUTF(secret);
087                                } catch (final Exception e) {
088                                        // skip
089                                }
090                                if (i++ % 1000 == 0) {
091                                        System.out.println("Read " + i + " records. " + Runtime.getRuntime().freeMemory());
092                                }
093                        }
094                } finally {
095                        if (dos != null)
096                                dos.close();
097                        if (br != null)
098                                br.close();
099                }
100        }
101
102        public static void extractSequenceFileData() throws IOException {
103                final URI[] paths = TextBytesSequenceFileUtility
104                                .getFiles("hdfs://seurat/data/flickr-all-geo-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.seq",
105                                                "part-m-");
106
107                final File output = new File("/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.dat");
108
109                final DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)));
110
111                final byte[] tmp = new byte[512];
112                for (final URI p : paths) {
113                        System.out.println("Starting file " + p);
114
115                        final TextBytesSequenceFileUtility sf = new TextBytesSequenceFileUtility(p, true);
116
117                        for (final Entry<Text, BytesWritable> rec : sf) {
118                                final long id = Long.parseLong(rec.getKey().toString().trim());
119
120                                System.arraycopy(rec.getValue().getBytes(), 0, tmp, 0, tmp.length);
121
122                                dos.writeLong(id);
123                                dos.write(tmp);
124                        }
125                        dos.flush();
126                }
127
128                dos.close();
129        }
130
131        public static void createPQADCNN() throws IOException {
132                final File input = new File("/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.dat");
133
134                final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(input)));
135
136                final VLADIndexerData indexer = VLADIndexerData.read(new File(
137                                "/Users/jsh2/vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.dat"));
138                final IncrementalFloatADCNearestNeighbours nn = new IncrementalFloatADCNearestNeighbours(
139                                indexer.getProductQuantiser(), 128,
140                                46000000);
141                final TLongArrayList indexes = new TLongArrayList(46000000);
142                try {
143                        final float[] farr = new float[128];
144
145                        for (int x = 0;; x++) {
146                                if (x % 100000 == 0)
147                                        System.out.println(x);
148
149                                final long id = dis.readLong();
150
151                                for (int i = 0; i < 128; i++) {
152                                        farr[i] = dis.readFloat();
153                                }
154
155                                nn.add(farr);
156                                indexes.add(id);
157                        }
158                } catch (final EOFException e) {
159                        dis.close();
160                }
161
162                IOUtils.writeBinary(new File(
163                                "/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x-pqadcnn.dat"), nn);
164                IOUtils.writeToFile(indexes, new File(
165                                "/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x-pqadcnn-indexes.dat"));
166        }
167
168        public static void main(String[] args) throws Exception {
169                // convertCSV();
170                // extractSequenceFileData();
171                createPQADCNN();
172        }
173}