001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.demos.sandbox.vlad; 031 032import gnu.trove.list.array.TLongArrayList; 033 034import java.io.BufferedInputStream; 035import java.io.BufferedOutputStream; 036import java.io.BufferedReader; 037import java.io.DataInputStream; 038import java.io.DataOutputStream; 039import java.io.EOFException; 040import java.io.File; 041import java.io.FileInputStream; 042import java.io.FileOutputStream; 043import java.io.FileReader; 044import java.io.IOException; 045import java.net.URI; 046import java.util.Map.Entry; 047 048import org.apache.hadoop.io.BytesWritable; 049import org.apache.hadoop.io.Text; 050import org.openimaj.hadoop.sequencefile.TextBytesSequenceFileUtility; 051import org.openimaj.image.indexing.vlad.VLADIndexerData; 052import org.openimaj.io.IOUtils; 053import org.openimaj.knn.pq.IncrementalFloatADCNearestNeighbours; 054 055public class FlickrIndexer { 056 public static void convertCSV() throws IOException { 057 final String CSV_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))"; 058 final File csvFile = new File("/Volumes/Raid/FlickrCrawls/AllGeo16/images.csv"); 059 final File output = new File("/Users/jsh2/Desktop/flickr46m-id2lat-lng.map"); 060 061 DataOutputStream dos = null; 062 BufferedReader br = null; 063 try { 064 br = new BufferedReader(new FileReader(csvFile)); 065 dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output))); 066 067 String line; 068 int i = 0; 069 while ((line = br.readLine()) != null) { 070 try { 071 final String[] parts = line.split(CSV_REGEX); 072 073 // final int farmID = Integer.parseInt(parts[0].trim()); 074 // final int serverID = Integer.parseInt(parts[1].trim()); 075 final long imageID = Long.parseLong(parts[2].trim()); 076 // final String secret = parts[3].trim(); 077 // final String url = parts[5].trim(); 078 final float lat = Float.parseFloat(parts[15].trim()); 079 final float lon = Float.parseFloat(parts[16].trim()); 080 081 dos.writeLong(imageID); 082 dos.writeFloat(lat); 083 dos.writeFloat(lon); 084 // dos.writeInt(farmID); 085 // dos.writeInt(serverID); 086 // dos.writeUTF(secret); 087 } catch (final Exception e) { 088 // skip 089 } 090 if (i++ % 1000 == 0) { 091 System.out.println("Read " + i + " records. " + Runtime.getRuntime().freeMemory()); 092 } 093 } 094 } finally { 095 if (dos != null) 096 dos.close(); 097 if (br != null) 098 br.close(); 099 } 100 } 101 102 public static void extractSequenceFileData() throws IOException { 103 final URI[] paths = TextBytesSequenceFileUtility 104 .getFiles("hdfs://seurat/data/flickr-all-geo-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.seq", 105 "part-m-"); 106 107 final File output = new File("/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.dat"); 108 109 final DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output))); 110 111 final byte[] tmp = new byte[512]; 112 for (final URI p : paths) { 113 System.out.println("Starting file " + p); 114 115 final TextBytesSequenceFileUtility sf = new TextBytesSequenceFileUtility(p, true); 116 117 for (final Entry<Text, BytesWritable> rec : sf) { 118 final long id = Long.parseLong(rec.getKey().toString().trim()); 119 120 System.arraycopy(rec.getValue().getBytes(), 0, tmp, 0, tmp.length); 121 122 dos.writeLong(id); 123 dos.write(tmp); 124 } 125 dos.flush(); 126 } 127 128 dos.close(); 129 } 130 131 public static void createPQADCNN() throws IOException { 132 final File input = new File("/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.dat"); 133 134 final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(input))); 135 136 final VLADIndexerData indexer = VLADIndexerData.read(new File( 137 "/Users/jsh2/vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x.dat")); 138 final IncrementalFloatADCNearestNeighbours nn = new IncrementalFloatADCNearestNeighbours( 139 indexer.getProductQuantiser(), 128, 140 46000000); 141 final TLongArrayList indexes = new TLongArrayList(46000000); 142 try { 143 final float[] farr = new float[128]; 144 145 for (int x = 0;; x++) { 146 if (x % 100000 == 0) 147 System.out.println(x); 148 149 final long id = dis.readLong(); 150 151 for (int i = 0; i < 128; i++) { 152 farr[i] = dis.readFloat(); 153 } 154 155 nn.add(farr); 156 indexes.add(id); 157 } 158 } catch (final EOFException e) { 159 dis.close(); 160 } 161 162 IOUtils.writeBinary(new File( 163 "/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x-pqadcnn.dat"), nn); 164 IOUtils.writeToFile(indexes, new File( 165 "/Volumes/My Book/flickr46m-vlad64-pca128-pq16x8-indexer-mirflickr25k-sift1x-pqadcnn-indexes.dat")); 166 } 167 168 public static void main(String[] args) throws Exception { 169 // convertCSV(); 170 // extractSequenceFileData(); 171 createPQADCNN(); 172 } 173}