001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.demos.sandbox.flickr.geo;
031
032import java.io.BufferedReader;
033import java.io.File;
034import java.io.FileReader;
035import java.nio.ByteBuffer;
036
037import krati.core.StoreConfig;
038import krati.core.segment.MappedSegmentFactory;
039import krati.store.DynamicDataStore;
040
041public class CSVReader {
042        final static String CSV_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))";
043
044        static class Record {
045                long photoID;
046                int farmID;
047                int serverID;
048                float lat;
049                float lon;
050                String[] terms;
051
052                public static Record parseLine(String line) {
053                        final String[] parts = line.split(CSV_REGEX);
054
055                        try {
056                                final Record r = new Record();
057                                r.farmID = Integer.parseInt(parts[0].trim());
058                                r.serverID = Integer.parseInt(parts[1].trim());
059                                r.photoID = Long.parseLong(parts[2].trim());
060                                r.lat = Float.parseFloat(parts[15].trim());
061                                r.lon = Float.parseFloat(parts[16].trim());
062
063                                String tags = parts[17].trim();
064                                if (tags.startsWith("\"[")) {
065                                        tags = tags.substring(2, parts[17].length() - 3);
066                                } else {
067                                        tags = tags.substring(1, parts[17].length() - 2);
068                                }
069                                if (tags.length() > 2) {
070                                        r.terms = tags.split(CSV_REGEX);
071                                }
072
073                                return r;
074                        } catch (final NumberFormatException e) {
075                                return null;
076                        }
077                }
078        }
079
080        DynamicDataStore keywordStore;
081        DynamicDataStore recordStore;
082        int count = 0;
083
084        public static void buildIndex(File indexLocation, File CSVFile) throws Exception {
085                final CSVReader reader = new CSVReader();
086                reader.initStores(indexLocation);
087                reader.index(CSVFile);
088                reader.closeStores();
089        }
090
091        private void index(File csvFile) throws Exception {
092                BufferedReader br = null;
093                try {
094                        br = new BufferedReader(new FileReader(csvFile));
095
096                        String line;
097                        int i = 0;
098                        while ((line = br.readLine()) != null) {
099                                final Record r = Record.parseLine(line);
100                                if (r != null) {
101
102                                        writeRecord(r);
103
104                                        i++;
105                                }
106
107                                if (i % 1000 == 0) {
108                                        System.out.println("Read " + i + " records.");
109                                }
110                        }
111                } finally {
112                        if (br != null)
113                                br.close();
114                }
115        }
116
117        private void writeRecord(Record r) throws Exception {
118                final int nTerms = r.terms == null ? 0 : r.terms.length;
119                final int[] hashes = new int[nTerms];
120
121                final ByteBuffer keyBuffer = ByteBuffer.allocate(4);
122                for (int i = 0; i < nTerms; i++) {
123                        final String term = r.terms[i];
124                        hashes[i] = term.hashCode();
125
126                        // write keywords:
127                        keyBuffer.rewind();
128                        keyBuffer.putInt(hashes[i]);
129                        keywordStore.put(keyBuffer.array(), term.getBytes("UTF-8"));
130                }
131
132                final int recSize = (nTerms * 4) + 28;
133                final ByteBuffer valueBuffer = ByteBuffer.allocate(recSize);
134                valueBuffer.putLong(r.photoID);
135                valueBuffer.putInt(r.farmID);
136                valueBuffer.putInt(r.serverID);
137                valueBuffer.putFloat(r.lat);
138                valueBuffer.putFloat(r.lon);
139
140                valueBuffer.putInt(nTerms);
141                for (int i = 0; i < nTerms; i++) {
142                        valueBuffer.putInt(hashes[i]);
143                }
144
145                keyBuffer.rewind();
146                keyBuffer.putInt(count);
147                recordStore.put(keyBuffer.array(), valueBuffer.array());
148                count++;
149        }
150
151        private void initStores(File indexLocation) throws Exception {
152                final File keywordStoreLocation = new File(indexLocation, "keywords");
153                final File recordStoreLocation = new File(indexLocation, "records");
154
155                keywordStoreLocation.mkdirs();
156                final StoreConfig keywordStoreConf = new StoreConfig(keywordStoreLocation, 10000000);
157                keywordStoreConf.setSegmentFactory(new MappedSegmentFactory());
158                keywordStore = new DynamicDataStore(keywordStoreConf);
159
160                recordStoreLocation.mkdirs();
161                final StoreConfig recordStoreConf = new StoreConfig(recordStoreLocation, 10000000);
162                recordStoreConf.setSegmentFactory(new MappedSegmentFactory());
163                recordStore = new DynamicDataStore(recordStoreConf);
164        }
165
166        private void closeStores() throws Exception {
167                keywordStore.rehash();
168                keywordStore.close();
169
170                recordStore.rehash();
171                recordStore.close();
172        }
173
174        public static void main(String[] args) throws Exception {
175                final File index = new File("/Users/jsh2/Desktop/flickrData.idx");
176                final File csv = new File("/Volumes/Raid/FlickrCrawls/AllGeo16/images.csv");
177
178                buildIndex(index, csv);
179        }
180}