001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.demos.sandbox.flickr.geo; 031 032import java.io.BufferedReader; 033import java.io.DataInput; 034import java.io.DataOutput; 035import java.io.File; 036import java.io.FileInputStream; 037import java.io.FilenameFilter; 038import java.io.IOException; 039import java.io.InputStreamReader; 040import java.sql.Connection; 041import java.sql.DriverManager; 042import java.sql.PreparedStatement; 043import java.sql.SQLException; 044import java.sql.Statement; 045import java.sql.Timestamp; 046import java.text.ParseException; 047import java.text.SimpleDateFormat; 048import java.util.HashMap; 049import java.util.Locale; 050import java.util.Map; 051 052import org.apache.hadoop.conf.Configuration; 053import org.apache.hadoop.fs.FileSystem; 054import org.apache.hadoop.fs.Path; 055import org.apache.hadoop.io.BytesWritable; 056import org.apache.hadoop.io.SequenceFile.Reader; 057import org.apache.hadoop.io.Text; 058import org.apache.log4j.Level; 059import org.apache.log4j.Logger; 060import org.openimaj.feature.FloatFV; 061import org.openimaj.hadoop.sequencefile.SequenceFileUtility; 062import org.openimaj.hadoop.tools.HadoopToolsUtil; 063import org.openimaj.io.FileUtils; 064import org.openimaj.io.IOUtils; 065import org.openimaj.io.wrappers.ReadableMapBinary; 066import org.openimaj.io.wrappers.WriteableMapBinary; 067import org.openimaj.tools.FileToolsUtil; 068 069public class GlobalFlickrColour { 070 private static final int COUNT_PER_WRITE = 5000000; 071 private static final String WRITE_FILE_NAME = "binary_long_floatfv_%d"; 072 protected static final String INSERT_COLOUR = "insert into colour values (?, ?, ?, ?)"; 073 protected static final String INSERT_LATLON = "insert into latlong values (?, ?, ?, ?, ?)"; 074 final static String CVS_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))"; 075 static { 076 Logger.getRootLogger().setLevel(Level.ERROR); 077 } 078 079 public static void main(String[] args) throws Exception { 080 // saveSEQFileVersion(); 081 loadBinaryMapVersion(); 082 083 } 084 085 private static void loadBinaryMapVersion() throws IOException, SQLException, ClassNotFoundException, 086 InstantiationException, IllegalAccessException 087 { 088 final String source = "/Users/ss/Development/data/flickr-all-geo-16-46M-images-maxhistogram.binary"; 089 final String geocsv = "/Volumes/Raid/FlickrCrawls/AllGeo16/images.csv"; 090 // String geocsv = "/Users/ss/Development/data/flickrcsv.csv"; 091 092 // Prepare the sqlite connection 093 // final Connection connection = prepareDBSQLite(source + ".sqlite"); 094 final Connection connection = prepareDBmysql(); 095 connection.setAutoCommit(false); 096 prepareTables(connection); 097 insertGeo(geocsv, connection); 098 insertColours(source, connection); 099 connection.commit(); 100 connection.close(); 101 } 102 103 private static Connection prepareDBmysql() throws SQLException, InstantiationException, IllegalAccessException, 104 ClassNotFoundException 105 { 106 Connection conn = null; 107 final String url = "jdbc:mysql://leto/"; 108 final String driver = "com.mysql.jdbc.Driver"; 109 final String userName = "root"; 110 final String password = ""; 111 Class.forName(driver).newInstance(); 112 conn = DriverManager.getConnection(url, userName, password); 113 return conn; 114 } 115 116 private static void insertGeo(String source, Connection connection) throws IOException, SQLException { 117 final File f = new File(source); 118 final PreparedStatement statement = connection.prepareStatement(INSERT_LATLON); 119 final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f))); 120 String line = null; 121 int done = 0; 122 while ((line = reader.readLine()) != null) { 123 final String[] linesplit = line.split(CVS_REGEX); 124 try { 125 126 statement.setLong(1, Long.parseLong(linesplit[2].trim())); 127 statement.setFloat(2, Float.parseFloat(linesplit[15].trim())); 128 statement.setFloat(3, Float.parseFloat(linesplit[16].trim())); 129 130 statement.setTimestamp(4, asDate(linesplit[10].trim())); 131 statement.setTimestamp(5, asDate(linesplit[11].trim())); 132 statement.executeUpdate(); 133 done++; 134 if (done % 50000 == 0) { 135 System.out.println("commiting geo: " + done); 136 connection.commit(); 137 } 138 } catch (final Exception e) { 139 System.out.println("Failed writing: \n" + line + "\n to database"); 140 } 141 } 142 reader.close(); 143 return; 144 145 } 146 147 private static Timestamp asDate(String trim) throws ParseException { 148 final SimpleDateFormat format = new SimpleDateFormat("EE MMM dd HH:mm:ss zz yyyy", Locale.US); 149 final java.util.Date t = format.parse(trim); 150 return new java.sql.Timestamp(t.getTime()); 151 } 152 153 private static void insertColours(String source, Connection connection) throws SQLException, IOException { 154 final PreparedStatement statement = connection.prepareStatement(INSERT_COLOUR); 155 final File[] files = new File(source).listFiles(new FilenameFilter() { 156 @Override 157 public boolean accept(File f, String name) { 158 return name.startsWith("binary"); 159 } 160 161 }); 162 for (final File file : files) { 163 System.out.println("Reading from " + file); 164 final ReadableMapBinary<Long, FloatFV> readableMap = new ReadableMapBinary<Long, FloatFV>( 165 new HashMap<Long, FloatFV>()) 166 { 167 168 @Override 169 protected Long readKey(DataInput in) throws IOException { 170 171 return in.readLong(); 172 } 173 174 @Override 175 protected FloatFV readValue(DataInput in) throws IOException { 176 final FloatFV f = new FloatFV(); 177 f.readBinary(in); 178 return f; 179 } 180 181 @Override 182 public void readBinary(DataInput in) throws IOException { 183 final int sz = in.readInt(); 184 185 for (int i = 0; i < sz; i++) { 186 final Long key = readKey(in); 187 final FloatFV val = readValue(in); 188 try { 189 statement.setLong(1, key); 190 statement.setFloat(2, val.values[0]); 191 statement.setFloat(3, val.values[1]); 192 statement.setFloat(4, val.values[2]); 193 statement.executeUpdate(); 194 } catch (final SQLException e) { 195 throw new IOException("Couldn't"); 196 } 197 } 198 } 199 }; 200 IOUtils.read(file, readableMap); 201 connection.commit(); 202 } 203 } 204 205 private static void prepareTables(Connection connection) throws SQLException, IOException { 206 // Read the table def file 207 final String sql = FileUtils.readall(GlobalFlickrColour.class 208 .getResourceAsStream("/org/openimaj/demos/sandbox/flickr/geo/geoflickrcolour.sql")); 209 final Statement statement = connection.createStatement(); 210 final String[] vals = sql.split(";"); 211 for (String str : vals) { 212 str = str.trim(); 213 if (str.length() == 0) 214 continue; 215 statement.executeUpdate(str.trim()); 216 } 217 } 218 219 private static Connection prepareDBSQLite(String location) throws SQLException, ClassNotFoundException, IOException { 220 // load the sqlite-JDBC driver using the current class loader 221 Class.forName("org.sqlite.JDBC"); 222 FileToolsUtil.validateLocalOutput(location, true, false); 223 Connection connection = null; 224 connection = DriverManager.getConnection("jdbc:sqlite:" + location); 225 226 return connection; 227 } 228 229 private static void saveSEQFileVersion() throws Exception { 230 final String seqFileSource = "/Users/ss/Development/data/flickr-all-geo-16-46M-images-maxhistogram.seq"; 231 final String output = "/Users/ss/Development/data/flickr-all-geo-16-46M-images-maxhistogram.binary"; 232 final File ofile = FileToolsUtil.validateLocalOutput(output, true, false); 233 ofile.mkdirs(); 234 235 final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(seqFileSource, "part"); 236 final Configuration config = new Configuration(); 237 config.setQuietMode(true); 238 final Map<Long, FloatFV> flickrMaxHist = new HashMap<Long, FloatFV>(); 239 int total = 0; 240 int writeCount = 0; 241 for (final Path path : sequenceFiles) { 242 // System.out.println("Extracting from " + path.getName()); 243 System.out.print("."); 244 total++; 245 if (total % 40 == 0) 246 System.out.println(flickrMaxHist.size()); 247 final FileSystem fs = HadoopToolsUtil.getFileSystem(path); 248 final Reader reader = new Reader(config, 249 Reader.file(path.makeQualified(fs.getUri(), fs.getWorkingDirectory()))); 250 final Text key = org.apache.hadoop.util.ReflectionUtils.newInstance(Text.class, config); 251 final BytesWritable val = org.apache.hadoop.util.ReflectionUtils.newInstance(BytesWritable.class, config); 252 while (reader.next(key, val)) { 253 final FloatFV fv = IOUtils.deserialize(val.getBytes(), FloatFV.class); 254 // System.out.println(key + ": " + fv); 255 flickrMaxHist.put(Long.parseLong(key.toString().trim()), fv); 256 } 257 reader.close(); 258 if (flickrMaxHist.size() > COUNT_PER_WRITE) { 259 System.out.println(); 260 System.out.println("Writing values:" + flickrMaxHist.size()); 261 final WriteableMapBinary<Long, FloatFV> writeMap = new WriteableMapBinary<Long, FloatFV>(flickrMaxHist) { 262 @Override 263 protected void writeKey(Long key, DataOutput out) throws IOException { 264 out.writeLong(key); 265 } 266 267 @Override 268 protected void writeValue(FloatFV value, DataOutput out) throws IOException { 269 value.writeBinary(out); 270 } 271 272 }; 273 final File writeName = new File(ofile, String.format(WRITE_FILE_NAME, writeCount)); 274 System.out.println("writing to: " + writeName); 275 IOUtils.writeBinary(writeName, writeMap); 276 flickrMaxHist.clear(); 277 writeCount++; 278 } 279 } 280 } 281}