001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.demos.sandbox.flickr.geo;
031
032import java.io.BufferedReader;
033import java.io.DataInput;
034import java.io.DataOutput;
035import java.io.File;
036import java.io.FileInputStream;
037import java.io.FilenameFilter;
038import java.io.IOException;
039import java.io.InputStreamReader;
040import java.sql.Connection;
041import java.sql.DriverManager;
042import java.sql.PreparedStatement;
043import java.sql.SQLException;
044import java.sql.Statement;
045import java.sql.Timestamp;
046import java.text.ParseException;
047import java.text.SimpleDateFormat;
048import java.util.HashMap;
049import java.util.Locale;
050import java.util.Map;
051
052import org.apache.hadoop.conf.Configuration;
053import org.apache.hadoop.fs.FileSystem;
054import org.apache.hadoop.fs.Path;
055import org.apache.hadoop.io.BytesWritable;
056import org.apache.hadoop.io.SequenceFile.Reader;
057import org.apache.hadoop.io.Text;
058import org.apache.log4j.Level;
059import org.apache.log4j.Logger;
060import org.openimaj.feature.FloatFV;
061import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
062import org.openimaj.hadoop.tools.HadoopToolsUtil;
063import org.openimaj.io.FileUtils;
064import org.openimaj.io.IOUtils;
065import org.openimaj.io.wrappers.ReadableMapBinary;
066import org.openimaj.io.wrappers.WriteableMapBinary;
067import org.openimaj.tools.FileToolsUtil;
068
069public class GlobalFlickrColour {
070        private static final int COUNT_PER_WRITE = 5000000;
071        private static final String WRITE_FILE_NAME = "binary_long_floatfv_%d";
072        protected static final String INSERT_COLOUR = "insert into colour values (?, ?, ?, ?)";
073        protected static final String INSERT_LATLON = "insert into latlong values (?, ?, ?, ?, ?)";
074        final static String CVS_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))";
075        static {
076                Logger.getRootLogger().setLevel(Level.ERROR);
077        }
078
079        public static void main(String[] args) throws Exception {
080                // saveSEQFileVersion();
081                loadBinaryMapVersion();
082
083        }
084
085        private static void loadBinaryMapVersion() throws IOException, SQLException, ClassNotFoundException,
086                        InstantiationException, IllegalAccessException
087        {
088                final String source = "/Users/ss/Development/data/flickr-all-geo-16-46M-images-maxhistogram.binary";
089                final String geocsv = "/Volumes/Raid/FlickrCrawls/AllGeo16/images.csv";
090                // String geocsv = "/Users/ss/Development/data/flickrcsv.csv";
091
092                // Prepare the sqlite connection
093                // final Connection connection = prepareDBSQLite(source + ".sqlite");
094                final Connection connection = prepareDBmysql();
095                connection.setAutoCommit(false);
096                prepareTables(connection);
097                insertGeo(geocsv, connection);
098                insertColours(source, connection);
099                connection.commit();
100                connection.close();
101        }
102
103        private static Connection prepareDBmysql() throws SQLException, InstantiationException, IllegalAccessException,
104                        ClassNotFoundException
105        {
106                Connection conn = null;
107                final String url = "jdbc:mysql://leto/";
108                final String driver = "com.mysql.jdbc.Driver";
109                final String userName = "root";
110                final String password = "";
111                Class.forName(driver).newInstance();
112                conn = DriverManager.getConnection(url, userName, password);
113                return conn;
114        }
115
116        private static void insertGeo(String source, Connection connection) throws IOException, SQLException {
117                final File f = new File(source);
118                final PreparedStatement statement = connection.prepareStatement(INSERT_LATLON);
119                final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
120                String line = null;
121                int done = 0;
122                while ((line = reader.readLine()) != null) {
123                        final String[] linesplit = line.split(CVS_REGEX);
124                        try {
125
126                                statement.setLong(1, Long.parseLong(linesplit[2].trim()));
127                                statement.setFloat(2, Float.parseFloat(linesplit[15].trim()));
128                                statement.setFloat(3, Float.parseFloat(linesplit[16].trim()));
129
130                                statement.setTimestamp(4, asDate(linesplit[10].trim()));
131                                statement.setTimestamp(5, asDate(linesplit[11].trim()));
132                                statement.executeUpdate();
133                                done++;
134                                if (done % 50000 == 0) {
135                                        System.out.println("commiting geo: " + done);
136                                        connection.commit();
137                                }
138                        } catch (final Exception e) {
139                                System.out.println("Failed writing: \n" + line + "\n to database");
140                        }
141                }
142                reader.close();
143                return;
144
145        }
146
147        private static Timestamp asDate(String trim) throws ParseException {
148                final SimpleDateFormat format = new SimpleDateFormat("EE MMM dd HH:mm:ss zz yyyy", Locale.US);
149                final java.util.Date t = format.parse(trim);
150                return new java.sql.Timestamp(t.getTime());
151        }
152
153        private static void insertColours(String source, Connection connection) throws SQLException, IOException {
154                final PreparedStatement statement = connection.prepareStatement(INSERT_COLOUR);
155                final File[] files = new File(source).listFiles(new FilenameFilter() {
156                        @Override
157                        public boolean accept(File f, String name) {
158                                return name.startsWith("binary");
159                        }
160
161                });
162                for (final File file : files) {
163                        System.out.println("Reading from " + file);
164                        final ReadableMapBinary<Long, FloatFV> readableMap = new ReadableMapBinary<Long, FloatFV>(
165                                        new HashMap<Long, FloatFV>())
166                        {
167
168                                @Override
169                                protected Long readKey(DataInput in) throws IOException {
170
171                                        return in.readLong();
172                                }
173
174                                @Override
175                                protected FloatFV readValue(DataInput in) throws IOException {
176                                        final FloatFV f = new FloatFV();
177                                        f.readBinary(in);
178                                        return f;
179                                }
180
181                                @Override
182                                public void readBinary(DataInput in) throws IOException {
183                                        final int sz = in.readInt();
184
185                                        for (int i = 0; i < sz; i++) {
186                                                final Long key = readKey(in);
187                                                final FloatFV val = readValue(in);
188                                                try {
189                                                        statement.setLong(1, key);
190                                                        statement.setFloat(2, val.values[0]);
191                                                        statement.setFloat(3, val.values[1]);
192                                                        statement.setFloat(4, val.values[2]);
193                                                        statement.executeUpdate();
194                                                } catch (final SQLException e) {
195                                                        throw new IOException("Couldn't");
196                                                }
197                                        }
198                                }
199                        };
200                        IOUtils.read(file, readableMap);
201                        connection.commit();
202                }
203        }
204
205        private static void prepareTables(Connection connection) throws SQLException, IOException {
206                // Read the table def file
207                final String sql = FileUtils.readall(GlobalFlickrColour.class
208                                .getResourceAsStream("/org/openimaj/demos/sandbox/flickr/geo/geoflickrcolour.sql"));
209                final Statement statement = connection.createStatement();
210                final String[] vals = sql.split(";");
211                for (String str : vals) {
212                        str = str.trim();
213                        if (str.length() == 0)
214                                continue;
215                        statement.executeUpdate(str.trim());
216                }
217        }
218
219        private static Connection prepareDBSQLite(String location) throws SQLException, ClassNotFoundException, IOException {
220                // load the sqlite-JDBC driver using the current class loader
221                Class.forName("org.sqlite.JDBC");
222                FileToolsUtil.validateLocalOutput(location, true, false);
223                Connection connection = null;
224                connection = DriverManager.getConnection("jdbc:sqlite:" + location);
225
226                return connection;
227        }
228
229        private static void saveSEQFileVersion() throws Exception {
230                final String seqFileSource = "/Users/ss/Development/data/flickr-all-geo-16-46M-images-maxhistogram.seq";
231                final String output = "/Users/ss/Development/data/flickr-all-geo-16-46M-images-maxhistogram.binary";
232                final File ofile = FileToolsUtil.validateLocalOutput(output, true, false);
233                ofile.mkdirs();
234
235                final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(seqFileSource, "part");
236                final Configuration config = new Configuration();
237                config.setQuietMode(true);
238                final Map<Long, FloatFV> flickrMaxHist = new HashMap<Long, FloatFV>();
239                int total = 0;
240                int writeCount = 0;
241                for (final Path path : sequenceFiles) {
242                        // System.out.println("Extracting from " + path.getName());
243                        System.out.print(".");
244                        total++;
245                        if (total % 40 == 0)
246                                System.out.println(flickrMaxHist.size());
247                        final FileSystem fs = HadoopToolsUtil.getFileSystem(path);
248                        final Reader reader = new Reader(config,
249                                        Reader.file(path.makeQualified(fs.getUri(), fs.getWorkingDirectory())));
250                        final Text key = org.apache.hadoop.util.ReflectionUtils.newInstance(Text.class, config);
251                        final BytesWritable val = org.apache.hadoop.util.ReflectionUtils.newInstance(BytesWritable.class, config);
252                        while (reader.next(key, val)) {
253                                final FloatFV fv = IOUtils.deserialize(val.getBytes(), FloatFV.class);
254                                // System.out.println(key + ": " + fv);
255                                flickrMaxHist.put(Long.parseLong(key.toString().trim()), fv);
256                        }
257                        reader.close();
258                        if (flickrMaxHist.size() > COUNT_PER_WRITE) {
259                                System.out.println();
260                                System.out.println("Writing values:" + flickrMaxHist.size());
261                                final WriteableMapBinary<Long, FloatFV> writeMap = new WriteableMapBinary<Long, FloatFV>(flickrMaxHist) {
262                                        @Override
263                                        protected void writeKey(Long key, DataOutput out) throws IOException {
264                                                out.writeLong(key);
265                                        }
266
267                                        @Override
268                                        protected void writeValue(FloatFV value, DataOutput out) throws IOException {
269                                                value.writeBinary(out);
270                                        }
271
272                                };
273                                final File writeName = new File(ofile, String.format(WRITE_FILE_NAME, writeCount));
274                                System.out.println("writing to: " + writeName);
275                                IOUtils.writeBinary(writeName, writeMap);
276                                flickrMaxHist.clear();
277                                writeCount++;
278                        }
279                }
280        }
281}