001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.downloader;
031
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.io.MD5Hash;
import org.kohsuke.args4j.CmdLineOptionsProvider;
import org.kohsuke.args4j.Option;
import org.openimaj.util.pair.IndependentPair;
040
041/**
042 * Different types of input file formats.
043 * 
044 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
045 */
046public enum InputMode implements CmdLineOptionsProvider {
047        /**
048         * Plain list-of-urls file. One URL per line.
049         * 
050         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
051         * 
052         */
053        PLAIN {
054                @Option(name = "-hash-keys", usage = "use the MD5SUM of the URL as the key, rather than the URL itself.")
055                boolean hashKeys = false;
056
057                @Override
058                public Parser getOptions() {
059                        return new Parser() {
060                                @Override
061                                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
062                                        String key = data;
063
064                                        if (hashKeys) {
065                                                key = MD5Hash.digest(key).toString();
066                                        }
067
068                                        final ArrayList<URL> value = new ArrayList<URL>();
069                                        value.add(new URL(data));
070
071                                        return new IndependentPair<String, List<URL>>(key, value);
072                                }
073                        };
074                }
075        },
076        /**
077         * List of URLs in the form provided by <a
078         * href="http://www.image-net.org">image-net</a>
079         * 
080         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
081         * 
082         */
083        IMAGE_NET {
084                @Override
085                public Parser getOptions() {
086                        return new Parser() {
087                                @Override
088                                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
089                                        // we expect a format [id]\t[url] as with the image-net url
090                                        // set
091                                        final String[] split = data.split("\t");
092                                        if (split.length != 2) {
093                                                throw new RuntimeException("Record is in the wrong format");
094                                        }
095
096                                        final String id = split[0].trim();
097                                        final String url = split[1].trim();
098
099                                        final ArrayList<URL> value = new ArrayList<URL>();
100                                        value.add(new URL(url));
101
102                                        return new IndependentPair<String, List<URL>>(id, value);
103                                }
104                        };
105                }
106        },
107        /**
108         * Wikipedia image URLs dump format
109         * 
110         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
111         * 
112         */
113        WIKIPEDIA_IMAGES_DUMP {
114                @Override
115                public Parser getOptions() {
116                        return new Parser() {
117                                @Option(
118                                                name = "--wikipedia-baseurl",
119                                                aliases = "-wbase",
120                                                required = false,
121                                                usage = "wikipedia upload files base urls. add many urls to check different locations for each image. defaults to upload.wikimedia.org/wikipedia/commons and upload.wikimedia.org/wikipedia/en",
122                                                multiValued = true)
123                                private List<String> wikipediaBase;
124
125                                @Override
126                                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
127                                        if (wikipediaBase == null) {
128                                                wikipediaBase = new ArrayList<String>();
129                                                wikipediaBase.add("http://upload.wikimedia.org/wikipedia/commons");
130                                                wikipediaBase.add("http://upload.wikimedia.org/wikipedia/en");
131                                        }
132
133                                        final String[] split = data.split(":");
134                                        if (split.length != 2) {
135                                                throw new RuntimeException("Record is in the wrong format");
136                                        }
137
138                                        final String hash = MD5Hash.digest(split[1]).toString();
139                                        final String dirStructure = String.format("%s/%s", hash.substring(0, 1), hash.substring(0, 2));
140
141                                        final ArrayList<URL> value = new ArrayList<URL>();
142                                        for (final String base : wikipediaBase) {
143                                                final String completeURL = String.format("%s/%s/%s", base, dirStructure,
144                                                                split[1].replace(" ", "_"));
145                                                value.add(new URL(completeURL));
146                                        }
147
148                                        return new IndependentPair<String, List<URL>>(data, value);
149                                }
150                        };
151                }
152        },
153        /**
154         * Parse urls and keys from a csv record.
155         * 
156         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
157         */
158        CSV {
159                @Override
160                public Parser getOptions() {
161                        return new CsvParser() {
162                                @Option(name = "--key-field")
163                                int keyField;
164                                @Option(name = "--url-field")
165                                int urlField;
166
167                                @Override
168                                public int getKeyField() {
169                                        return keyField;
170                                }
171
172                                @Override
173                                public int getUrlField() {
174                                        return urlField;
175                                }
176                        };
177                }
178        },
179        /**
180         * Parse the FlickrCrawler csv file to get the medium url of the image.
181         * 
182         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
183         * 
184         */
185        FLICKR_CSV_MEDIUM {
186                @Override
187                public Parser getOptions() {
188                        return new CsvParser() {
189                                @Override
190                                public int getKeyField() {
191                                        return 2;
192                                }
193
194                                @Override
195                                public int getUrlField() {
196                                        return 5;
197                                }
198                        };
199                }
200        };
201
        /**
         * Get the {@link Parser} for this input mode. Every constant of this enum
         * provides its own implementation, and each of them constructs a new
         * parser object per call. Mode-specific {@link Option} fields, where a
         * mode has any, are declared by the constant's implementation.
         * 
         * @return the parser for records of this input mode
         */
        @Override
        public abstract Parser getOptions();
204
205        /**
206         * Options for the {@link InputMode}
207         * 
208         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
209         */
210        public static abstract class Parser {
211                /**
212                 * Parse a record into a key and list of potential URLs. In most cases
213                 * there will only be a single potential URL in the list. The downloader
214                 * will work through the list until it finds a working URL, or exhausts
215                 * its options.
216                 * 
217                 * @param data
218                 *            the data record from the input file
219                 * @return the key and potential URLs
220                 * @throws Exception
221                 *             if an error occurs
222                 */
223                public abstract IndependentPair<String, List<URL>> parse(String data) throws Exception;
224        }
225
226        private static abstract class CsvParser extends Parser {
227                final static String CVS_REGEX = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))";
228
229                public abstract int getKeyField();
230
231                public abstract int getUrlField();
232
233                @Override
234                public IndependentPair<String, List<URL>> parse(String data) throws Exception {
235                        final String[] parts = data.split(CVS_REGEX);
236
237                        final String key = unescapeCSV(parts[getKeyField()]);
238                        final URL url = new URL(unescapeCSV(parts[getUrlField()]));
239
240                        final ArrayList<URL> value = new ArrayList<URL>();
241                        value.add(url);
242
243                        return new IndependentPair<String, List<URL>>(key, value);
244                }
245
246                private String unescapeCSV(String input) {
247                        if (input == null)
248                                return input;
249                        else if (input.length() < 2)
250                                return input;
251                        else if (input.charAt(0) != '"' || input.charAt(input.length() - 1) != '"')
252                                return input;
253                        else {
254                                String quoteless = input.substring(1, input.length() - 1);
255
256                                if (quoteless.contains(",") || quoteless.contains("\n") || quoteless.contains("\"")) {
257                                        quoteless = quoteless.replace("\"\"", "\"");
258                                }
259
260                                return quoteless;
261                        }
262                }
263        }
264}