001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.web;
031
032import java.io.BufferedReader;
033import java.io.File;
034import java.io.FileInputStream;
035import java.io.FileWriter;
036import java.io.IOException;
037import java.io.InputStreamReader;
038import java.io.PrintStream;
039import java.io.PrintWriter;
040import java.util.List;
041
042import org.openimaj.image.ImageUtilities;
043import org.openimaj.image.MBFImage;
044import org.openimaj.web.layout.ElementInfo;
045import org.openimaj.web.layout.LayoutExtractor;
046
047/**
048 * Extract features from the webpages listed in files created by
049 * {@link Dmoz2CSV}.
050 * 
051 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
052 * 
053 */
054public class DmozExtractFeatures {
055        final static String csvregex = ",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))";
056
057        /**
058         * Main method. First arg is the csv; second is the output directory.
059         * 
060         * 
061         * @param args
062         * @throws IOException
063         */
064        public static void main(String[] args) throws IOException {
065                final File inputCSV = new File(args[0]);
066                final File outputDirBase = new File(args[1]);
067
068                System.setOut(new PrintStream(System.out, true, "UTF-8"));
069
070                final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inputCSV), "UTF-8"));
071
072                String it;
073                while ((it = br.readLine()) != null) {
074                        final String[] parts = it.split(csvregex);
075
076                        final String url = parts[2];
077                        System.out.println(url);
078
079                        final File dir = new File(outputDirBase, parts[0].replace("\"", "") + "/" + parts[1] + "/"
080                                        + url.replace(":", "|").replace("/", "_"));
081                        final File layoutfile = new File(dir, "layout.csv");
082                        final File imagefile = new File(dir, "render.png");
083
084                        if (dir.exists())
085                                continue;
086                        if (!dir.mkdirs())
087                                continue;
088
089                        final LayoutExtractor le = new LayoutExtractor(30000L); // timeout
090                                                                                                                                        // after 30s
091                        if (le.load(url)) {
092                                final PrintWriter layoutfilePW = new PrintWriter(new FileWriter(layoutfile));
093
094                                final List<ElementInfo> info = le.getLayoutInfo();
095                                layoutfilePW.println(ElementInfo.getCSVHeader());
096                                for (final ElementInfo ei : info) {
097                                        layoutfilePW.println(ei.toCSVString());
098                                }
099
100                                layoutfilePW.close();
101
102                                final MBFImage image = le.render(1024, 768);
103                                if (image != null)
104                                        ImageUtilities.write(image, imagefile);
105                        }
106                }
107
108                br.close();
109        }
110}