001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.web;
031
032import java.io.File;
033import java.io.FileWriter;
034import java.io.IOException;
035import java.io.PrintStream;
036import java.io.PrintWriter;
037import java.util.List;
038
039import org.kohsuke.args4j.Argument;
040import org.kohsuke.args4j.CmdLineException;
041import org.kohsuke.args4j.CmdLineParser;
042import org.kohsuke.args4j.Option;
043import org.openimaj.image.ImageUtilities;
044import org.openimaj.image.MBFImage;
045import org.openimaj.image.colour.RGBColour;
046import org.openimaj.image.processing.resize.ResizeProcessor;
047import org.openimaj.web.layout.ElementInfo;
048import org.openimaj.web.layout.LayoutExtractor;
049
050/**
051 * Tool for extracting information from rendered webpages. 
052 * 
053 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
054 *
055 */
056public class LayoutExtractorTool {
057        private static final int THUMBNAIL_HEIGHT = 100;
058        private static final int THUMBNAIL_WIDTH = 100;
059        private static final Float[] NON_CONTENT_COLOUR = RGBColour.CYAN;
060        private static final Float[] NON_CONTENT_INSIDE_COLOUR = RGBColour.RED;
061        private static final Float[] CONTENT_COLOUR = RGBColour.GREEN;
062
063        @Option(name = "--thumbnail", aliases="-t", usage = "Write a thumbnail image of the page", required=false)
064        File thumbnailFile;
065        
066        @Option(name = "--render", aliases="-r", usage = "Write a rendered image of the page", required=false)
067        File renderFile;
068        
069        @Option(name = "--layout", aliases="-l", usage = "Write the layout information in CSV format. Passing \"-\" will cause the data to be written to STDOUT", required=false)
070        File layoutFile;
071        
072        @Option(name = "--layout-render", aliases="-lr", usage = "Write the layout information as an image", required=false)
073        File layoutRender;
074        
075        @Option(name = "--layout-render-overlay", aliases="-lro", usage = "Write the layout information as an image, overlayed on a render of the page", required=false)
076        File layoutRenderOverlayed;
077        
078        @Option(name = "--content-layout-render", aliases="-clr", usage = "Write the content layout information as an image", required=false)
079        File contentLayoutRender;
080        
081        @Option(name = "--content-layout-render-overlay", aliases="-clro", usage = "Write the content layout information as an image, overlayed on a render of the page", required=false)
082        File contentLayoutRenderOverlayed;
083        
084        @Argument()
085        String url;
086        
087        LayoutExtractor extractor = new LayoutExtractor();
088        MBFImage render;
089        
090        protected void writeLayout() throws IOException {
091                List<ElementInfo> info = extractor.getLayoutInfo();
092                PrintWriter pw;
093                
094                if (layoutFile.getName().equals("-")) {
095                        pw = new PrintWriter(System.out);
096                } else {
097                        pw = new PrintWriter(new FileWriter(layoutFile));
098                }
099                
100                pw.println(ElementInfo.getCSVHeader());
101                for (ElementInfo ei : info) {
102                        pw.println(ei.toCSVString());
103                }
104                
105                if (!layoutFile.getName().equals("-")) {
106                        pw.close();
107                }
108        }
109                
110        protected MBFImage getRender() {
111                if (render == null)
112                        render = extractor.render();
113                return render;
114        }
115        
116        /**
117         * Extract content.
118         * @throws IOException
119         */
120        public void extractContent() throws IOException {
121                if (!extractor.load(url)) {
122                        System.err.println("Error loading page: " + url);
123                        System.exit(1);
124                }
125                
126                if (layoutFile != null) writeLayout();
127                
128                if (thumbnailFile != null) {
129                        MBFImage image = getRender();
130                        
131                        //crop first if its very long
132                        if (image.getHeight() > 1.5 * image.getWidth()) {
133                                image = image.extractROI(0, 0, image.getWidth(), image.getWidth());
134                        }
135                        
136                        MBFImage thumb = image.process(new ResizeProcessor(THUMBNAIL_WIDTH, THUMBNAIL_HEIGHT));
137                        ImageUtilities.write(thumb, thumbnailFile);
138                }
139                
140                if (renderFile != null) {
141                        ImageUtilities.write(getRender(), renderFile);
142                }
143                
144                if (layoutRender != null) {
145                        ImageUtilities.write(extractor.renderLayoutInfo(RGBColour.BLACK), layoutRender);
146                }
147                
148                if (layoutRenderOverlayed != null) {
149                        ImageUtilities.write(extractor.renderLayoutInfo(getRender(), RGBColour.RED), layoutRenderOverlayed);
150                }
151                
152                if (contentLayoutRender != null) {
153                        ImageUtilities.write(extractor.renderContentLayout(CONTENT_COLOUR, NON_CONTENT_INSIDE_COLOUR, NON_CONTENT_COLOUR), contentLayoutRender);
154                }
155                
156                if (contentLayoutRenderOverlayed != null) {
157                        ImageUtilities.write(extractor.renderContentLayout(getRender(), CONTENT_COLOUR, NON_CONTENT_INSIDE_COLOUR, NON_CONTENT_COLOUR), contentLayoutRenderOverlayed);
158                }
159        }
160        
161        /**
162         * Main method
163         * @param args
164         * @throws IOException
165         */
166        public static void main(String [] args) throws IOException {
167                System.setOut(new PrintStream(System.out, true, "UTF-8"));
168                
169                LayoutExtractorTool extractor = new LayoutExtractorTool();
170                CmdLineParser parser = new CmdLineParser(extractor);
171                
172                try {
173                    parser.parseArgument(args);
174                } catch(CmdLineException e) {
175                    System.err.println(e.getMessage());
176                    System.err.println("Usage: java -jar LayoutExtractor.jar [options...]");
177                    parser.printUsage(System.err);
178                    return;
179                }
180                
181                extractor.extractContent();
182        }
183}