/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.tools.web;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.List;

import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.openimaj.image.ImageUtilities;
import org.openimaj.image.MBFImage;
import org.openimaj.image.colour.RGBColour;
import org.openimaj.image.processing.resize.ResizeProcessor;
import org.openimaj.web.layout.ElementInfo;
import org.openimaj.web.layout.LayoutExtractor;

/**
 * Tool for extracting information from rendered webpages.
 * <p>
 * Each output (thumbnail, render, CSV layout, layout images) is optional and
 * is only produced when the corresponding command-line option is supplied.
 *
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 *
 */
public class LayoutExtractorTool {
    private static final int THUMBNAIL_HEIGHT = 100;
    private static final int THUMBNAIL_WIDTH = 100;
    private static final Float[] NON_CONTENT_COLOUR = RGBColour.CYAN;
    private static final Float[] NON_CONTENT_INSIDE_COLOUR = RGBColour.RED;
    private static final Float[] CONTENT_COLOUR = RGBColour.GREEN;

    /** Name used for the layout file to request output on STDOUT. */
    private static final String STDOUT_MARKER = "-";

    /** Encoding used when writing layout data to STDOUT; matches the stream set up in {@link #main(String[])}. */
    private static final String STDOUT_ENCODING = "UTF-8";

    @Option(name = "--thumbnail", aliases="-t", usage = "Write a thumbnail image of the page", required=false)
    File thumbnailFile;

    @Option(name = "--render", aliases="-r", usage = "Write a rendered image of the page", required=false)
    File renderFile;

    @Option(name = "--layout", aliases="-l", usage = "Write the layout information in CSV format. Passing \"-\" will cause the data to be written to STDOUT", required=false)
    File layoutFile;

    @Option(name = "--layout-render", aliases="-lr", usage = "Write the layout information as an image", required=false)
    File layoutRender;

    @Option(name = "--layout-render-overlay", aliases="-lro", usage = "Write the layout information as an image, overlayed on a render of the page", required=false)
    File layoutRenderOverlayed;

    @Option(name = "--content-layout-render", aliases="-clr", usage = "Write the content layout information as an image", required=false)
    File contentLayoutRender;

    @Option(name = "--content-layout-render-overlay", aliases="-clro", usage = "Write the content layout information as an image, overlayed on a render of the page", required=false)
    File contentLayoutRenderOverlayed;

    // The URL is mandatory: without it there is no page to load, so let the
    // parser reject the command line instead of passing null to the extractor.
    @Argument(required = true, usage = "URL of the page to process")
    String url;

    LayoutExtractor extractor = new LayoutExtractor();

    /** Cached render of the page; populated on first call to {@link #getRender()}. */
    MBFImage render;

    /**
     * Write the layout information of the loaded page in CSV form: the header
     * from {@link ElementInfo#getCSVHeader()} followed by one line per element.
     * <p>
     * If the name of {@link #layoutFile} is "-", the data is written to STDOUT
     * (which is flushed but left open); otherwise it is written to the file,
     * which is always closed, even if writing fails.
     *
     * @throws IOException
     *             if the file cannot be opened or an error occurs while writing
     */
    protected void writeLayout() throws IOException {
        final List<ElementInfo> info = extractor.getLayoutInfo();
        final boolean toStdout = layoutFile.getName().equals(STDOUT_MARKER);

        final PrintWriter pw;
        if (toStdout) {
            // Encode explicitly; PrintWriter(OutputStream) would fall back to
            // the platform default charset regardless of how System.out was set up.
            pw = new PrintWriter(new OutputStreamWriter(System.out, STDOUT_ENCODING));
        } else {
            pw = new PrintWriter(new FileWriter(layoutFile));
        }

        try {
            pw.println(ElementInfo.getCSVHeader());
            for (final ElementInfo ei : info) {
                pw.println(ei.toCSVString());
            }

            // PrintWriter swallows IOExceptions; checkError() flushes and
            // reports whether any write failed.
            if (pw.checkError()) {
                throw new IOException("Error writing layout information to "
                        + (toStdout ? "STDOUT" : layoutFile.toString()));
            }
        } finally {
            if (toStdout) {
                // Never close System.out, but make sure buffered data is not lost.
                pw.flush();
            } else {
                pw.close();
            }
        }
    }

    /**
     * Get the render of the page, rendering it on first use and returning the
     * cached image on subsequent calls.
     *
     * @return the rendered page image
     */
    protected MBFImage getRender() {
        if (render == null) {
            render = extractor.render();
        }
        return render;
    }

    /**
     * Load the page at {@link #url} and write every output that was requested
     * on the command line. If the page cannot be loaded, an error is printed
     * to STDERR and the JVM exits with status 1.
     *
     * @throws IOException
     *             if any of the outputs cannot be written
     */
    public void extractContent() throws IOException {
        if (!extractor.load(url)) {
            System.err.println("Error loading page: " + url);
            System.exit(1);
        }

        if (layoutFile != null) {
            writeLayout();
        }

        if (thumbnailFile != null) {
            MBFImage image = getRender();

            // Crop very long pages to a square (width x width) region from the
            // top-left corner before resizing.
            if (image.getHeight() > 1.5 * image.getWidth()) {
                image = image.extractROI(0, 0, image.getWidth(), image.getWidth());
            }

            final MBFImage thumb = image.process(new ResizeProcessor(THUMBNAIL_WIDTH, THUMBNAIL_HEIGHT));
            ImageUtilities.write(thumb, thumbnailFile);
        }

        if (renderFile != null) {
            ImageUtilities.write(getRender(), renderFile);
        }

        if (layoutRender != null) {
            ImageUtilities.write(extractor.renderLayoutInfo(RGBColour.BLACK), layoutRender);
        }

        if (layoutRenderOverlayed != null) {
            ImageUtilities.write(extractor.renderLayoutInfo(getRender(), RGBColour.RED), layoutRenderOverlayed);
        }

        if (contentLayoutRender != null) {
            ImageUtilities.write(
                    extractor.renderContentLayout(CONTENT_COLOUR, NON_CONTENT_INSIDE_COLOUR, NON_CONTENT_COLOUR),
                    contentLayoutRender);
        }

        if (contentLayoutRenderOverlayed != null) {
            ImageUtilities.write(
                    extractor.renderContentLayout(getRender(), CONTENT_COLOUR, NON_CONTENT_INSIDE_COLOUR,
                            NON_CONTENT_COLOUR),
                    contentLayoutRenderOverlayed);
        }
    }

    /**
     * Main method. Replaces {@code System.out} with an auto-flushing UTF-8
     * stream, parses the command line and runs the extraction. If the command
     * line is invalid, the error and usage are printed to STDERR and the
     * method returns without extracting anything.
     *
     * @param args
     *            the command-line arguments
     * @throws IOException
     *             if an output cannot be written
     */
    public static void main(String[] args) throws IOException {
        System.setOut(new PrintStream(System.out, true, "UTF-8"));

        final LayoutExtractorTool extractor = new LayoutExtractorTool();
        final CmdLineParser parser = new CmdLineParser(extractor);

        try {
            parser.parseArgument(args);
        } catch (final CmdLineException e) {
            System.err.println(e.getMessage());
            System.err.println("Usage: java -jar LayoutExtractor.jar [options...] URL");
            parser.printUsage(System.err);
            return;
        }

        extractor.extractContent();
    }
}