001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.web.layout; 031 032import java.net.URL; 033import java.util.ArrayList; 034import java.util.HashSet; 035import java.util.List; 036import java.util.Set; 037import java.util.concurrent.TimeoutException; 038 039import org.apache.log4j.Logger; 040import org.openimaj.image.MBFImage; 041import org.openimaj.image.colour.ColourSpace; 042import org.openimaj.image.renderer.MBFImageRenderer; 043import org.openimaj.math.geometry.shape.Rectangle; 044import org.openimaj.web.ProgrammaticBrowser; 045import org.openimaj.web.readability.Readability; 046import org.w3c.dom.Element; 047import org.w3c.dom.Node; 048import org.w3c.dom.NodeList; 049 050import com.trolltech.qt.webkit.QWebElement; 051import com.trolltech.qt.webkit.QWebElementCollection; 052 053/** 054 * Class for extracting information on the layout of DOM elements in 055 * a web page. 056 * 057 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 058 * 059 */ 060public class LayoutExtractor { 061 private static final String GEN_ID = "__openimaj_gen_id_"; 062 063 private static final Logger logger = Logger.getLogger(LayoutExtractor.class); 064 065 private ProgrammaticBrowser browser; 066 067 private long timeout = 0; 068 069 /** 070 * Default constructor 071 */ 072 public LayoutExtractor() { 073 browser = new ProgrammaticBrowser(); 074 } 075 076 /** 077 * Default constructor 078 * @param timeout 079 */ 080 public LayoutExtractor(long timeout) { 081 this(); 082 this.timeout = timeout; 083 } 084 085 /** 086 * Load a web page from a URL 087 * @param url the url 088 * @return true if successful; false otherwise 089 */ 090 public boolean load(String url) { 091 boolean ret; 092 try { 093 ret = browser.load(url, timeout); 094 } catch (TimeoutException e) { 095 return false; 096 } 097 098 if (ret) augmentDOM(); 099 100 return ret; 101 } 102 103 /** 104 * Load a web page from a URL 105 * @param url the url 106 * @return true if successful; false otherwise 107 */ 108 public boolean load(URL url) { 109 boolean ret; 110 try { 111 ret = browser.load(url, timeout); 112 } catch (TimeoutException e) { 113 return false; 114 } 115 116 if (ret) augmentDOM(); 117 118 return ret; 119 } 120 121 /** 122 * Load a web page from an HTML string 123 * @param html the HTML string 124 * @return true if successful; false otherwise 125 */ 126 public boolean loadHTML(String html) { 127 boolean ret = browser.loadHTML(html); 128 129 if (ret) augmentDOM(); 130 131 return ret; 132 } 133 134 private void augmentDOM() { 135 QWebElement body = getBody(); 136 137 if (body == null) { 138 logger.warn("body not found"); 139 return; 140 } 141 142 QWebElementCollection nl = body.findAll("*"); 143 for (int i=0; i<nl.count(); i++) { 144 QWebElement ei = nl.at(i); 145 146 if (ei.attribute("id") == null || ei.attribute("id").equals("")) { 147 ei.setAttribute("id", GEN_ID+i); 148 } 149 } 150 } 151 152 /** 153 * Get the layout info of the page 154 * 155 * @return information about the page layout 156 */ 157 public List<ElementInfo> getLayoutInfo() { 158 List<ElementInfo> info = new ArrayList<ElementInfo>(); 159 160 Set<String> contentIds = getContentIds(); 161 162 QWebElementCollection elements = browser.findAllElements("*"); 163 164 for (int i=0; i<elements.count(); i++) { 165 ElementInfo ei = new ElementInfo(); 166 167 ei.element = elements.at(i); 168 169 ei.bounds = new Rectangle( 170 ei.element.geometry().left(), 171 ei.element.geometry().top(), 172 ei.element.geometry().width(), 173 ei.element.geometry().height() 174 ); 175 176 if (contentIds.contains(ei.element.attribute("id"))) { 177 ei.isContent = true; 178 } 179 180 QWebElement parent = ei.element; 181 while (!(parent = parent.parent()).isNull()) { 182 String id = parent.attribute("id"); 183 184 if (contentIds.contains(id)) { 185 ei.isInsideContent = true; 186 break; 187 } 188 } 189 190 info.add(ei); 191 } 192 193 return info; 194 } 195 196 /** 197 * Render ALL the content boxes to a new image in the given color 198 * @param color Color 199 * @return new image illustrating ALL content boxes 200 */ 201 public MBFImage renderLayoutInfo(Float[] color) { 202 int w = browser.getWidth(); 203 int h = browser.getHeight(); 204 205 //Pixel p = LayoutUtils.renderSize(page.mainFrame()); 206 MBFImage image = new MBFImage(w, h, ColourSpace.RGB); 207 return renderLayoutInfo(image, color); 208 } 209 210 /** 211 * Render ALL the content boxes to the given image in the given color 212 * @param image Image to draw on top of 213 * @param colour Color 214 * @return the rendered image 215 */ 216 public MBFImage renderLayoutInfo(MBFImage image, Float[] colour) { 217 MBFImageRenderer renderer = image.createRenderer(); 218 219 for (ElementInfo e : getLayoutInfo()) { 220 Rectangle r = e.getBounds(); 221 renderer.drawShape(r, colour); 222 } 223 224 return image; 225 } 226 227 /** 228 * Get the BODY element of the loaded page 229 * @return body element or null if it doesn't exist 230 */ 231 public QWebElement getBody() { 232 return browser.getBody(); 233 } 234 235 protected String nodeToString(QWebElement n, boolean pretty) { 236 return n.toOuterXml(); 237 } 238 239 private Set<String> getContentIds() { 240 Set<String> ids = new HashSet<String>(); 241 try { 242 String html = browser.getHTML(); 243 244 Readability r = Readability.getReadability(html); 245 246 Element d = (Element) r.getArticleHTML_DOM(); 247 if (d==null) return ids; 248 NodeList nl = d.getElementsByTagName("*"); 249 250 for (int i=0; i<nl.getLength(); i++) { 251 Node idnode = nl.item(i).getAttributes().getNamedItem("id"); 252 if (idnode != null) { 253 ids.add(idnode.getNodeValue()); 254 } 255 } 256 } catch (Exception e) { 257 logger.error("Error finding content ids: " + e); 258 } 259 260 return ids; 261 } 262 263 /** 264 * Render the layout of the content. 265 * @param contentColour Colour for content 266 * @param nonContent Colour for non-content 267 * @param nonContentInside Colour for non-content inside content 268 * @return rendered image with boxes 269 */ 270 public MBFImage renderContentLayout(Float[] contentColour, Float [] nonContent, Float [] nonContentInside) { 271 int w = browser.getWidth(); 272 int h = browser.getHeight(); 273 274 MBFImage image = new MBFImage(w, h, ColourSpace.RGB); 275 return renderContentLayout(image, contentColour, nonContent, nonContentInside); 276 } 277 278 /** 279 * Render the layout of the content. 280 * @param image image to draw into 281 * @param contentColour Colour for content 282 * @param nonContent Colour for non-content 283 * @param nonContentInside Colour for non-content inside content 284 * @return rendered image with boxes 285 */ 286 public MBFImage renderContentLayout(MBFImage image, Float[] contentColour, Float [] nonContent, Float [] nonContentInside) { 287 List<Rectangle> content_areas = new ArrayList<Rectangle>(); 288 List<Rectangle> non_content_areas = new ArrayList<Rectangle>(); 289 List<Rectangle> non_content_areas_inside = new ArrayList<Rectangle>(); 290 291 for (ElementInfo ei : getLayoutInfo()) { 292 if (ei.isContent) { 293 content_areas.add(ei.bounds); 294 } else if (ei.isInsideContent) { 295 non_content_areas_inside.add(ei.bounds); 296 } else { 297 non_content_areas.add(ei.bounds); 298 } 299 } 300 301 MBFImageRenderer renderer = image.createRenderer(); 302 for (Rectangle r : content_areas) { 303 renderer.drawShape(r, contentColour); 304 } 305 306 for (Rectangle r : non_content_areas_inside) { 307 renderer.drawShape(r, nonContentInside); 308 } 309 310 for (Rectangle r : non_content_areas) { 311 renderer.drawShape(r, nonContent); 312 } 313 314 return image; 315 } 316 317 /** 318 * Render the current page to an image 319 * @return an image of the current page, or null if there is no content 320 */ 321 public MBFImage render() { 322 return browser.renderToImage(); 323 } 324 325 /** 326 * Render the current page to an image of the given size or smaller 327 * @param maxwidth 328 * @param maxheight 329 * @return an image of the current page, or null if there is no content 330 */ 331 public MBFImage render(int maxwidth, int maxheight) { 332 return browser.renderToImage(maxwidth, maxheight); 333 } 334 335 /** 336 * Run the browser for ms milliseconds. This 337 * allows it to update its content, etc. 338 * @param ms time to wait 339 */ 340 public void waitForBrowser(long ms) { 341 browser.mainLoop(ms); 342 } 343}