001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.web.layout;
031
032import java.net.URL;
033import java.util.ArrayList;
034import java.util.HashSet;
035import java.util.List;
036import java.util.Set;
037import java.util.concurrent.TimeoutException;
038
039import org.apache.log4j.Logger;
040import org.openimaj.image.MBFImage;
041import org.openimaj.image.colour.ColourSpace;
042import org.openimaj.image.renderer.MBFImageRenderer;
043import org.openimaj.math.geometry.shape.Rectangle;
044import org.openimaj.web.ProgrammaticBrowser;
045import org.openimaj.web.readability.Readability;
046import org.w3c.dom.Element;
047import org.w3c.dom.Node;
048import org.w3c.dom.NodeList;
049
050import com.trolltech.qt.webkit.QWebElement;
051import com.trolltech.qt.webkit.QWebElementCollection;
052
053/**
054 * Class for extracting information on the layout of DOM elements in
055 * a web page.
056 * 
057 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
058 *
059 */
060public class LayoutExtractor {
061        private static final String GEN_ID = "__openimaj_gen_id_";
062
063        private static final Logger logger = Logger.getLogger(LayoutExtractor.class);
064        
065        private ProgrammaticBrowser browser;
066        
067        private long timeout = 0;
068
069        /**
070         * Default constructor
071         */
072        public LayoutExtractor() {
073                browser = new ProgrammaticBrowser();
074        }
075        
076        /**
077         * Default constructor
078         * @param timeout 
079         */
080        public LayoutExtractor(long timeout) {
081                this();
082                this.timeout = timeout;
083        }
084
085        /**
086         * Load a web page from a URL
087         * @param url the url
088         * @return true if successful; false otherwise
089         */
090        public boolean load(String url) {
091                boolean ret;
092                try {
093                        ret = browser.load(url, timeout);
094                } catch (TimeoutException e) {
095                        return false;
096                }
097                                
098                if (ret) augmentDOM();
099                
100                return ret;
101        }
102        
103        /**
104         * Load a web page from a URL
105         * @param url the url
106         * @return true if successful; false otherwise
107         */
108        public boolean load(URL url) {
109                boolean ret;
110                try {
111                        ret = browser.load(url, timeout);
112                } catch (TimeoutException e) {
113                        return false;
114                }
115                                
116                if (ret) augmentDOM();
117                
118                return ret;
119        }
120        
121        /**
122         * Load a web page from an HTML string
123         * @param html the HTML string
124         * @return true if successful; false otherwise
125         */
126        public boolean loadHTML(String html) {
127                boolean ret = browser.loadHTML(html);
128                                
129                if (ret) augmentDOM();
130                
131                return ret;
132        }
133        
134        private void augmentDOM() {
135                QWebElement body = getBody();
136                
137                if (body == null) {
138                        logger.warn("body not found");
139                        return;
140                }
141
142                QWebElementCollection nl = body.findAll("*");
143                for (int i=0; i<nl.count(); i++) {
144                        QWebElement ei = nl.at(i);
145                        
146                        if (ei.attribute("id") == null || ei.attribute("id").equals("")) {
147                                ei.setAttribute("id", GEN_ID+i);
148                        }
149                }
150        }
151
152        /**
153         * Get the layout info of the page
154         * 
155         * @return information about the page layout
156         */
157        public List<ElementInfo> getLayoutInfo() {
158                List<ElementInfo> info = new ArrayList<ElementInfo>();
159                
160                Set<String> contentIds = getContentIds();
161                
162                QWebElementCollection elements = browser.findAllElements("*");
163                
164                for (int i=0; i<elements.count(); i++) {
165                        ElementInfo ei = new ElementInfo();
166                        
167                        ei.element = elements.at(i);
168                        
169                        ei.bounds = new Rectangle(
170                                        ei.element.geometry().left(),
171                                        ei.element.geometry().top(),
172                                        ei.element.geometry().width(),
173                                        ei.element.geometry().height()
174                                        );
175                        
176                        if (contentIds.contains(ei.element.attribute("id"))) {
177                                ei.isContent = true;
178                        }
179                        
180                        QWebElement parent = ei.element;
181                        while (!(parent = parent.parent()).isNull()) {
182                                String id = parent.attribute("id");
183                                
184                                if (contentIds.contains(id)) {
185                                        ei.isInsideContent = true;
186                                        break;
187                                }
188                        }
189                        
190                        info.add(ei);
191                }
192
193                return info;
194        }
195
196        /**
197         * Render ALL the content boxes to a new image in the given color
198         * @param color Color 
199         * @return new image illustrating ALL content boxes
200         */
201        public MBFImage renderLayoutInfo(Float[] color) {
202                int w = browser.getWidth();
203                int h = browser.getHeight();
204                
205                //Pixel p = LayoutUtils.renderSize(page.mainFrame());
206                MBFImage image = new MBFImage(w, h, ColourSpace.RGB);
207                return renderLayoutInfo(image, color);
208        }
209
210        /**
211         * Render ALL the content boxes to the given image in the given color
212         * @param image Image to draw on top of
213         * @param colour Color
214         * @return the rendered image
215         */
216        public MBFImage renderLayoutInfo(MBFImage image, Float[] colour) {
217                MBFImageRenderer renderer = image.createRenderer();
218                
219                for (ElementInfo e : getLayoutInfo()) {
220                        Rectangle r = e.getBounds();
221                        renderer.drawShape(r, colour);
222                }
223                
224                return image;
225        }
226
227        /**
228         * Get the BODY element of the loaded page
229         * @return body element or null if it doesn't exist
230         */
231        public QWebElement getBody() {
232                return browser.getBody();
233        }
234        
235        protected String nodeToString(QWebElement n, boolean pretty) {
236                return n.toOuterXml();
237        }
238
239        private Set<String> getContentIds() {
240                Set<String> ids = new HashSet<String>();
241                try {
242                        String html = browser.getHTML();
243                        
244                        Readability r = Readability.getReadability(html);
245
246                        Element d = (Element) r.getArticleHTML_DOM();
247                        if (d==null) return ids;
248                        NodeList nl = d.getElementsByTagName("*");
249
250                        for (int i=0; i<nl.getLength(); i++) {
251                                Node idnode = nl.item(i).getAttributes().getNamedItem("id");
252                                if (idnode != null) {
253                                        ids.add(idnode.getNodeValue());
254                                }
255                        }
256                } catch (Exception e) {
257                        logger.error("Error finding content ids: " + e);
258                }
259
260                return ids;
261        }
262
263        /**
264         * Render the layout of the content.
265         * @param contentColour Colour for content
266         * @param nonContent Colour for non-content
267         * @param nonContentInside Colour for non-content inside content
268         * @return rendered image with boxes
269         */
270        public MBFImage renderContentLayout(Float[] contentColour, Float [] nonContent, Float [] nonContentInside) {
271                int w = browser.getWidth();
272                int h = browser.getHeight();
273                
274                MBFImage image = new MBFImage(w, h, ColourSpace.RGB);
275                return renderContentLayout(image, contentColour, nonContent, nonContentInside);
276        }
277        
278        /**
279         * Render the layout of the content.
280         * @param image image to draw into
281         * @param contentColour Colour for content
282         * @param nonContent Colour for non-content
283         * @param nonContentInside Colour for non-content inside content
284         * @return rendered image with boxes
285         */
286        public MBFImage renderContentLayout(MBFImage image, Float[] contentColour, Float [] nonContent, Float [] nonContentInside) {
287                List<Rectangle> content_areas = new ArrayList<Rectangle>();
288                List<Rectangle> non_content_areas = new ArrayList<Rectangle>();
289                List<Rectangle> non_content_areas_inside = new ArrayList<Rectangle>();
290                
291                for (ElementInfo ei : getLayoutInfo()) {
292                        if (ei.isContent) {
293                                content_areas.add(ei.bounds);
294                        } else if (ei.isInsideContent) {
295                                non_content_areas_inside.add(ei.bounds);
296                        } else {
297                                non_content_areas.add(ei.bounds);
298                        }
299                }
300
301                MBFImageRenderer renderer = image.createRenderer();
302                for (Rectangle r : content_areas) {
303                        renderer.drawShape(r, contentColour);
304                }
305
306                for (Rectangle r : non_content_areas_inside) {
307                        renderer.drawShape(r, nonContentInside);
308                }
309                
310                for (Rectangle r : non_content_areas) {
311                        renderer.drawShape(r, nonContent);
312                }
313
314                return image;
315        }
316        
317        /**
318         * Render the current page to an image
319         * @return an image of the current page, or null if there is no content
320         */
321        public MBFImage render() {
322                return browser.renderToImage();
323        }
324        
325        /**
326         * Render the current page to an image of the given size or smaller
327         * @param maxwidth 
328         * @param maxheight 
329         * @return an image of the current page, or null if there is no content
330         */
331        public MBFImage render(int maxwidth, int maxheight) {
332                return browser.renderToImage(maxwidth, maxheight);
333        }
334        
335        /**
336         * Run the browser for ms milliseconds. This
337         * allows it to update its content, etc.
338         * @param ms time to wait
339         */
340        public void waitForBrowser(long ms) {
341                browser.mainLoop(ms);
342        }
343}