001/** 002 * Copyright 2010 The University of Southampton, Yahoo Inc., and the 003 * individual contributors. All rights reserved. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.openimaj.web.readability; 018 019import java.io.IOException; 020import java.io.StringReader; 021import java.net.URL; 022import java.text.ParseException; 023import java.text.SimpleDateFormat; 024import java.util.ArrayList; 025import java.util.Date; 026import java.util.EnumSet; 027import java.util.List; 028import java.util.regex.Matcher; 029import java.util.regex.Pattern; 030 031import org.cyberneko.html.parsers.DOMFragmentParser; 032import org.cyberneko.html.parsers.DOMParser; 033import org.pojava.datetime.DateTime; 034import org.w3c.dom.DOMException; 035import org.w3c.dom.Document; 036import org.w3c.dom.DocumentFragment; 037import org.w3c.dom.Element; 038import org.w3c.dom.Node; 039import org.w3c.dom.NodeList; 040import org.w3c.dom.bootstrap.DOMImplementationRegistry; 041import org.w3c.dom.ls.DOMImplementationLS; 042import org.w3c.dom.ls.LSSerializer; 043import org.w3c.dom.traversal.DocumentTraversal; 044import org.w3c.dom.traversal.NodeFilter; 045import org.w3c.dom.traversal.TreeWalker; 046import org.xml.sax.InputSource; 047import org.xml.sax.SAXException; 048 049/** 050 * Class for extracting the "content" from web-pages, and ignoring adverts, etc. 
 * Based upon readability.js (http://lab.arc90.com/experiments/readability/) and
 * modified to behave better for certain sites (and typically better mimic
 * Safari Reader functionality).
 *
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 * @author Michael Matthews (mikemat@yahoo-inc.com)
 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
 */
public class Readability
{
	/**
	 * Regular expressions for different types of content. The patterns are
	 * kept as strings and compiled at their point of use.
	 */
	protected static class Regexps {

		// class/id fragments that usually indicate boilerplate rather than
		// article content ("caption" was considered but left out)
		public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner"; // caption?
		// fragments that rescue a node matched by unlikelyCandidatesRe
		public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
		// class/id fragments that raise a node's content score
		public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
		// class/id fragments that lower a node's content score
		public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
		// block-level tags; a div containing none of these is treated as a paragraph
		public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
		// two or more consecutive <br> tags (rewritten into paragraph breaks)
		public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
		// opening/closing <font> tags (rewritten into spans)
		public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
		// leading/trailing whitespace
		public static String trimRe = "^\\s+|\\s+$";
		// runs of whitespace to collapse
		public static String normalizeRe = "\\s{2,}";
		// one or more <br> tags, possibly separated by whitespace
		public static String killBreaksRe = "(<br\\s*\\/?>(\\s| ?)*){1,}";
		// URLs of embedded videos that should survive cleaning
		public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";

		// characters that commonly separate a site name from the page title
		public static String titleSeparatorRe = "\\|\\-\\/";

		// this is used to try and find elements that represent sub-headings
		// (that are not h1..h6)
		public static String likelySubheadCandidateRe = "(?i)cross-head";
	}

	/**
	 * Optional behaviours. Both are enabled by default; init() removes them
	 * one at a time and retries when extraction yields too little text.
	 */
	enum Flag {
		FLAG_STRIP_UNLIKELYS,
		FLAG_WEIGHT_CLASSES
	}

	/**
	 * Threshold for removing elements with lots of links
	 */
	public static float LINK_DENSITY_THRESHOLD = 0.33F;

	// IVARS below
	// the document being processed (mutated in place during extraction)
	protected Document document;
	// pristine copy of <body>, used to restart extraction with fewer flags
	private Node bodyCache;
	// active feature flags; all enabled initially
	protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);

	// extraction results
	protected String articleTitle;
	protected Element articleContent;
	protected String article_date_string;
	protected Date article_date;
	protected String article_contentType;

	// when true, diagnostics are written to stderr via dbg()
	protected boolean debug = false;

	// when true, an <h1> with the article title is prepended to the content
	protected boolean addTitle = false;

	/**
	 * Construct with the given document. Debugging is disabled.
	 *
	 * @param document
	 *            The document.
	 */
	public Readability(Document document) {
		this(document, false);
	}

	/**
	 * Construct with the given document. The second argument can be used to
	 * enable debugging output.
	 *
	 * @param document
	 *            The document.
	 * @param debug
	 *            Enable debugging output.
	 */
	public Readability(Document document, boolean debug) {
		this(document, debug, false);
	}

	/**
	 * Construct with the given document. The second argument can be used to
	 * enable debugging output. The third option controls whether the title
	 * should be included in the output.
	 *
	 * @param document
	 *            The document.
	 * @param debug
	 *            Enable debugging output.
	 * @param addTitle
	 *            Add title to output.
	 */
	public Readability(Document document, boolean debug, boolean addTitle) {
		this.debug = debug;
		this.document = document;
		this.addTitle = addTitle;
		augmentDocument(document); // ensure every element has an id
		init(); // run the full extraction pipeline
	}

	/**
	 * Iterates through all the ELEMENT nodes in a document and gives them ids
	 * if they don't already have them.
156 * 157 * @param document 158 */ 159 public static void augmentDocument(Document document) { 160 final DocumentTraversal traversal = (DocumentTraversal) document; 161 162 final TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true); 163 164 traverseLevel(walker, 0); 165 } 166 167 private static int traverseLevel(TreeWalker walker, int counter) { 168 // describe current node: 169 final Node parend = walker.getCurrentNode(); 170 171 if (parend instanceof Element) { 172 if (((Element) parend).getAttribute("id").length() == 0) { 173 ((Element) parend).setAttribute("id", "gen-id-" + counter); 174 counter++; 175 } 176 } 177 178 // traverse children: 179 for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) { 180 counter = traverseLevel(walker, counter); 181 } 182 183 // return position to the current (level up): 184 walker.setCurrentNode(parend); 185 186 return counter; 187 } 188 189 protected void dbg(String s) { 190 if (debug) 191 System.err.println(s); 192 } 193 194 protected String getTitle() { 195 final NodeList l = document.getElementsByTagName("title"); 196 197 if (l.getLength() == 0) 198 return ""; 199 200 return l.item(0).getTextContent(); 201 } 202 203 /** 204 * Javascript-like String.match 205 * 206 * @param input 207 * @param regex 208 * @return 209 */ 210 protected String[] match(String input, String regex) { 211 final Matcher matcher = Pattern.compile(regex).matcher(input); 212 final List<String> matches = new ArrayList<String>(); 213 214 while (matcher.find()) { 215 matches.add(matcher.group(0)); 216 } 217 218 return matches.toArray(new String[matches.size()]); 219 } 220 221 /** 222 * @return True if the article has any detected content; false otherwise. 
223 */ 224 public boolean hasContent() { 225 return articleContent != null; 226 } 227 228 /** 229 * Javascript-like String.search 230 * 231 * @param input 232 * @param regex 233 * @return 234 */ 235 protected int search(String input, String regex) { 236 final Matcher matcher = Pattern.compile(regex).matcher(input); 237 238 if (!matcher.find()) 239 return -1; 240 return matcher.start(); 241 } 242 243 protected void findArticleEncoding() { 244 final NodeList nl = document.getElementsByTagName("meta"); 245 for (int j = 0; j < nl.getLength(); j++) { 246 if (((Element) nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) { 247 article_contentType = ((Element) nl.item(j)).getAttribute("content"); 248 return; 249 } 250 } 251 252 } 253 254 protected void findArticleDate() { 255 // <meta name="OriginalPublicationDate" content="2010/07/12 14:08:02"/> 256 // <meta name="DC.date.issued" content="2010-07-12"> 257 NodeList nl = document.getElementsByTagName("meta"); 258 for (int j = 0; j < nl.getLength(); j++) { 259 if (((Element) nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) { 260 article_date_string = ((Element) nl.item(j)).getAttribute("content"); 261 article_date = DateTime.parse(article_date_string).toDate(); 262 return; 263 } 264 if (((Element) nl.item(j)).getAttribute("name").equals("DC.date.issued")) { 265 article_date_string = ((Element) nl.item(j)).getAttribute("content"); 266 article_date = DateTime.parse(article_date_string).toDate(); 267 return; 268 } 269 } 270 271 // <time datetime="2010-07-12T10:26BST" pubdate>Monday 12 July 2010 272 // 10.26 BST</time> 273 nl = document.getElementsByTagName("time"); 274 for (int j = 0; j < nl.getLength(); j++) { 275 if (((Element) nl.item(j)).getAttributeNode("pubdate") != null) { 276 article_date_string = ((Element) nl.item(j)).getAttribute("datetime"); 277 article_date = DateTime.parse(article_date_string).toDate(); 278 return; 279 } 280 } 281 282 // <span class="date">14:08 GMT, Monday, 12 July 
2010 15:08 UK</span> 283 // <p class="date">09.07.2010 @ 17:49 CET</p> 284 // <p class="date">Today @ 09:29 CET</p> 285 nl = document.getElementsByTagName("*"); 286 for (int j = 0; j < nl.getLength(); j++) { 287 if ((((Element) nl.item(j)).getAttribute("class").contains("date") || 288 ((Element) nl.item(j)).getAttribute("class").contains("Date")) && 289 !(((Element) nl.item(j)).getAttribute("class").contains("update") || 290 ((Element) nl.item(j)).getAttribute("class").contains("Update"))) 291 { 292 article_date_string = getInnerTextSep(nl.item(j)).trim(); 293 parseDate(); 294 return; 295 } 296 } 297 for (int j = 0; j < nl.getLength(); j++) { 298 if ((((Element) nl.item(j)).getAttribute("id").contains("date") || 299 ((Element) nl.item(j)).getAttribute("id").contains("Date")) && 300 !(((Element) nl.item(j)).getAttribute("id").contains("update") || 301 ((Element) nl.item(j)).getAttribute("id").contains("Update"))) 302 { 303 article_date_string = getInnerTextSep(nl.item(j)).trim(); 304 parseDate(); 305 return; 306 } 307 } 308 309 // Last updated at 3:05 PM on 12th July 2010 310 nl = document.getElementsByTagName("*"); 311 for (int j = 0; j < nl.getLength(); j++) { 312 final String text = nl.item(j).getTextContent(); 313 314 if (text == null) 315 continue; 316 317 final Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)"); 318 final Matcher m = p.matcher(text); 319 if (m.find()) { 320 article_date_string = m.group(1); 321 322 String cpy = article_date_string.replaceAll("th", ""); 323 cpy = cpy.replaceAll("st", ""); 324 cpy = cpy.replaceAll("nd", ""); 325 cpy = cpy.replaceAll("rd", ""); 326 327 final SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy"); 328 try { 329 article_date = sdf.parse(cpy); 330 } catch (final ParseException e) { 331 } 332 return; 333 } 334 } 335 } 336 337 @SuppressWarnings("deprecation") 338 protected void parseDate() { 339 if (article_date_string == null || 
article_date_string.trim().isEmpty()) 340 return; 341 342 if (article_date_string.contains("Today")) { 343 try { 344 final SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z"); 345 article_date = sdf.parse(article_date_string); 346 final Date now = new Date(); 347 article_date.setDate(now.getDate()); 348 article_date.setMonth(now.getMonth()); 349 article_date.setYear(now.getYear()); 350 } catch (final ParseException e) { 351 } 352 } else { 353 try { 354 final SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy"); 355 article_date = sdf.parse(article_date_string); 356 } catch (final ParseException e) { 357 try { 358 final SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z"); 359 article_date = sdf.parse(article_date_string); 360 } catch (final ParseException ee) { 361 try { 362 final SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy"); 363 article_date = sdf.parse(article_date_string); 364 } catch (final ParseException eee) { 365 try { 366 article_date = DateTime.parse(article_date_string).toDate(); 367 } catch (final IllegalArgumentException ie) { 368 } catch (final java.lang.ArrayIndexOutOfBoundsException ie) { 369 System.out.println(article_date_string); 370 } 371 } 372 } 373 } 374 } 375 } 376 377 /** 378 * Get the article title. 
	 *
	 * @return the best guess at the article's title (never null)
	 **/
	protected String findArticleTitle() {
		String curTitle = "", origTitle = "";

		curTitle = origTitle = getTitle();

		// First preference: the longest h1..h6 heading whose text appears
		// verbatim inside the document title.
		final List<String> potentialTitles = new ArrayList<String>();
		for (int i = 1; i <= 6; i++) {
			final NodeList nl = document.getElementsByTagName("h" + i);
			if (nl.getLength() > 0) {
				for (int j = 0; j < nl.getLength(); j++)
					potentialTitles.add(nl.item(j).getTextContent().trim());
			}
		}

		String potentialTitle = null;
		int score = 0;
		for (final String s : potentialTitles) {
			if (s.length() > score && curTitle.contains(s)) {
				potentialTitle = s;
				score = s.length();
			}
		}
		if (potentialTitle != null)
			return potentialTitle;

		// Otherwise try stripping site names around common separators (| - /)
		if (match(curTitle, " [" + Regexps.titleSeparatorRe + "]+ ").length > 0)
		{
			// keep the part before the separator...
			curTitle = origTitle.replaceAll("(.*) [" + Regexps.titleSeparatorRe + "]+ .*", "$1");

			// ...unless that leaves fewer than three words, in which case
			// keep the part after it instead
			if (curTitle.split(" ").length < 3) {
				curTitle = origTitle.replaceAll("(?i)[^" + Regexps.titleSeparatorRe + "]*[" + Regexps.titleSeparatorRe
						+ "]+(.*)", "$1");
			}
		}
		else if (curTitle.indexOf(": ") != -1)
		{
			// "Site: article" style title: keep the part after the colon
			curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");

			if (curTitle.split(" ").length < 3) {
				curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
			}
		}
		else if (curTitle.length() > 150 || curTitle.length() < 15)
		{
			// Suspiciously long/short <title>: fall back to a lone h1
			final NodeList hOnes = document.getElementsByTagName("h1");
			if (hOnes.getLength() == 1)
			{
				curTitle = getInnerText((Element) hOnes.item(0));
			}
		}

		curTitle = curTitle.replaceAll(Regexps.trimRe, "");

		// A very short result is probably wrong; revert to the original title
		if (curTitle.split(" ").length <= 3) {
			curTitle = origTitle;
		}

		return curTitle;
	}

	/**
	 * Equivalent to document.body in JS
	 *
	 * @return the document's body element, or null if it has none
	 */
	protected Element getBody() {
		final NodeList nl = document.getElementsByTagName("body");

		if (nl.getLength() == 0)
			return null;
		else
			return (Element) nl.item(0);
	}

	/**
	 * Runs readability.
	 *
	 * Workflow: 1. Prep the document by removing script tags, css, etc. 2.
	 * Build readability"s DOM tree. 3. Grab the article content from the
	 * current dom tree. 4. Replace the current DOM tree with the new one. 5.
	 * Read peacefully.
	 *
	 **/
	protected void init() {
		// Keep a pristine copy of the body so extraction can be re-run with
		// fewer flags if the first pass strips too much content.
		if (getBody() != null && bodyCache == null) {
			bodyCache = getBody().cloneNode(true);
		}

		findArticleDate(); // must be done before prepDocument()

		findArticleEncoding();

		prepDocument();

		/* Build readability"s DOM tree */
		articleTitle = findArticleTitle();
		articleContent = grabArticle();

		/**
		 * If we attempted to strip unlikely candidates on the first run
		 * through, and we ended up with no content, that may mean we stripped
		 * out the actual content so we couldn"t parse it. So re-run init while
		 * preserving unlikely candidates to have a better shot at getting our
		 * content out properly.
		 **/
		if (getInnerText(articleContent, false).length() < 250)
		{
			if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
				// retry from the cached body without the strip-unlikelys pass
				flags.remove(Flag.FLAG_STRIP_UNLIKELYS);
				getBody().getParentNode().replaceChild(bodyCache, getBody());
				init();
				return;
			}
			else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
				// retry from the cached body without class-based weighting
				flags.remove(Flag.FLAG_WEIGHT_CLASSES);
				getBody().getParentNode().replaceChild(bodyCache, getBody());
				init();
				return;
			}
			else {
				// both relaxations already tried: report no content
				articleContent = null;
			}
		}

		if (addTitle && articleContent != null) {
			// Prepend an <h1 id="title"> containing the extracted title.
			final Element titleNode = document.createElement("h1");
			titleNode.setAttribute("id", "title");
			titleNode.appendChild(document.createTextNode(getArticleTitle()));
			articleContent.insertBefore(titleNode, articleContent.getFirstChild());
		}
	}

	/**
	 * Prepare the HTML document for readability to scrape it.
	 * This includes
	 * things like stripping javascript, CSS, and handling terrible markup.
	 *
	 **/
	protected void prepDocument() {
		/**
		 * In some cases a body element can"t be found (if the HTML is totally
		 * hosed for example) so we create a new body node and append it to the
		 * document.
		 */
		if (getBody() == null)
		{
			final Node body = document.createElement("body");
			document.appendChild(body);
		}

		// Note: <frame>/<frameset> handling from the original readability.js
		// is not supported in this version.

		/* remove all scripts that are not readability */
		final NodeList scripts = document.getElementsByTagName("script");
		for (int i = scripts.getLength() - 1; i >= 0; i--)
		{
			scripts.item(i).getParentNode().removeChild(scripts.item(i));
		}

		/* Remove all style tags in head */
		final NodeList styleTags = document.getElementsByTagName("style");
		for (int st = styleTags.getLength() - 1; st >= 0; st--) {
			styleTags.item(st).getParentNode().removeChild(styleTags.item(st));
		}

		/* Remove all meta tags */
		final NodeList metaTags = document.getElementsByTagName("meta");
		for (int mt = metaTags.getLength() - 1; mt >= 0; mt--) {
			metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt));
		}

		/*
		 * Turn all double br's into p's. This serialises the body markup,
		 * rewrites it and re-parses it, which is pretty costly as far as
		 * processing goes. Maybe optimize later.
		 */
		final Element body = getBody();

		// This is slow!
		final Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(
				Regexps.replaceFontsRe, "<$1span>"));
		removeChildren(body);
		body.appendChild(frag);

		/* Remove all comments */
		removeComments(document);
	}

	/**
	 * Recursively remove all comment nodes from the subtree rooted at n.
	 *
	 * @param n
	 *            the root of the subtree to process
	 */
	protected void removeComments(Node n) {
		if (n.getNodeType() == Node.COMMENT_NODE) {
			n.getParentNode().removeChild(n);
		} else {
			// Iterate backwards so removals don't disturb the traversal of
			// this live NodeList.
			final NodeList nl = n.getChildNodes();
			for (int i = nl.getLength() - 1; i >= 0; i--)
				removeComments(nl.item(i));
		}
	}

	/**
	 * Prepare the article node for display. Clean out any inline styles,
	 * iframes, forms, strip extraneous
	 * <p>
	 * tags, etc.
633 * 634 * @param Element 635 **/ 636 protected void prepArticle(Element articleContent) { 637 cleanStyles(articleContent); 638 killBreaks(articleContent); 639 640 /* Clean out junk from the article content */ 641 clean(articleContent, "form"); 642 clean(articleContent, "object"); 643 clean(articleContent, "h1"); 644 /** 645 * If there is only one h2, they are probably using it as a header and 646 * not a subheader, so remove it since we already have a header. 647 ***/ 648 if (articleContent.getElementsByTagName("h2").getLength() == 1) { 649 clean(articleContent, "h2"); 650 } 651 clean(articleContent, "iframe"); 652 653 cleanHeaders(articleContent); 654 655 /* 656 * Do these last as the previous stuff may have removed junk that will 657 * affect these 658 */ 659 cleanConditionally(articleContent, "table"); 660 cleanConditionally(articleContent, "ul"); 661 cleanConditionally(articleContent, "div"); 662 663 /* Remove extra paragraphs */ 664 final NodeList articleParagraphs = articleContent.getElementsByTagName("p"); 665 for (int i = articleParagraphs.getLength() - 1; i >= 0; i--) 666 { 667 final int imgCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength(); 668 final int embedCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength(); 669 final int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength(); 670 671 if (imgCount == 0 && embedCount == 0 && objectCount == 0 672 && getInnerText((Element) articleParagraphs.item(i), false) == "") 673 { 674 articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i)); 675 } 676 } 677 678 // articleContent.innerHTML = 679 // articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, "<p"); 680 final Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P")); 681 removeChildren(articleContent); 682 articleContent.appendChild(n); 683 684 // now remove empty p's and tidy up 685 final 
NodeList nl = articleContent.getElementsByTagName("p"); 686 for (int i = nl.getLength() - 1; i >= 0; i--) { 687 if (nl.item(i).getTextContent().trim().length() == 0) 688 { 689 nl.item(i).getParentNode().removeChild(nl.item(i)); 690 } else if (nl.item(i).getChildNodes().getLength() == 1 691 && nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE) 692 { 693 nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n"); 694 } 695 else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled")) 696 { 697 nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i)); 698 } 699 } 700 701 } 702 703 protected void removeChildren(Node n) { 704 final NodeList nl = n.getChildNodes(); 705 final int nn = nl.getLength(); 706 for (int i = 0; i < nn; i++) 707 n.removeChild(nl.item(0)); 708 } 709 710 /** 711 * Initialize a node with the readability object. Also checks the 712 * className/id for special names to add to its score. 
713 * 714 * @param Element 715 **/ 716 protected void initializeNode(Element node) { 717 float contentScore = 0; 718 719 if (node.getTagName() == "DIV") { 720 contentScore += 5; 721 } else if (node.getTagName() == "PRE" || node.getTagName() == "TD" || node.getTagName() == "BLOCKQUOTE") { 722 contentScore += 3; 723 } else if (node.getTagName() == "ADDRESS" || node.getTagName() == "OL" || node.getTagName() == "UL" 724 || node.getTagName() == "DL" || node.getTagName() == "DD" || node.getTagName() == "DT" 725 || node.getTagName() == "LI" || node.getTagName() == "FORM") 726 { 727 contentScore -= 3; 728 } else if (node.getTagName() == "H1" || node.getTagName() == "H2" || node.getTagName() == "H3" 729 || node.getTagName() == "H4" || node.getTagName() == "H5" || node.getTagName() == "H6" 730 || node.getTagName() == "TH") 731 { 732 contentScore -= 5; 733 } 734 735 contentScore += getClassWeight(node); 736 node.setUserData("readability", contentScore, null); 737 } 738 739 /** 740 * Get an elements class/id weight. Uses regular expressions to tell if this 741 * element looks good or bad. 
742 * 743 * @param Element 744 * @return number (Integer) 745 **/ 746 protected int getClassWeight(Element e) { 747 if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) { 748 return 0; 749 } 750 751 int weight = 0; 752 753 /* Look for a special classname */ 754 if (e.getAttribute("class") != "") 755 { 756 if (search(e.getAttribute("class"), Regexps.negativeRe) != -1) { 757 weight -= 25; 758 } 759 760 if (search(e.getAttribute("class"), Regexps.positiveRe) != -1) { 761 weight += 25; 762 } 763 } 764 765 /* Look for a special ID */ 766 if (e.getAttribute("id") != "") 767 { 768 if (search(e.getAttribute("id"), Regexps.negativeRe) != -1) { 769 weight -= 25; 770 } 771 772 if (search(e.getAttribute("id"), Regexps.positiveRe) != -1) { 773 weight += 25; 774 } 775 } 776 777 return weight; 778 } 779 780 protected void cleanStyles() { 781 cleanStyles((Element) document); 782 } 783 784 /** 785 * Remove the style attribute on every e and under. TODO: Test if 786 * getElementsByTagName(*) is faster. 787 * 788 * @param Element 789 **/ 790 protected void cleanStyles(Element e) { 791 if (e == null) 792 return; 793 Node cur = e.getFirstChild(); 794 795 // Remove any root styles, if we"re able. 796 if (!e.getAttribute("class").equals("readability-styled")) 797 e.removeAttribute("style"); 798 799 // Go until there are no more child nodes 800 while (cur != null) { 801 if (cur.getNodeType() == Element.ELEMENT_NODE) { 802 // Remove style attribute(s) : 803 if (!((Element) cur).getAttribute("class").equals("readability-styled")) { 804 ((Element) cur).removeAttribute("style"); 805 } 806 cleanStyles((Element) cur); 807 } 808 cur = cur.getNextSibling(); 809 } 810 } 811 812 /** 813 * Remove extraneous break tags from a node. 
814 * 815 * @param Element 816 **/ 817 protected void killBreaks(Element e) { 818 // e.innerHTML = 819 // e.innerHTML.replace(readability.regexps.killBreaksRe,"<br />"); 820 821 final Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe, "<BR />")); 822 removeChildren(e); 823 e.appendChild(n); 824 } 825 826 /** 827 * Clean a node of all elements of type "tag". (Unless it"s a youtube/vimeo 828 * video. People love movies.) 829 * 830 * @param Element 831 * @param string 832 * tag to clean 833 **/ 834 protected void clean(Element e, String tag) { 835 final NodeList targetList = e.getElementsByTagName(tag); 836 final boolean isEmbed = (tag.equals("object") || tag.equals("embed")); 837 838 for (int y = targetList.getLength() - 1; y >= 0; y--) { 839 /* 840 * Allow youtube and vimeo videos through as people usually want to 841 * see those. 842 */ 843 if (isEmbed) { 844 String attributeValues = ""; 845 for (int i = 0, il = targetList.item(y).getAttributes().getLength(); i < il; i++) { 846 attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|"; 847 } 848 849 /* 850 * First, check the elements attributes to see if any of them 851 * contain youtube or vimeo 852 */ 853 if (search(attributeValues, Regexps.videoRe) != -1) { 854 continue; 855 } 856 857 /* Then check the elements inside this element for the same. */ 858 if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) { 859 continue; 860 } 861 } 862 863 targetList.item(y).getParentNode().removeChild(targetList.item(y)); 864 } 865 } 866 867 /** 868 * Clean out spurious headers from an Element. Checks things like classnames 869 * and link density. 
870 * 871 * @param Element 872 **/ 873 protected void cleanHeaders(Element e) { 874 for (int headerIndex = 1; headerIndex < 7; headerIndex++) { 875 final NodeList headers = e.getElementsByTagName("h" + headerIndex); 876 for (int i = headers.getLength() - 1; i >= 0; i--) { 877 if (getClassWeight((Element) headers.item(i)) < 0 878 || getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD) 879 { 880 headers.item(i).getParentNode().removeChild(headers.item(i)); 881 } 882 } 883 } 884 } 885 886 /** 887 * Get the density of links as a percentage of the content This is the 888 * amount of text that is inside a link divided by the total text in the 889 * node. 890 * 891 * @param Element 892 * @return number (float) 893 **/ 894 protected float getLinkDensity(Element e) { 895 final NodeList links = e.getElementsByTagName("a"); 896 final int textLength = getInnerText(e).length(); 897 int linkLength = 0; 898 899 for (int i = 0, il = links.getLength(); i < il; i++) 900 { 901 linkLength += getInnerText((Element) links.item(i)).length(); 902 } 903 904 if (linkLength == 0) 905 return 0; 906 907 return (float) linkLength / (float) textLength; 908 } 909 910 /** 911 * Clean an element of all tags of type "tag" if they look fishy. "Fishy" is 912 * an algorithm based on content length, classnames, link density, number of 913 * images & embeds, etc. 914 **/ 915 protected void cleanConditionally(Element e, String tag) { 916 final NodeList tagsList = e.getElementsByTagName(tag); 917 final int curTagsLength = tagsList.getLength(); 918 919 /** 920 * Gather counts for other typical elements embedded within. Traverse 921 * backwards so we can remove nodes at the same time without effecting 922 * the traversal. 923 * 924 * Todo: Consider taking into account original contentScore here. 
925 **/ 926 for (int i = curTagsLength - 1; i >= 0; i--) { 927 final int weight = getClassWeight((Element) tagsList.item(i)); 928 final float contentScore = (tagsList.item(i).getUserData("readability") != null) ? (Float) (tagsList.item(i) 929 .getUserData("readability")) : 0; 930 931 dbg("Cleaning Conditionally " 932 + tagsList.item(i) 933 + " (" 934 + ((Element) tagsList.item(i)).getAttribute("class") 935 + ":" 936 + ((Element) tagsList.item(i)).getAttribute("id") 937 + ")" 938 + ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i) 939 .getUserData("readability")) : "")); 940 941 if (weight + contentScore < 0) 942 { 943 dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":" 944 + ((Element) tagsList.item(i)).getAttribute("id") + ")"); 945 tagsList.item(i).getParentNode().removeChild(tagsList.item(i)); 946 } 947 else if (getCharCount((Element) tagsList.item(i), ",") < 10) { 948 /** 949 * If there are not very many commas, and the number of 950 * non-paragraph elements is more than paragraphs or other 951 * ominous signs, remove the element. 
952 **/ 953 final int p = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength(); 954 final int img = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength(); 955 final int li = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength() - 100; 956 final int input = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength(); 957 958 int embedCount = 0; 959 final NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed"); 960 for (int ei = 0, il = embeds.getLength(); ei < il; ei++) { 961 if (search(((Element) embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) { 962 embedCount++; 963 } 964 } 965 966 final float linkDensity = getLinkDensity((Element) tagsList.item(i)); 967 final int contentLength = getInnerText((Element) tagsList.item(i)).length(); 968 boolean toRemove = false; 969 970 if (img > p) { 971 toRemove = true; 972 } else if (li > p && tag != "ul" && tag != "ol") { 973 toRemove = true; 974 } else if (input > Math.floor(p / 3)) { 975 toRemove = true; 976 } else if (contentLength < 25 && (img == 0 || img > 2)) { 977 toRemove = true; 978 } else if (weight < 25 && linkDensity > 0.2) { 979 toRemove = true; 980 } else if (weight >= 25 && linkDensity > 0.5) { 981 toRemove = true; 982 } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) { 983 toRemove = true; 984 } 985 986 if (img == 1 && p == 0 && contentLength == 0) { 987 final Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0); 988 989 String w = ""; 990 if (theImg.getAttribute("width") != null) 991 w = theImg.getAttribute("width"); 992 993 String h = ""; 994 if (theImg.getAttribute("height") != null) 995 h = theImg.getAttribute("height"); 996 997 if (!(w.equals("0") || h.equals("0"))) 998 toRemove = false; // special case - it's just an inline 999 // image 1000 } 1001 1002 if (toRemove) { 1003 dbg("Removing " + tagsList.item(i) + " (" + ((Element) 
tagsList.item(i)).getAttribute("class") + ":" 1004 + ((Element) tagsList.item(i)).getAttribute("id") + ")"); 1005 tagsList.item(i).getParentNode().removeChild(tagsList.item(i)); 1006 } 1007 } 1008 } 1009 } 1010 1011 /** 1012 * Get the number of times a string s appears in the node e. 1013 * 1014 * @param Element 1015 * @param string 1016 * - what to split on. Default is "," 1017 * @return number (integer) 1018 **/ 1019 protected int getCharCount(Element e, String s) { 1020 return getInnerText(e).split(s).length - 1; 1021 } 1022 1023 protected int getCharCount(Element e) { 1024 return getCharCount(e, ","); 1025 } 1026 1027 /** 1028 * @return The article title 1029 */ 1030 public String getArticleTitle() { 1031 return articleTitle; 1032 } 1033 1034 /** 1035 * @return The content type of the article 1036 */ 1037 public String getArticleContentType() { 1038 return article_contentType; 1039 } 1040 1041 /*** 1042 * grabArticle - Using a variety of metrics (content score, classname, 1043 * element types), find the content that is most likely to be the stuff a 1044 * user wants to read. Then return it wrapped up in a div. 1045 * 1046 * @return Element 1047 **/ 1048 protected Element grabArticle() { 1049 final boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS); 1050 1051 /** 1052 * First, node prepping. Trash nodes that look cruddy (like ones with 1053 * the class name "comment", etc), and turn divs into P tags where they 1054 * have been used inappropriately (as in, where they contain no other 1055 * block level elements.) 1056 * 1057 * Note: Assignment from index for performance. See 1058 * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 Todo: 1059 * Shouldn't this be a reverse traversal? 
1060 **/ 1061 Element node = null; 1062 final List<Element> nodesToScore = new ArrayList<Element>(); 1063 for (int nodeIndex = 0; (node = (Element) document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++) 1064 { 1065 /* Remove unlikely candidates */ 1066 if (stripUnlikelyCandidates) { 1067 final String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id"); 1068 if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 && 1069 search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 && 1070 !node.getTagName().equals("BODY")) 1071 { 1072 dbg("Removing unlikely candidate - " + unlikelyMatchString); 1073 node.getParentNode().removeChild(node); 1074 nodeIndex--; 1075 continue; 1076 } 1077 } 1078 1079 if (node.getTagName().equals("P") || node.getTagName().equals("TD")) { 1080 nodesToScore.add(node); 1081 } 1082 1083 /* 1084 * Turn all divs that don't have children block level elements into 1085 * p's 1086 */ 1087 if (node.getTagName().equals("DIV")) { 1088 1089 if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) { 1090 dbg("Altering div to p"); 1091 final Element newNode = document.createElement("P"); 1092 1093 // newNode.innerHTML = node.innerHTML; 1094 final NodeList nl = node.getChildNodes(); 1095 for (int i = 0; i < nl.getLength(); i++) 1096 newNode.appendChild(nl.item(i)); 1097 1098 node.getParentNode().replaceChild(newNode, node); 1099 nodeIndex--; 1100 } 1101 else 1102 { 1103 /* EXPERIMENTAL */ 1104 for (int i = 0, il = node.getChildNodes().getLength(); i < il; i++) { 1105 final Node childNode = node.getChildNodes().item(i); 1106 if (childNode.getNodeType() == Element.TEXT_NODE) { 1107 dbg("replacing text node with a p tag with the same content."); 1108 final Element p = document.createElement("p"); 1109 // p.innerHTML = childNode.nodeValue; 1110 p.setNodeValue(childNode.getNodeValue()); 1111 p.setTextContent(childNode.getTextContent()); 1112 // p.style.display = "inline"; 1113 
p.setAttribute("class", "readability-styled"); 1114 childNode.getParentNode().replaceChild(p, childNode); 1115 } 1116 } 1117 } 1118 } 1119 } 1120 1121 /** 1122 * Loop through all paragraphs, and assign a score to them based on how 1123 * content-y they look. Then add their score to their parent node. 1124 * 1125 * A score is determined by things like number of commas, class names, 1126 * etc. Maybe eventually link density. 1127 **/ 1128 final List<Element> candidates = new ArrayList<Element>(); 1129 for (int pt = 0; pt < nodesToScore.size(); pt++) { 1130 final Element parentNode = (Element) nodesToScore.get(pt).getParentNode(); 1131 final Element grandParentNode = (Element) parentNode.getParentNode(); 1132 final String innerText = getInnerText(nodesToScore.get(pt)); 1133 1134 /* 1135 * If this paragraph is less than 25 characters, don't even count 1136 * it. 1137 */ 1138 if (innerText.length() < 25) { 1139 continue; 1140 } 1141 1142 /* Initialize readability data for the parent. */ 1143 if (parentNode.getUserData("readability") == null) 1144 { 1145 initializeNode(parentNode); 1146 candidates.add(parentNode); 1147 } 1148 1149 /* Initialize readability data for the grandparent. */ 1150 if (grandParentNode.getUserData("readability") == null) 1151 { 1152 initializeNode(grandParentNode); 1153 candidates.add(grandParentNode); 1154 } 1155 1156 float contentScore = 0; 1157 1158 /* Add a point for the paragraph itself as a base. */ 1159 contentScore++; 1160 1161 /* Add points for any commas within this paragraph */ 1162 contentScore += innerText.split(",").length; 1163 1164 /* 1165 * For every 100 characters in this paragraph, add another point. Up 1166 * to 3 points. 1167 */ 1168 contentScore += Math.min(Math.floor(innerText.length() / 100F), 3F); 1169 1170 /* Add the score to the parent. The grandparent gets half. 
*/ 1171 parentNode.setUserData("readability", ((Float) (parentNode.getUserData("readability")) + contentScore), null); 1172 grandParentNode.setUserData("readability", ((Float) (grandParentNode.getUserData("readability"))) 1173 + (contentScore / 2F), null); 1174 } 1175 1176 /** 1177 * After we've calculated scores, loop through all of the possible 1178 * candidate nodes we found and find the one with the highest score. 1179 **/ 1180 Element topCandidate = null; 1181 for (int c = 0, cl = candidates.size(); c < cl; c++) 1182 { 1183 /** 1184 * Scale the final candidates score based on link density. Good 1185 * content should have a relatively small link density (5% or less) 1186 * and be mostly unaffected by this operation. 1187 **/ 1188 1189 candidates.get(c).setUserData("readability", 1190 (Float) (candidates.get(c).getUserData("readability")) * (1F - getLinkDensity(candidates.get(c))), 1191 null); 1192 1193 dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class") + ":" 1194 + candidates.get(c).getAttribute("id") + ") with score " 1195 + candidates.get(c).getUserData("readability")); 1196 1197 if (topCandidate == null 1198 || (Float) (candidates.get(c).getUserData("readability")) > ((Float) topCandidate 1199 .getUserData("readability"))) 1200 { 1201 topCandidate = candidates.get(c); 1202 } 1203 } 1204 1205 if (topCandidate != null) 1206 dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class") + ":" 1207 + topCandidate.getAttribute("id") + ") with score " + topCandidate.getUserData("readability")); 1208 1209 /** 1210 * If we still have no top candidate, just use the body as a last 1211 * resort. We also have to copy the body node so it is something we can 1212 * modify. 
1213 **/ 1214 if (topCandidate == null || topCandidate.getTagName().equals("BODY")) 1215 { 1216 topCandidate = document.createElement("DIV"); 1217 1218 // topCandidate.innerHTML = document.body.innerHTML; 1219 final NodeList nl = getBody().getChildNodes(); 1220 for (int i = 0; i < nl.getLength(); i++) 1221 topCandidate.appendChild(nl.item(i)); 1222 // document.body.innerHTML = ""; //should be covered by above 1223 1224 getBody().appendChild(topCandidate); 1225 initializeNode(topCandidate); 1226 } 1227 1228 /** 1229 * Now that we have the top candidate, look through its siblings for 1230 * content that might also be related. Things like preambles, content 1231 * split by ads that we removed, etc. 1232 **/ 1233 final Element articleContent = document.createElement("DIV"); 1234 articleContent.setAttribute("id", "readability-content"); 1235 final float siblingScoreThreshold = Math.max(10F, (Float) topCandidate.getUserData("readability") * 0.2F); 1236 final NodeList siblingNodes = topCandidate.getParentNode().getChildNodes(); 1237 1238 for (int s = 0, sl = siblingNodes.getLength(); s < sl; s++) 1239 { 1240 final Node siblingNode = siblingNodes.item(s); 1241 boolean append = false; 1242 1243 if (siblingNode instanceof Element) 1244 dbg("Looking at sibling node: " 1245 + siblingNode 1246 + " (" 1247 + ((Element) siblingNode).getAttribute("class") 1248 + ":" 1249 + ((Element) siblingNode).getAttribute("id") 1250 + ")" 1251 + ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode 1252 .getUserData("readability")) : "")); 1253 dbg("Sibling has score " 1254 + (siblingNode.getUserData("readability") != null ? 
siblingNode.getUserData("readability") 1255 : "Unknown")); 1256 1257 if (siblingNode == topCandidate) 1258 { 1259 append = true; 1260 } 1261 1262 float contentBonus = 0; 1263 /* 1264 * Give a bonus if sibling nodes and top candidates have the example 1265 * same classname 1266 */ 1267 if (siblingNode instanceof Element 1268 && ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class")) 1269 && !topCandidate.getAttribute("class").equals("")) 1270 { 1271 contentBonus += (Float) topCandidate.getUserData("readability") * 0.2F; 1272 } 1273 1274 if (siblingNode.getUserData("readability") != null 1275 && ((Float) siblingNode.getUserData("readability") + contentBonus) >= siblingScoreThreshold) 1276 { 1277 append = true; 1278 } 1279 1280 if (siblingNode.getNodeName().equals("P")) { 1281 final float linkDensity = getLinkDensity((Element) siblingNode); 1282 final String nodeContent = getInnerText((Element) siblingNode); 1283 final int nodeLength = nodeContent.length(); 1284 1285 if (nodeLength > 80 && linkDensity < 0.25) 1286 { 1287 append = true; 1288 } 1289 else if (nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1) 1290 { 1291 append = true; 1292 } 1293 } 1294 1295 if (append) 1296 { 1297 dbg("Appending node: " + siblingNode); 1298 1299 Node nodeToAppend = null; 1300 if (!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) { 1301 /* 1302 * We have a node that isn't a common block level element, 1303 * like a form or td tag. Turn it into a div so it doesn't 1304 * get filtered out later by accident. 
1305 */ 1306 1307 dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div."); 1308 nodeToAppend = document.createElement("div"); 1309 if (siblingNode instanceof Element) 1310 ((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id")); 1311 1312 // nodeToAppend.innerHTML = siblingNode.innerHTML; 1313 final NodeList nl = siblingNode.getChildNodes(); 1314 for (int i = 0; i < nl.getLength(); i++) 1315 nodeToAppend.appendChild(nl.item(i)); 1316 } else { 1317 nodeToAppend = siblingNode; 1318 s--; 1319 sl--; 1320 } 1321 1322 /* 1323 * To ensure a node does not interfere with readability styles, 1324 * remove its classnames 1325 */ 1326 if (nodeToAppend instanceof Element) 1327 ((Element) nodeToAppend).setAttribute("class", ""); 1328 1329 /* 1330 * Append sibling and subtract from our list because it removes 1331 * the node when you append to another node 1332 */ 1333 articleContent.appendChild(nodeToAppend); 1334 } 1335 } 1336 1337 /** 1338 * So we have all of the content that we need. Now we clean it up for 1339 * presentation. 
1340 **/ 1341 prepArticle(articleContent); 1342 1343 return articleContent; 1344 } 1345 1346 protected String getInnerHTML(Node n) { 1347 if (n.getNodeType() == Node.TEXT_NODE) 1348 return n.getTextContent(); 1349 1350 String result = ""; 1351 final NodeList nl = n.getChildNodes(); 1352 for (int i = 0; i < nl.getLength(); i++) { 1353 if (nl.item(i).getNodeType() == Node.TEXT_NODE) 1354 result += nl.item(i).getTextContent(); 1355 else if (nl.item(i).getNodeType() == Node.COMMENT_NODE) 1356 result += "<!-- " + nl.item(i).getTextContent() + " -->"; 1357 else 1358 result += nodeToString(nl.item(i)); 1359 } 1360 1361 return result; 1362 } 1363 1364 protected String nodeToString(Node n) { 1365 return nodeToString(n, false); 1366 } 1367 1368 protected static String nodeToString(Node n, boolean pretty) { 1369 try { 1370 final DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance(); 1371 final DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS"); 1372 final LSSerializer writer = impl.createLSSerializer(); 1373 1374 writer.getDomConfig().setParameter("xml-declaration", false); 1375 if (pretty) { 1376 writer.getDomConfig().setParameter("format-pretty-print", true); 1377 } 1378 1379 return writer.writeToString(n); 1380 } catch (final Exception e) { 1381 throw new RuntimeException(e); 1382 } 1383 } 1384 1385 protected Node stringToNode(String str) { 1386 try { 1387 final DOMFragmentParser parser = new DOMFragmentParser(); 1388 final DocumentFragment fragment = document.createDocumentFragment(); 1389 parser.parse(new InputSource(new StringReader(str)), fragment); 1390 return fragment; 1391 1392 // try and return the element itself if possible... 
		// NodeList nl = fragment.getChildNodes();
		// for (int i=0; i<nl.getLength(); i++) if (nl.item(i).getNodeType()
		// == Node.ELEMENT_NODE) return nl.item(i);
		// return fragment;

	} catch (final Exception e) {
		// parse failures are rethrown unchecked with the cause preserved
		throw new RuntimeException(e);
	}
}

/**
 * Get the inner text of a node - cross browser compatibly. This also strips
 * out any excess whitespace to be found.
 *
 * @param e
 *            the element whose text is wanted
 * @param normalizeSpaces
 *            if true, runs of whitespace are collapsed to a single space
 * @return string
 **/
protected String getInnerText(Element e, boolean normalizeSpaces) {
	String textContent = "";

	// trimRe strips leading/trailing whitespace from the node's text
	textContent = e.getTextContent().replaceAll(Regexps.trimRe, "");

	if (normalizeSpaces) {
		return textContent.replaceAll(Regexps.normalizeRe, " ");
	} else {
		return textContent;
	}
}

/**
 * Recursively concatenate the text of a node's descendants, appending a
 * space after each leaf and skipping the contents of script elements.
 */
protected String getInnerTextSep(Node e) {
	if (e.hasChildNodes()) {
		String s = "";
		final NodeList nl = e.getChildNodes();
		for (int i = 0; i < nl.getLength(); i++) {
			if (!nl.item(i).getNodeName().equalsIgnoreCase("script"))
				s += getInnerTextSep(nl.item(i));
		}
		return s;
	} else {
		return e.getTextContent() + " ";
	}
}

/** Convenience overload: inner text with whitespace normalisation. */
protected String getInnerText(Element e) {
	return getInnerText(e, true);
}

/**
 * @return The article HTML content as a {@link String}.
 */
public String getArticleHTML() {
	if (articleContent == null)
		return "";
	return nodeToString(articleContent, true);
}

/**
 * @return The articles HTML dom node.
 */
public Node getArticleHTML_DOM() {
	return articleContent;
}

// Raw date string as extracted from the page (set elsewhere in this class).
protected String getArticleDateString() {
	return article_date_string;
}

/**
 * @return The article date.
 */
public Date getArticleDate() {
	return article_date;
}

/**
 * @return The text of the article.
1469 */ 1470 public String getArticleText() { 1471 if (articleContent == null) 1472 return "Unable to find article content"; 1473 // return getInnerText(articleContent, false); 1474 return articleContent.getTextContent().trim().replaceAll("[\r|\n|\r\n]{2,}", "\n\n").replaceAll(" {2,}", " "); 1475 } 1476 1477 /** 1478 * @return Any links in the article. 1479 */ 1480 public List<Anchor> getArticleLinks() { 1481 final List<Anchor> anchors = new ArrayList<Anchor>(); 1482 if (articleContent == null) 1483 return anchors; 1484 1485 final NodeList nl = articleContent.getElementsByTagName("a"); 1486 for (int i = 0; i < nl.getLength(); i++) { 1487 final Element a = (Element) nl.item(i); 1488 1489 final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href")); 1490 anchors.add(anchor); 1491 } 1492 return anchors; 1493 } 1494 1495 /** 1496 * @return Any links in the document. 1497 */ 1498 public List<Anchor> getAllLinks() { 1499 final List<Anchor> anchors = new ArrayList<Anchor>(); 1500 1501 final NodeList nl = document.getElementsByTagName("a"); 1502 for (int i = 0; i < nl.getLength(); i++) { 1503 final Element a = (Element) nl.item(i); 1504 final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href")); 1505 anchors.add(anchor); 1506 } 1507 return anchors; 1508 } 1509 1510 /** 1511 * @return Any images in the article. 1512 */ 1513 public List<String> getArticleImages() { 1514 final List<String> images = new ArrayList<String>(); 1515 if (articleContent == null) 1516 return images; 1517 1518 final NodeList nl = articleContent.getElementsByTagName("img"); 1519 for (int i = 0; i < nl.getLength(); i++) { 1520 final Element img = (Element) nl.item(i); 1521 images.add(img.getAttribute("src")); 1522 } 1523 return images; 1524 } 1525 1526 /** 1527 * @return Any subheadings in the article. 
1528 */ 1529 public List<String> getArticleSubheadings() { 1530 final List<String> subtitles = new ArrayList<String>(); 1531 if (articleContent == null) 1532 return subtitles; 1533 1534 for (int j = 1; j <= 6; j++) { 1535 final NodeList nl = articleContent.getElementsByTagName("h" + j); 1536 if (nl.getLength() > 0) { 1537 for (int i = 0; i < nl.getLength(); i++) { 1538 subtitles.add(nl.item(i).getTextContent()); 1539 } 1540 break; 1541 } 1542 } 1543 1544 if (subtitles.size() == 0) { 1545 // try looking for other likely-looking elements 1546 1547 final NodeList nl = articleContent.getElementsByTagName("*"); 1548 for (int i = 0; i < nl.getLength(); i++) { 1549 if (nl.item(i) instanceof Element && 1550 ((Element) nl.item(i)).getAttribute("class") != null && 1551 search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1) 1552 subtitles.add(nl.item(i).getTextContent()); 1553 } 1554 } 1555 1556 return subtitles; 1557 } 1558 1559 protected List<Node> findChildNodesWithName(Node parent, String name) { 1560 final NodeList children = parent.getChildNodes(); 1561 final List<Node> results = new ArrayList<Node>(); 1562 1563 for (int i = 0; i < children.getLength(); ++i) { 1564 final Node child = children.item(i); 1565 if (child == null) 1566 continue; 1567 1568 final String nodeName = child.getNodeName(); 1569 if (nodeName == null) 1570 continue; 1571 1572 if (nodeName.equals(name)) { 1573 results.add(child); 1574 } 1575 } 1576 return results; 1577 } 1578 1579 protected int findChildNodeIndex(Node parent, Node childToFind) 1580 { 1581 for (int index = 0; index < parent.getChildNodes().getLength(); index++) 1582 if (parent.getChildNodes().item(index) == childToFind) 1583 return index; 1584 return -1; 1585 } 1586 1587 protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException { 1588 final Node parend = walker.getCurrentNode(); 1589 1590 if (parend.getNodeType() == Node.TEXT_NODE && 
			parend.getParentNode().getAttributes().getNamedItem("id") != null)
	{
		if (parend.getTextContent().trim().length() > 0)
		{
			final int index = findChildNodeIndex(parend.getParentNode(), parend);
			if (index != -1)
			{
				// square brackets are not valid XML/HTML identifier
				// characters, so we can use them here
				map.add(new MappingNode(
						parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "[" + index + "]",
						parend.getNodeValue()));

				// System.out.println(
				// "ELEMENT '"+parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue()
				// + "["+index+"]"+"'");
				// System.out.println( "VALUE: '"+parend.getNodeValue()+"'"
				// );
			}
		}
	}

	// traverse children:
	for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
		getArticleTextMapping(walker, map);
	}

	// return position to the current (level up):
	walker.setCurrentNode(parend);
}

/**
 * Simple pair of an element-id based path (e.g. "someId[2]") and the text
 * found at that position; produced by {@link #getArticleTextMapping()}.
 */
protected class MappingNode {
	String id;
	String text;

	public MappingNode(String id, String text) {
		this.id = id;
		this.text = text;
	}

	public String getId() {
		return id;
	}

	public String getText() {
		return text;
	}

	@Override
	public String toString() {
		return "MappingNode(" + id + " -> " + text + ")";
	}
}

/**
 * Get the mapping between bits of text in the dom & their xpaths
 *
 * @return mapping from xpath to text, or null if no article content was
 *         found
 */
public List<MappingNode> getArticleTextMapping() {
	if (articleContent == null)
		return null;

	// walk text and element nodes below the article content only
	final TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT
			| NodeFilter.SHOW_ELEMENT, null, true);

	final List<MappingNode> map = new ArrayList<MappingNode>();

	getArticleTextMapping(walker, map);

	return map;
}

/**
 * Convenience method to build a {@link Readability} instance from an html
 * string.
 *
 * @param html
 *            The html string
 * @return new {@link Readability} instance.
 * @throws SAXException
 * @throws IOException
 */
public static Readability getReadability(String html) throws SAXException, IOException {
	return getReadability(html, false);
}

/**
 * Convenience method to build a {@link Readability} instance from an html
 * string.
 *
 * @param html
 *            The html string
 * @param addTitle
 *            Should the title be added to the generated article?
 * @return new {@link Readability} instance.
 * @throws SAXException
 * @throws IOException
 */
public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
	// NekoHTML DOM parser tolerates real-world (non-wellformed) HTML
	final DOMParser parser = new DOMParser();
	parser.parse(new InputSource(new StringReader(html)));

	return new Readability(parser.getDocument(), false, addTitle);
}

/**
 * Testing
 *
 * NOTE(review): this main fetches a live URL over the network; the
 * commented-out URLs below are alternative manual test inputs.
 *
 * @param argv
 * @throws Exception
 */
public static void main(String[] argv) throws Exception {
	// URL input = new
	// URL("file:///home/dd/Programming/Readability4J/t.html");
	// URL input = new
	// URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
	final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
	// URL input = new URL("http://euobserver.com/9/30465");
	// URL input = new URL("http://euobserver.com/?aid=23383");
	// URL input = new
	// URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
	// URL input = new URL("file:///Users/jsh2/Desktop/test.html");
	// URL input = new
	// URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
	// URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
	// URL input = new
	// URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
	// URL input = new
	// URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
	// URL input = new
	// URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
	// URL input = new URL("http://www.thebigproject.co.uk/news/");
	// URL input = new
	// URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
	// URL input = new
	// URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");

	// URL input = new
	// URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
	// URL input = new
	// URL("http://uk.mac.ign.com/articles/573/573319p1.html");
	final DOMParser parser = new DOMParser();
	parser.parse(new InputSource(input.openStream()));

	final Readability r = new Readability(parser.getDocument(), true, true);

	// System.out.println(r.getArticleTitle());
	System.out.println(r.getArticleHTML());
	// System.out.println(r.getAllLinks());
	// System.out.println(r.getArticleText());

	System.out.println();
	System.out.println("***");
	System.out.println();

	for (final MappingNode s : r.getArticleTextMapping())
		System.out.println(s);

	// PrintStream out = new PrintStream("news-sites");
	// for (Anchor anchor : r.getAllLinks()) {
	// out.println(anchor.getHref() + "\t" + anchor.getText());
	// }
	// out.close();

	System.out.println(r.getArticleImages());
	// System.out.println(r.getArticleSubheadings());
	// System.out.println(r.getArticleHTML());
	// System.out.println(r.getArticleHTML_DOM());

	// System.out.println(r.getArticleDateString());
	// System.out.println(r.getArticleDate());
}
}