View Javadoc

1   /**
2    * Copyright 2010 The University of Southampton, Yahoo Inc., and the
3    * individual contributors. All rights reserved.
4    *
5    * Licensed under the Apache License, Version 2.0 (the "License");
6    * you may not use this file except in compliance with the License.
7    * You may obtain a copy of the License at
8    *
9    *    http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.openimaj.web.readability;
18  
19  import java.io.IOException;
20  import java.io.StringReader;
21  import java.net.URL;
22  import java.text.ParseException;
23  import java.text.SimpleDateFormat;
24  import java.util.ArrayList;
25  import java.util.Date;
26  import java.util.EnumSet;
27  import java.util.List;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.cyberneko.html.parsers.DOMFragmentParser;
32  import org.cyberneko.html.parsers.DOMParser;
33  import org.pojava.datetime.DateTime;
34  import org.w3c.dom.DOMException;
35  import org.w3c.dom.Document;
36  import org.w3c.dom.DocumentFragment;
37  import org.w3c.dom.Element;
38  import org.w3c.dom.Node;
39  import org.w3c.dom.NodeList;
40  import org.w3c.dom.bootstrap.DOMImplementationRegistry;
41  import org.w3c.dom.ls.DOMImplementationLS;
42  import org.w3c.dom.ls.LSSerializer;
43  import org.w3c.dom.traversal.DocumentTraversal;
44  import org.w3c.dom.traversal.NodeFilter;
45  import org.w3c.dom.traversal.TreeWalker;
46  import org.xml.sax.InputSource;
47  import org.xml.sax.SAXException;
48  
49  /**
50   * Class for extracting the "content" from web-pages, and ignoring adverts, etc.
51   * Based upon readability.js (http://lab.arc90.com/experiments/readability/) and
52   * modified to behave better for certain sites (and typically better mimic
53   * Safari Reader functionality).
54   *
55   * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
56   * @author Michael Matthews (mikemat@yahoo-inc.com)
57   * @author David Dupplaw (dpd@ecs.soton.ac.uk)
58   */
59  public class Readability
60  {
	/**
	 * Regular expression patterns (as Strings) used to classify and rewrite
	 * page content. Each pattern embeds its flags (e.g. "(?i)") so it can be
	 * handed directly to String.replaceAll or Pattern.compile.
	 */
	protected static class Regexps {

		// Class/id fragments suggesting boilerplate (nav, ads, comment widgets, ...)
		public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner"; // caption?
		// Fragments that rescue a node from the "unlikely" list above
		public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
		// Class/id fragments that raise a node's content score
		public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
		// Class/id fragments that lower a node's content score
		public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
		// Tags whose presence stops a <div> being converted to a <p>
		public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
		// Two-or-more consecutive <br>s (with interleaved whitespace)
		public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
		// Opening/closing <font> tags, captured slash reused in the replacement
		public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
		// Leading/trailing whitespace (JS-style trim)
		public static String trimRe = "^\\s+|\\s+$";
		// Runs of internal whitespace to collapse
		public static String normalizeRe = "\\s{2,}";
		// One-or-more <br>s plus surrounding whitespace/&nbsp;
		public static String killBreaksRe = "(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}";
		// youtube/vimeo URLs — embeds matching this are kept, not cleaned
		public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";

		// Character class body of title separators: | - /
		public static String titleSeparatorRe = "\\|\\-\\/";

		// this is used to try and find elements that represent sub-headings
		// (that are not h1..h6)
		public static String likelySubheadCandidateRe = "(?i)cross-head";
	}
84  
	/**
	 * Optional processing stages. init() removes these one at a time and
	 * re-runs when the first pass yields too little content.
	 */
	enum Flag {
		// Drop nodes whose class/id matches unlikelyCandidatesRe
		FLAG_STRIP_UNLIKELYS,
		// Let class/id names influence the content score (getClassWeight)
		FLAG_WEIGHT_CLASSES
	}
89  
	/**
	 * Threshold for removing elements with lots of links
	 */
	public static float LINK_DENSITY_THRESHOLD = 0.33F;

	// IVARS below
	/** The document being processed; mutated in place. */
	protected Document document;
	/** Pristine clone of &lt;body&gt;, restored when init() retries with relaxed flags. */
	private Node bodyCache;
	/** Active processing toggles; emptied one flag at a time on re-runs. */
	protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);

	/** Extracted article title text. */
	protected String articleTitle;
	/** Root element of the extracted content, or null if extraction failed. */
	protected Element articleContent;
	/** Raw date string as found in the page, if any. */
	protected String article_date_string;
	/** Parsed publication date, if parseable. */
	protected Date article_date;
	/** Value of the Content-Type meta tag, if present. */
	protected String article_contentType;

	/** When true, dbg() prints to stderr. */
	protected boolean debug = false;

	/** When true, an h1 title node is prepended to the extracted content. */
	protected boolean addTitle = false;
109 
110 	/**
111 	 * Construct with the given document. Debugging is disabled.
112 	 *
113 	 * @param document
114 	 *            The document.
115 	 */
116 	public Readability(Document document) {
117 		this(document, false);
118 	}
119 
120 	/**
121 	 * Construct with the given document. The second argument can be used to
122 	 * enable debugging output.
123 	 *
124 	 * @param document
125 	 *            The document.
126 	 * @param debug
127 	 *            Enable debugging output.
128 	 */
129 	public Readability(Document document, boolean debug) {
130 		this(document, debug, false);
131 	}
132 
133 	/**
134 	 * Construct with the given document. The second argument can be used to
135 	 * enable debugging output. The third option controls whether the title
136 	 * should be included in the output.
137 	 *
138 	 * @param document
139 	 *            The document.
140 	 * @param debug
141 	 *            Enable debugging output.
142 	 * @param addTitle
143 	 *            Add title to output.
144 	 */
145 	public Readability(Document document, boolean debug, boolean addTitle) {
146 		this.debug = debug;
147 		this.document = document;
148 		this.addTitle = addTitle;
149 		augmentDocument(document);
150 		init();
151 	}
152 
153 	/**
154 	 * Iterates through all the ELEMENT nodes in a document and gives them ids
155 	 * if they don't already have them.
156 	 *
157 	 * @param document
158 	 */
159 	public static void augmentDocument(Document document) {
160 		final DocumentTraversal traversal = (DocumentTraversal) document;
161 
162 		final TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
163 
164 		traverseLevel(walker, 0);
165 	}
166 
167 	private static int traverseLevel(TreeWalker walker, int counter) {
168 		// describe current node:
169 		final Node parend = walker.getCurrentNode();
170 
171 		if (parend instanceof Element) {
172 			if (((Element) parend).getAttribute("id").length() == 0) {
173 				((Element) parend).setAttribute("id", "gen-id-" + counter);
174 				counter++;
175 			}
176 		}
177 
178 		// traverse children:
179 		for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
180 			counter = traverseLevel(walker, counter);
181 		}
182 
183 		// return position to the current (level up):
184 		walker.setCurrentNode(parend);
185 
186 		return counter;
187 	}
188 
189 	protected void dbg(String s) {
190 		if (debug)
191 			System.err.println(s);
192 	}
193 
194 	protected String getTitle() {
195 		final NodeList l = document.getElementsByTagName("title");
196 
197 		if (l.getLength() == 0)
198 			return "";
199 
200 		return l.item(0).getTextContent();
201 	}
202 
203 	/**
204 	 * Javascript-like String.match
205 	 *
206 	 * @param input
207 	 * @param regex
208 	 * @return
209 	 */
210 	protected String[] match(String input, String regex) {
211 		final Matcher matcher = Pattern.compile(regex).matcher(input);
212 		final List<String> matches = new ArrayList<String>();
213 
214 		while (matcher.find()) {
215 			matches.add(matcher.group(0));
216 		}
217 
218 		return matches.toArray(new String[matches.size()]);
219 	}
220 
221 	/**
222 	 * @return True if the article has any detected content; false otherwise.
223 	 */
224 	public boolean hasContent() {
225 		return articleContent != null;
226 	}
227 
228 	/**
229 	 * Javascript-like String.search
230 	 *
231 	 * @param input
232 	 * @param regex
233 	 * @return
234 	 */
235 	protected int search(String input, String regex) {
236 		final Matcher matcher = Pattern.compile(regex).matcher(input);
237 
238 		if (!matcher.find())
239 			return -1;
240 		return matcher.start();
241 	}
242 
243 	protected void findArticleEncoding() {
244 		final NodeList nl = document.getElementsByTagName("meta");
245 		for (int j = 0; j < nl.getLength(); j++) {
246 			if (((Element) nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) {
247 				article_contentType = ((Element) nl.item(j)).getAttribute("content");
248 				return;
249 			}
250 		}
251 
252 	}
253 
254 	protected void findArticleDate() {
255 		// <meta name="OriginalPublicationDate" content="2010/07/12 14:08:02"/>
256 		// <meta name="DC.date.issued" content="2010-07-12">
257 		NodeList nl = document.getElementsByTagName("meta");
258 		for (int j = 0; j < nl.getLength(); j++) {
259 			if (((Element) nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) {
260 				article_date_string = ((Element) nl.item(j)).getAttribute("content");
261 				article_date = DateTime.parse(article_date_string).toDate();
262 				return;
263 			}
264 			if (((Element) nl.item(j)).getAttribute("name").equals("DC.date.issued")) {
265 				article_date_string = ((Element) nl.item(j)).getAttribute("content");
266 				article_date = DateTime.parse(article_date_string).toDate();
267 				return;
268 			}
269 		}
270 
271 		// <time datetime="2010-07-12T10:26BST" pubdate>Monday 12 July 2010
272 		// 10.26 BST</time>
273 		nl = document.getElementsByTagName("time");
274 		for (int j = 0; j < nl.getLength(); j++) {
275 			if (((Element) nl.item(j)).getAttributeNode("pubdate") != null) {
276 				article_date_string = ((Element) nl.item(j)).getAttribute("datetime");
277 				article_date = DateTime.parse(article_date_string).toDate();
278 				return;
279 			}
280 		}
281 
282 		// <span class="date">14:08 GMT, Monday, 12 July 2010 15:08 UK</span>
283 		// <p class="date">09.07.2010 @ 17:49 CET</p>
284 		// <p class="date">Today @ 09:29 CET</p>
285 		nl = document.getElementsByTagName("*");
286 		for (int j = 0; j < nl.getLength(); j++) {
287 			if ((((Element) nl.item(j)).getAttribute("class").contains("date") ||
288 					((Element) nl.item(j)).getAttribute("class").contains("Date")) &&
289 					!(((Element) nl.item(j)).getAttribute("class").contains("update") ||
290 							((Element) nl.item(j)).getAttribute("class").contains("Update")))
291 			{
292 				article_date_string = getInnerTextSep(nl.item(j)).trim();
293 				parseDate();
294 				return;
295 			}
296 		}
297 		for (int j = 0; j < nl.getLength(); j++) {
298 			if ((((Element) nl.item(j)).getAttribute("id").contains("date") ||
299 					((Element) nl.item(j)).getAttribute("id").contains("Date")) &&
300 					!(((Element) nl.item(j)).getAttribute("id").contains("update") ||
301 							((Element) nl.item(j)).getAttribute("id").contains("Update")))
302 			{
303 				article_date_string = getInnerTextSep(nl.item(j)).trim();
304 				parseDate();
305 				return;
306 			}
307 		}
308 
309 		// Last updated at 3:05 PM on 12th July 2010
310 		nl = document.getElementsByTagName("*");
311 		for (int j = 0; j < nl.getLength(); j++) {
312 			final String text = nl.item(j).getTextContent();
313 
314 			if (text == null)
315 				continue;
316 
317 			final Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
318 			final Matcher m = p.matcher(text);
319 			if (m.find()) {
320 				article_date_string = m.group(1);
321 
322 				String cpy = article_date_string.replaceAll("th", "");
323 				cpy = cpy.replaceAll("st", "");
324 				cpy = cpy.replaceAll("nd", "");
325 				cpy = cpy.replaceAll("rd", "");
326 
327 				final SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy");
328 				try {
329 					article_date = sdf.parse(cpy);
330 				} catch (final ParseException e) {
331 				}
332 				return;
333 			}
334 		}
335 	}
336 
	/**
	 * Attempts to turn {@link #article_date_string} into a {@link Date}
	 * stored in {@link #article_date}, trying several concrete formats and
	 * finally the pojava heuristic parser. Parse failures are deliberately
	 * swallowed at each step: a missing date is preferable to aborting
	 * extraction.
	 */
	@SuppressWarnings("deprecation")
	protected void parseDate() {
		if (article_date_string == null || article_date_string.trim().isEmpty())
			return;

		if (article_date_string.contains("Today")) {
			// e.g. "Today @ 09:29 CET": parse the time-of-day, then copy
			// today's day/month/year over it (via deprecated Date setters).
			try {
				final SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z");
				article_date = sdf.parse(article_date_string);
				final Date now = new Date();
				article_date.setDate(now.getDate());
				article_date.setMonth(now.getMonth());
				article_date.setYear(now.getYear());
			} catch (final ParseException e) {
				// ignored: leave article_date unset
			}
		} else {
			// Try "14:08 GMT, Monday, 12 July 2010" style first...
			try {
				final SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy");
				article_date = sdf.parse(article_date_string);
			} catch (final ParseException e) {
				// ...then "09.07.2010 @ 17:49 CET"...
				try {
					final SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z");
					article_date = sdf.parse(article_date_string);
				} catch (final ParseException ee) {
					// ...then bare "dd/MM/yyyy"...
					try {
						final SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
						article_date = sdf.parse(article_date_string);
					} catch (final ParseException eee) {
						// ...finally the heuristic pojava parser.
						try {
							article_date = DateTime.parse(article_date_string).toDate();
						} catch (final IllegalArgumentException ie) {
							// ignored: date remains unset
						} catch (final java.lang.ArrayIndexOutOfBoundsException ie) {
							// pojava can throw this on odd input; note the string
							System.out.println(article_date_string);
						}
					}
				}
			}
		}
	}
376 
	/**
	 * Get the article title.
	 *
	 * Prefers the longest h1..h6 heading whose text also appears inside the
	 * document's &lt;title&gt;; failing that, strips site-name decoration
	 * (separator characters such as "|", "-", "/" or a leading "Site: ")
	 * from the title text.
	 *
	 * @return the best-guess article title (possibly the raw document title)
	 **/
	protected String findArticleTitle() {
		String curTitle = "", origTitle = "";

		curTitle = origTitle = getTitle();

		// Gather the text of every h1..h6 heading as a candidate title.
		final List<String> potentialTitles = new ArrayList<String>();
		for (int i = 1; i <= 6; i++) {
			final NodeList nl = document.getElementsByTagName("h" + i);
			if (nl.getLength() > 0) {
				for (int j = 0; j < nl.getLength(); j++)
					potentialTitles.add(nl.item(j).getTextContent().trim());
			}
		}

		// Pick the longest heading that is contained in the document title.
		String potentialTitle = null;
		int score = 0;
		for (final String s : potentialTitles) {
			if (s.length() > score && curTitle.contains(s)) {
				potentialTitle = s;
				score = s.length();
			}
		}
		if (potentialTitle != null)
			return potentialTitle;

		// Otherwise fall back to trimming the <title> text itself.
		if (match(curTitle, " [" + Regexps.titleSeparatorRe + "]+ ").length > 0)
		{
			// "Article | Site" -> keep the part before the separator.
			curTitle = origTitle.replaceAll("(.*) [" + Regexps.titleSeparatorRe + "]+ .*", "$1");

			// Too short? Keep the part after the separator instead.
			if (curTitle.split(" ").length < 3) {
				curTitle = origTitle.replaceAll("(?i)[^" + Regexps.titleSeparatorRe + "]*[" + Regexps.titleSeparatorRe
						+ "]+(.*)", "$1");
			}
		}
		else if (curTitle.indexOf(": ") != -1)
		{
			// "Site: Article" -> keep the part after the colon.
			curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");

			if (curTitle.split(" ").length < 3) {
				curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
			}
		}
		else if (curTitle.length() > 150 || curTitle.length() < 15)
		{
			// Implausibly long/short title: use a lone <h1> if there is one.
			final NodeList hOnes = document.getElementsByTagName("h1");
			if (hOnes.getLength() == 1)
			{
				curTitle = getInnerText((Element) hOnes.item(0));
			}
		}

		curTitle = curTitle.replaceAll(Regexps.trimRe, "");

		// If we ended up with almost nothing, revert to the original title.
		if (curTitle.split(" ").length <= 3) {
			curTitle = origTitle;
		}

		return curTitle;
	}
443 
444 	/**
445 	 * Equivalent to document.body in JS
446 	 *
447 	 * @return
448 	 */
449 	protected Element getBody() {
450 		final NodeList nl = document.getElementsByTagName("body");
451 
452 		if (nl.getLength() == 0)
453 			return null;
454 		else
455 			return (Element) nl.item(0);
456 	}
457 
	/**
	 * Runs readability.
	 *
	 * Workflow: 1. Prep the document by removing script tags, css, etc. 2.
	 * Build readability's DOM tree. 3. Grab the article content from the
	 * current dom tree. 4. Replace the current DOM tree with the new one. 5.
	 * Read peacefully.
	 *
	 **/
	protected void init() {
		// Cache a pristine copy of <body> so a re-run can start fresh.
		if (getBody() != null && bodyCache == null) {
			bodyCache = getBody().cloneNode(true);
		}

		findArticleDate(); // must be done before prepDocument()

		findArticleEncoding();

		prepDocument();

		/* Build readability's DOM tree */
		articleTitle = findArticleTitle();
		articleContent = grabArticle();

		/**
		 * If we attempted to strip unlikely candidates on the first run
		 * through, and we ended up with no content, that may mean we stripped
		 * out the actual content so we couldn't parse it. So re-run init while
		 * preserving unlikely candidates to have a better shot at getting our
		 * content out properly. Flags are removed one per retry, so init()
		 * recurses at most twice.
		 **/
		if (getInnerText(articleContent, false).length() < 250)
		{
			if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
				flags.remove(Flag.FLAG_STRIP_UNLIKELYS);
				getBody().getParentNode().replaceChild(bodyCache, getBody());
				init();
				return;
			}
			else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
				flags.remove(Flag.FLAG_WEIGHT_CLASSES);
				getBody().getParentNode().replaceChild(bodyCache, getBody());
				init();
				return;
			}
			else {
				// Both relaxations exhausted: report no content.
				articleContent = null;
			}
		}

		// Optionally prepend the extracted title as <h1 id="title">.
		if (addTitle && articleContent != null) {
			final Element titleNode = document.createElement("h1");
			titleNode.setAttribute("id", "title");
			titleNode.appendChild(document.createTextNode(getArticleTitle()));
			articleContent.insertBefore(titleNode, articleContent.getFirstChild());
		}
	}
515 
	/**
	 * Prepare the HTML document for readability to scrape it. This includes
	 * things like stripping javascript, CSS, and handling terrible markup.
	 *
	 **/
	protected void prepDocument() {
		/**
		 * In some cases a body element can't be found (if the HTML is totally
		 * hosed for example) so we create a new body node and append it to the
		 * document.
		 */
		if (getBody() == null)
		{
			final Node body = document.createElement("body");
			document.appendChild(body);
		}

		// NOTE: frames are not supported in this version — the frame-selection
		// logic of the original readability.js was not ported.

		/* remove all scripts that are not readability */
		final NodeList scripts = document.getElementsByTagName("script");
		for (int i = scripts.getLength() - 1; i >= 0; i--)
		{
			scripts.item(i).getParentNode().removeChild(scripts.item(i));
		}

		/* Remove all style tags in head */
		final NodeList styleTags = document.getElementsByTagName("style");
		for (int st = styleTags.getLength() - 1; st >= 0; st--) {
			styleTags.item(st).getParentNode().removeChild(styleTags.item(st));
		}

		/* Remove all meta tags */
		final NodeList metaTags = document.getElementsByTagName("meta");
		for (int mt = metaTags.getLength() - 1; mt >= 0; mt--) {
			metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt));
		}

		/*
		 * Turn all double br's into p's, and <font> into <span>.
		 *
		 * Note, this is pretty costly as far as processing goes: the whole
		 * body is serialised, rewritten with regexes, then re-parsed and
		 * re-attached. Maybe optimize later.
		 */
		final Element body = getBody();

		// This is slow!
		final Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(
				Regexps.replaceFontsRe, "<$1span>"));
		removeChildren(body);
		body.appendChild(frag);

		/* Remove all comments */
		removeComments(document);
	}
617 
618 	protected void removeComments(Node n) {
619 		if (n.getNodeType() == Node.COMMENT_NODE) {
620 			n.getParentNode().removeChild(n);
621 		} else {
622 			final NodeList nl = n.getChildNodes();
623 			for (int i = nl.getLength() - 1; i >= 0; i--)
624 				removeComments(nl.item(i));
625 		}
626 	}
627 
628 	/**
629 	 * Prepare the article node for display. Clean out any inline styles,
630 	 * iframes, forms, strip extraneous
631 	 * <p>
632 	 * tags, etc.
633 	 *
634 	 * @param Element
635 	 **/
636 	protected void prepArticle(Element articleContent) {
637 		cleanStyles(articleContent);
638 		killBreaks(articleContent);
639 
640 		/* Clean out junk from the article content */
641 		clean(articleContent, "form");
642 		clean(articleContent, "object");
643 		clean(articleContent, "h1");
644 		/**
645 		 * If there is only one h2, they are probably using it as a header and
646 		 * not a subheader, so remove it since we already have a header.
647 		 ***/
648 		if (articleContent.getElementsByTagName("h2").getLength() == 1) {
649 			clean(articleContent, "h2");
650 		}
651 		clean(articleContent, "iframe");
652 
653 		cleanHeaders(articleContent);
654 
655 		/*
656 		 * Do these last as the previous stuff may have removed junk that will
657 		 * affect these
658 		 */
659 		cleanConditionally(articleContent, "table");
660 		cleanConditionally(articleContent, "ul");
661 		cleanConditionally(articleContent, "div");
662 
663 		/* Remove extra paragraphs */
664 		final NodeList articleParagraphs = articleContent.getElementsByTagName("p");
665 		for (int i = articleParagraphs.getLength() - 1; i >= 0; i--)
666 		{
667 			final int imgCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength();
668 			final int embedCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength();
669 			final int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength();
670 
671 			if (imgCount == 0 && embedCount == 0 && objectCount == 0
672 					&& getInnerText((Element) articleParagraphs.item(i), false) == "")
673 			{
674 				articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i));
675 			}
676 		}
677 
678 		// articleContent.innerHTML =
679 		// articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, "<p");
680 		final Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
681 		removeChildren(articleContent);
682 		articleContent.appendChild(n);
683 
684 		// now remove empty p's and tidy up
685 		final NodeList nl = articleContent.getElementsByTagName("p");
686 		for (int i = nl.getLength() - 1; i >= 0; i--) {
687 			if (nl.item(i).getTextContent().trim().length() == 0)
688 			{
689 				nl.item(i).getParentNode().removeChild(nl.item(i));
690 			} else if (nl.item(i).getChildNodes().getLength() == 1
691 					&& nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE)
692 			{
693 				nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n");
694 			}
695 			else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled"))
696 			{
697 				nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i));
698 			}
699 		}
700 
701 	}
702 
703 	protected void removeChildren(Node n) {
704 		final NodeList nl = n.getChildNodes();
705 		final int nn = nl.getLength();
706 		for (int i = 0; i < nn; i++)
707 			n.removeChild(nl.item(0));
708 	}
709 
710 	/**
711 	 * Initialize a node with the readability object. Also checks the
712 	 * className/id for special names to add to its score.
713 	 *
714 	 * @param Element
715 	 **/
716 	protected void initializeNode(Element node) {
717 		float contentScore = 0;
718 
719 		if (node.getTagName() == "DIV") {
720 			contentScore += 5;
721 		} else if (node.getTagName() == "PRE" || node.getTagName() == "TD" || node.getTagName() == "BLOCKQUOTE") {
722 			contentScore += 3;
723 		} else if (node.getTagName() == "ADDRESS" || node.getTagName() == "OL" || node.getTagName() == "UL"
724 				|| node.getTagName() == "DL" || node.getTagName() == "DD" || node.getTagName() == "DT"
725 				|| node.getTagName() == "LI" || node.getTagName() == "FORM")
726 		{
727 			contentScore -= 3;
728 		} else if (node.getTagName() == "H1" || node.getTagName() == "H2" || node.getTagName() == "H3"
729 				|| node.getTagName() == "H4" || node.getTagName() == "H5" || node.getTagName() == "H6"
730 				|| node.getTagName() == "TH")
731 		{
732 			contentScore -= 5;
733 		}
734 
735 		contentScore += getClassWeight(node);
736 		node.setUserData("readability", contentScore, null);
737 	}
738 
739 	/**
740 	 * Get an elements class/id weight. Uses regular expressions to tell if this
741 	 * element looks good or bad.
742 	 *
743 	 * @param Element
744 	 * @return number (Integer)
745 	 **/
746 	protected int getClassWeight(Element e) {
747 		if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
748 			return 0;
749 		}
750 
751 		int weight = 0;
752 
753 		/* Look for a special classname */
754 		if (e.getAttribute("class") != "")
755 		{
756 			if (search(e.getAttribute("class"), Regexps.negativeRe) != -1) {
757 				weight -= 25;
758 			}
759 
760 			if (search(e.getAttribute("class"), Regexps.positiveRe) != -1) {
761 				weight += 25;
762 			}
763 		}
764 
765 		/* Look for a special ID */
766 		if (e.getAttribute("id") != "")
767 		{
768 			if (search(e.getAttribute("id"), Regexps.negativeRe) != -1) {
769 				weight -= 25;
770 			}
771 
772 			if (search(e.getAttribute("id"), Regexps.positiveRe) != -1) {
773 				weight += 25;
774 			}
775 		}
776 
777 		return weight;
778 	}
779 
780 	protected void cleanStyles() {
781 		cleanStyles((Element) document);
782 	}
783 
784 	/**
785 	 * Remove the style attribute on every e and under. TODO: Test if
786 	 * getElementsByTagName(*) is faster.
787 	 *
788 	 * @param Element
789 	 **/
790 	protected void cleanStyles(Element e) {
791 		if (e == null)
792 			return;
793 		Node cur = e.getFirstChild();
794 
795 		// Remove any root styles, if we"re able.
796 		if (!e.getAttribute("class").equals("readability-styled"))
797 			e.removeAttribute("style");
798 
799 		// Go until there are no more child nodes
800 		while (cur != null) {
801 			if (cur.getNodeType() == Element.ELEMENT_NODE) {
802 				// Remove style attribute(s) :
803 				if (!((Element) cur).getAttribute("class").equals("readability-styled")) {
804 					((Element) cur).removeAttribute("style");
805 				}
806 				cleanStyles((Element) cur);
807 			}
808 			cur = cur.getNextSibling();
809 		}
810 	}
811 
812 	/**
813 	 * Remove extraneous break tags from a node.
814 	 *
815 	 * @param Element
816 	 **/
817 	protected void killBreaks(Element e) {
818 		// e.innerHTML =
819 		// e.innerHTML.replace(readability.regexps.killBreaksRe,"<br />");
820 
821 		final Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe, "<BR />"));
822 		removeChildren(e);
823 		e.appendChild(n);
824 	}
825 
826 	/**
827 	 * Clean a node of all elements of type "tag". (Unless it"s a youtube/vimeo
828 	 * video. People love movies.)
829 	 *
830 	 * @param Element
831 	 * @param string
832 	 *            tag to clean
833 	 **/
834 	protected void clean(Element e, String tag) {
835 		final NodeList targetList = e.getElementsByTagName(tag);
836 		final boolean isEmbed = (tag.equals("object") || tag.equals("embed"));
837 
838 		for (int y = targetList.getLength() - 1; y >= 0; y--) {
839 			/*
840 			 * Allow youtube and vimeo videos through as people usually want to
841 			 * see those.
842 			 */
843 			if (isEmbed) {
844 				String attributeValues = "";
845 				for (int i = 0, il = targetList.item(y).getAttributes().getLength(); i < il; i++) {
846 					attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|";
847 				}
848 
849 				/*
850 				 * First, check the elements attributes to see if any of them
851 				 * contain youtube or vimeo
852 				 */
853 				if (search(attributeValues, Regexps.videoRe) != -1) {
854 					continue;
855 				}
856 
857 				/* Then check the elements inside this element for the same. */
858 				if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) {
859 					continue;
860 				}
861 			}
862 
863 			targetList.item(y).getParentNode().removeChild(targetList.item(y));
864 		}
865 	}
866 
867 	/**
868 	 * Clean out spurious headers from an Element. Checks things like classnames
869 	 * and link density.
870 	 *
871 	 * @param Element
872 	 **/
873 	protected void cleanHeaders(Element e) {
874 		for (int headerIndex = 1; headerIndex < 7; headerIndex++) {
875 			final NodeList headers = e.getElementsByTagName("h" + headerIndex);
876 			for (int i = headers.getLength() - 1; i >= 0; i--) {
877 				if (getClassWeight((Element) headers.item(i)) < 0
878 						|| getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD)
879 				{
880 					headers.item(i).getParentNode().removeChild(headers.item(i));
881 				}
882 			}
883 		}
884 	}
885 
886 	/**
887 	 * Get the density of links as a percentage of the content This is the
888 	 * amount of text that is inside a link divided by the total text in the
889 	 * node.
890 	 *
891 	 * @param Element
892 	 * @return number (float)
893 	 **/
894 	protected float getLinkDensity(Element e) {
895 		final NodeList links = e.getElementsByTagName("a");
896 		final int textLength = getInnerText(e).length();
897 		int linkLength = 0;
898 
899 		for (int i = 0, il = links.getLength(); i < il; i++)
900 		{
901 			linkLength += getInnerText((Element) links.item(i)).length();
902 		}
903 
904 		if (linkLength == 0)
905 			return 0;
906 
907 		return (float) linkLength / (float) textLength;
908 	}
909 
910 	/**
911 	 * Clean an element of all tags of type "tag" if they look fishy. "Fishy" is
912 	 * an algorithm based on content length, classnames, link density, number of
913 	 * images & embeds, etc.
914 	 **/
915 	protected void cleanConditionally(Element e, String tag) {
916 		final NodeList tagsList = e.getElementsByTagName(tag);
917 		final int curTagsLength = tagsList.getLength();
918 
919 		/**
920 		 * Gather counts for other typical elements embedded within. Traverse
921 		 * backwards so we can remove nodes at the same time without effecting
922 		 * the traversal.
923 		 *
924 		 * Todo: Consider taking into account original contentScore here.
925 		 **/
926 		for (int i = curTagsLength - 1; i >= 0; i--) {
927 			final int weight = getClassWeight((Element) tagsList.item(i));
928 			final float contentScore = (tagsList.item(i).getUserData("readability") != null) ? (Float) (tagsList.item(i)
929 					.getUserData("readability")) : 0;
930 
931 			dbg("Cleaning Conditionally "
932 					+ tagsList.item(i)
933 					+ " ("
934 					+ ((Element) tagsList.item(i)).getAttribute("class")
935 					+ ":"
936 					+ ((Element) tagsList.item(i)).getAttribute("id")
937 					+ ")"
938 					+ ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i)
939 							.getUserData("readability")) : ""));
940 
941 			if (weight + contentScore < 0)
942 			{
943 				dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
944 						+ ((Element) tagsList.item(i)).getAttribute("id") + ")");
945 				tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
946 			}
947 			else if (getCharCount((Element) tagsList.item(i), ",") < 10) {
948 				/**
949 				 * If there are not very many commas, and the number of
950 				 * non-paragraph elements is more than paragraphs or other
951 				 * ominous signs, remove the element.
952 				 **/
953 				final int p = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength();
954 				final int img = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength();
955 				final int li = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength() - 100;
956 				final int input = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength();
957 
958 				int embedCount = 0;
959 				final NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed");
960 				for (int ei = 0, il = embeds.getLength(); ei < il; ei++) {
961 					if (search(((Element) embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) {
962 						embedCount++;
963 					}
964 				}
965 
966 				final float linkDensity = getLinkDensity((Element) tagsList.item(i));
967 				final int contentLength = getInnerText((Element) tagsList.item(i)).length();
968 				boolean toRemove = false;
969 
970 				if (img > p) {
971 					toRemove = true;
972 				} else if (li > p && tag != "ul" && tag != "ol") {
973 					toRemove = true;
974 				} else if (input > Math.floor(p / 3)) {
975 					toRemove = true;
976 				} else if (contentLength < 25 && (img == 0 || img > 2)) {
977 					toRemove = true;
978 				} else if (weight < 25 && linkDensity > 0.2) {
979 					toRemove = true;
980 				} else if (weight >= 25 && linkDensity > 0.5) {
981 					toRemove = true;
982 				} else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
983 					toRemove = true;
984 				}
985 
986 				if (img == 1 && p == 0 && contentLength == 0) {
987 					final Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0);
988 
989 					String w = "";
990 					if (theImg.getAttribute("width") != null)
991 						w = theImg.getAttribute("width");
992 
993 					String h = "";
994 					if (theImg.getAttribute("height") != null)
995 						h = theImg.getAttribute("height");
996 
997 					if (!(w.equals("0") || h.equals("0")))
998 						toRemove = false; // special case - it's just an inline
999 					// image
1000 				}
1001 
1002 				if (toRemove) {
1003 					dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
1004 							+ ((Element) tagsList.item(i)).getAttribute("id") + ")");
1005 					tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
1006 				}
1007 			}
1008 		}
1009 	}
1010 
1011 	/**
1012 	 * Get the number of times a string s appears in the node e.
1013 	 *
1014 	 * @param Element
1015 	 * @param string
1016 	 *            - what to split on. Default is ","
1017 	 * @return number (integer)
1018 	 **/
1019 	protected int getCharCount(Element e, String s) {
1020 		return getInnerText(e).split(s).length - 1;
1021 	}
1022 
	/**
	 * Count the commas in the text of node e; convenience for
	 * {@link #getCharCount(Element, String)} with ",".
	 */
	protected int getCharCount(Element e) {
		return getCharCount(e, ",");
	}
1026 
1027 	/**
1028 	 * @return The article title
1029 	 */
1030 	public String getArticleTitle() {
1031 		return articleTitle;
1032 	}
1033 
1034 	/**
1035 	 * @return The content type of the article
1036 	 */
1037 	public String getArticleContentType() {
1038 		return article_contentType;
1039 	}
1040 
1041 	/***
1042 	 * grabArticle - Using a variety of metrics (content score, classname,
1043 	 * element types), find the content that is most likely to be the stuff a
1044 	 * user wants to read. Then return it wrapped up in a div.
1045 	 *
1046 	 * @return Element
1047 	 **/
1048 	protected Element grabArticle() {
1049 		final boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS);
1050 
1051 		/**
1052 		 * First, node prepping. Trash nodes that look cruddy (like ones with
1053 		 * the class name "comment", etc), and turn divs into P tags where they
1054 		 * have been used inappropriately (as in, where they contain no other
1055 		 * block level elements.)
1056 		 *
1057 		 * Note: Assignment from index for performance. See
1058 		 * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 Todo:
1059 		 * Shouldn't this be a reverse traversal?
1060 		 **/
1061 		Element node = null;
1062 		final List<Element> nodesToScore = new ArrayList<Element>();
1063 		for (int nodeIndex = 0; (node = (Element) document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++)
1064 		{
1065 			/* Remove unlikely candidates */
1066 			if (stripUnlikelyCandidates) {
1067 				final String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id");
1068 				if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 &&
1069 						search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 &&
1070 						!node.getTagName().equals("BODY"))
1071 				{
1072 					dbg("Removing unlikely candidate - " + unlikelyMatchString);
1073 					node.getParentNode().removeChild(node);
1074 					nodeIndex--;
1075 					continue;
1076 				}
1077 			}
1078 
1079 			if (node.getTagName().equals("P") || node.getTagName().equals("TD")) {
1080 				nodesToScore.add(node);
1081 			}
1082 
1083 			/*
1084 			 * Turn all divs that don't have children block level elements into
1085 			 * p's
1086 			 */
1087 			if (node.getTagName().equals("DIV")) {
1088 
1089 				if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) {
1090 					dbg("Altering div to p");
1091 					final Element newNode = document.createElement("P");
1092 
1093 					// newNode.innerHTML = node.innerHTML;
1094 					final NodeList nl = node.getChildNodes();
1095 					for (int i = 0; i < nl.getLength(); i++)
1096 						newNode.appendChild(nl.item(i));
1097 
1098 					node.getParentNode().replaceChild(newNode, node);
1099 					nodeIndex--;
1100 				}
1101 				else
1102 				{
1103 					/* EXPERIMENTAL */
1104 					for (int i = 0, il = node.getChildNodes().getLength(); i < il; i++) {
1105 						final Node childNode = node.getChildNodes().item(i);
1106 						if (childNode.getNodeType() == Element.TEXT_NODE) {
1107 							dbg("replacing text node with a p tag with the same content.");
1108 							final Element p = document.createElement("p");
1109 							// p.innerHTML = childNode.nodeValue;
1110 							p.setNodeValue(childNode.getNodeValue());
1111 							p.setTextContent(childNode.getTextContent());
1112 							// p.style.display = "inline";
1113 							p.setAttribute("class", "readability-styled");
1114 							childNode.getParentNode().replaceChild(p, childNode);
1115 						}
1116 					}
1117 				}
1118 			}
1119 		}
1120 
1121 		/**
1122 		 * Loop through all paragraphs, and assign a score to them based on how
1123 		 * content-y they look. Then add their score to their parent node.
1124 		 *
1125 		 * A score is determined by things like number of commas, class names,
1126 		 * etc. Maybe eventually link density.
1127 		 **/
1128 		final List<Element> candidates = new ArrayList<Element>();
1129 		for (int pt = 0; pt < nodesToScore.size(); pt++) {
1130 			final Element parentNode = (Element) nodesToScore.get(pt).getParentNode();
1131 			final Element grandParentNode = (Element) parentNode.getParentNode();
1132 			final String innerText = getInnerText(nodesToScore.get(pt));
1133 
1134 			/*
1135 			 * If this paragraph is less than 25 characters, don't even count
1136 			 * it.
1137 			 */
1138 			if (innerText.length() < 25) {
1139 				continue;
1140 			}
1141 
1142 			/* Initialize readability data for the parent. */
1143 			if (parentNode.getUserData("readability") == null)
1144 			{
1145 				initializeNode(parentNode);
1146 				candidates.add(parentNode);
1147 			}
1148 
1149 			/* Initialize readability data for the grandparent. */
1150 			if (grandParentNode.getUserData("readability") == null)
1151 			{
1152 				initializeNode(grandParentNode);
1153 				candidates.add(grandParentNode);
1154 			}
1155 
1156 			float contentScore = 0;
1157 
1158 			/* Add a point for the paragraph itself as a base. */
1159 			contentScore++;
1160 
1161 			/* Add points for any commas within this paragraph */
1162 			contentScore += innerText.split(",").length;
1163 
1164 			/*
1165 			 * For every 100 characters in this paragraph, add another point. Up
1166 			 * to 3 points.
1167 			 */
1168 			contentScore += Math.min(Math.floor(innerText.length() / 100F), 3F);
1169 
1170 			/* Add the score to the parent. The grandparent gets half. */
1171 			parentNode.setUserData("readability", ((Float) (parentNode.getUserData("readability")) + contentScore), null);
1172 			grandParentNode.setUserData("readability", ((Float) (grandParentNode.getUserData("readability")))
1173 					+ (contentScore / 2F), null);
1174 		}
1175 
1176 		/**
1177 		 * After we've calculated scores, loop through all of the possible
1178 		 * candidate nodes we found and find the one with the highest score.
1179 		 **/
1180 		Element topCandidate = null;
1181 		for (int c = 0, cl = candidates.size(); c < cl; c++)
1182 		{
1183 			/**
1184 			 * Scale the final candidates score based on link density. Good
1185 			 * content should have a relatively small link density (5% or less)
1186 			 * and be mostly unaffected by this operation.
1187 			 **/
1188 
1189 			candidates.get(c).setUserData("readability",
1190 					(Float) (candidates.get(c).getUserData("readability")) * (1F - getLinkDensity(candidates.get(c))),
1191 					null);
1192 
1193 			dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class") + ":"
1194 					+ candidates.get(c).getAttribute("id") + ") with score "
1195 					+ candidates.get(c).getUserData("readability"));
1196 
1197 			if (topCandidate == null
1198 					|| (Float) (candidates.get(c).getUserData("readability")) > ((Float) topCandidate
1199 							.getUserData("readability")))
1200 			{
1201 				topCandidate = candidates.get(c);
1202 			}
1203 		}
1204 
1205 		if (topCandidate != null)
1206 			dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class") + ":"
1207 					+ topCandidate.getAttribute("id") + ") with score " + topCandidate.getUserData("readability"));
1208 
1209 		/**
1210 		 * If we still have no top candidate, just use the body as a last
1211 		 * resort. We also have to copy the body node so it is something we can
1212 		 * modify.
1213 		 **/
1214 		if (topCandidate == null || topCandidate.getTagName().equals("BODY"))
1215 		{
1216 			topCandidate = document.createElement("DIV");
1217 
1218 			// topCandidate.innerHTML = document.body.innerHTML;
1219 			final NodeList nl = getBody().getChildNodes();
1220 			for (int i = 0; i < nl.getLength(); i++)
1221 				topCandidate.appendChild(nl.item(i));
1222 			// document.body.innerHTML = ""; //should be covered by above
1223 
1224 			getBody().appendChild(topCandidate);
1225 			initializeNode(topCandidate);
1226 		}
1227 
1228 		/**
1229 		 * Now that we have the top candidate, look through its siblings for
1230 		 * content that might also be related. Things like preambles, content
1231 		 * split by ads that we removed, etc.
1232 		 **/
1233 		final Element articleContent = document.createElement("DIV");
1234 		articleContent.setAttribute("id", "readability-content");
1235 		final float siblingScoreThreshold = Math.max(10F, (Float) topCandidate.getUserData("readability") * 0.2F);
1236 		final NodeList siblingNodes = topCandidate.getParentNode().getChildNodes();
1237 
1238 		for (int s = 0, sl = siblingNodes.getLength(); s < sl; s++)
1239 		{
1240 			final Node siblingNode = siblingNodes.item(s);
1241 			boolean append = false;
1242 
1243 			if (siblingNode instanceof Element)
1244 				dbg("Looking at sibling node: "
1245 						+ siblingNode
1246 						+ " ("
1247 						+ ((Element) siblingNode).getAttribute("class")
1248 						+ ":"
1249 						+ ((Element) siblingNode).getAttribute("id")
1250 						+ ")"
1251 						+ ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode
1252 								.getUserData("readability")) : ""));
1253 			dbg("Sibling has score "
1254 					+ (siblingNode.getUserData("readability") != null ? siblingNode.getUserData("readability")
1255 							: "Unknown"));
1256 
1257 			if (siblingNode == topCandidate)
1258 			{
1259 				append = true;
1260 			}
1261 
1262 			float contentBonus = 0;
1263 			/*
1264 			 * Give a bonus if sibling nodes and top candidates have the example
1265 			 * same classname
1266 			 */
1267 			if (siblingNode instanceof Element
1268 					&& ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class"))
1269 					&& !topCandidate.getAttribute("class").equals(""))
1270 			{
1271 				contentBonus += (Float) topCandidate.getUserData("readability") * 0.2F;
1272 			}
1273 
1274 			if (siblingNode.getUserData("readability") != null
1275 					&& ((Float) siblingNode.getUserData("readability") + contentBonus) >= siblingScoreThreshold)
1276 			{
1277 				append = true;
1278 			}
1279 
1280 			if (siblingNode.getNodeName().equals("P")) {
1281 				final float linkDensity = getLinkDensity((Element) siblingNode);
1282 				final String nodeContent = getInnerText((Element) siblingNode);
1283 				final int nodeLength = nodeContent.length();
1284 
1285 				if (nodeLength > 80 && linkDensity < 0.25)
1286 				{
1287 					append = true;
1288 				}
1289 				else if (nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1)
1290 				{
1291 					append = true;
1292 				}
1293 			}
1294 
1295 			if (append)
1296 			{
1297 				dbg("Appending node: " + siblingNode);
1298 
1299 				Node nodeToAppend = null;
1300 				if (!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) {
1301 					/*
1302 					 * We have a node that isn't a common block level element,
1303 					 * like a form or td tag. Turn it into a div so it doesn't
1304 					 * get filtered out later by accident.
1305 					 */
1306 
1307 					dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div.");
1308 					nodeToAppend = document.createElement("div");
1309 					if (siblingNode instanceof Element)
1310 						((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id"));
1311 
1312 					// nodeToAppend.innerHTML = siblingNode.innerHTML;
1313 					final NodeList nl = siblingNode.getChildNodes();
1314 					for (int i = 0; i < nl.getLength(); i++)
1315 						nodeToAppend.appendChild(nl.item(i));
1316 				} else {
1317 					nodeToAppend = siblingNode;
1318 					s--;
1319 					sl--;
1320 				}
1321 
1322 				/*
1323 				 * To ensure a node does not interfere with readability styles,
1324 				 * remove its classnames
1325 				 */
1326 				if (nodeToAppend instanceof Element)
1327 					((Element) nodeToAppend).setAttribute("class", "");
1328 
1329 				/*
1330 				 * Append sibling and subtract from our list because it removes
1331 				 * the node when you append to another node
1332 				 */
1333 				articleContent.appendChild(nodeToAppend);
1334 			}
1335 		}
1336 
1337 		/**
1338 		 * So we have all of the content that we need. Now we clean it up for
1339 		 * presentation.
1340 		 **/
1341 		prepArticle(articleContent);
1342 
1343 		return articleContent;
1344 	}
1345 
1346 	protected String getInnerHTML(Node n) {
1347 		if (n.getNodeType() == Node.TEXT_NODE)
1348 			return n.getTextContent();
1349 
1350 		String result = "";
1351 		final NodeList nl = n.getChildNodes();
1352 		for (int i = 0; i < nl.getLength(); i++) {
1353 			if (nl.item(i).getNodeType() == Node.TEXT_NODE)
1354 				result += nl.item(i).getTextContent();
1355 			else if (nl.item(i).getNodeType() == Node.COMMENT_NODE)
1356 				result += "<!-- " + nl.item(i).getTextContent() + " -->";
1357 			else
1358 				result += nodeToString(nl.item(i));
1359 		}
1360 
1361 		return result;
1362 	}
1363 
	/**
	 * Serialise a node to a string without pretty-printing; convenience for
	 * {@link #nodeToString(Node, boolean)}.
	 */
	protected String nodeToString(Node n) {
		return nodeToString(n, false);
	}
1367 
1368 	protected static String nodeToString(Node n, boolean pretty) {
1369 		try {
1370 			final DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
1371 			final DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS");
1372 			final LSSerializer writer = impl.createLSSerializer();
1373 
1374 			writer.getDomConfig().setParameter("xml-declaration", false);
1375 			if (pretty) {
1376 				writer.getDomConfig().setParameter("format-pretty-print", true);
1377 			}
1378 
1379 			return writer.writeToString(n);
1380 		} catch (final Exception e) {
1381 			throw new RuntimeException(e);
1382 		}
1383 	}
1384 
1385 	protected Node stringToNode(String str) {
1386 		try {
1387 			final DOMFragmentParser parser = new DOMFragmentParser();
1388 			final DocumentFragment fragment = document.createDocumentFragment();
1389 			parser.parse(new InputSource(new StringReader(str)), fragment);
1390 			return fragment;
1391 
1392 			// try and return the element itself if possible...
1393 			// NodeList nl = fragment.getChildNodes();
1394 			// for (int i=0; i<nl.getLength(); i++) if (nl.item(i).getNodeType()
1395 			// == Node.ELEMENT_NODE) return nl.item(i);
1396 			// return fragment;
1397 
1398 		} catch (final Exception e) {
1399 			throw new RuntimeException(e);
1400 		}
1401 	}
1402 
1403 	/**
1404 	 * Get the inner text of a node - cross browser compatibly. This also strips
1405 	 * out any excess whitespace to be found.
1406 	 *
1407 	 * @param Element
1408 	 * @return string
1409 	 **/
1410 	protected String getInnerText(Element e, boolean normalizeSpaces) {
1411 		String textContent = "";
1412 
1413 		textContent = e.getTextContent().replaceAll(Regexps.trimRe, "");
1414 
1415 		if (normalizeSpaces) {
1416 			return textContent.replaceAll(Regexps.normalizeRe, " ");
1417 		} else {
1418 			return textContent;
1419 		}
1420 	}
1421 
1422 	protected String getInnerTextSep(Node e) {
1423 		if (e.hasChildNodes()) {
1424 			String s = "";
1425 			final NodeList nl = e.getChildNodes();
1426 			for (int i = 0; i < nl.getLength(); i++) {
1427 				if (!nl.item(i).getNodeName().equalsIgnoreCase("script"))
1428 					s += getInnerTextSep(nl.item(i));
1429 			}
1430 			return s;
1431 		} else {
1432 			return e.getTextContent() + " ";
1433 		}
1434 	}
1435 
	/**
	 * Get the inner text of an element with whitespace normalised; convenience
	 * for {@link #getInnerText(Element, boolean)}.
	 */
	protected String getInnerText(Element e) {
		return getInnerText(e, true);
	}
1439 
1440 	/**
1441 	 * @return The article HTML content as a {@link String}.
1442 	 */
1443 	public String getArticleHTML() {
1444 		if (articleContent == null)
1445 			return "";
1446 		return nodeToString(articleContent, true);
1447 	}
1448 
1449 	/**
1450 	 * @return The articles HTML dom node.
1451 	 */
1452 	public Node getArticleHTML_DOM() {
1453 		return articleContent;
1454 	}
1455 
	/**
	 * @return The raw article date string, if one was found.
	 */
	protected String getArticleDateString() {
		return article_date_string;
	}
1459 
1460 	/**
1461 	 * @return The article date.
1462 	 */
1463 	public Date getArticleDate() {
1464 		return article_date;
1465 	}
1466 
1467 	/**
1468 	 * @return The text of the article.
1469 	 */
1470 	public String getArticleText() {
1471 		if (articleContent == null)
1472 			return "Unable to find article content";
1473 		// return getInnerText(articleContent, false);
1474 		return articleContent.getTextContent().trim().replaceAll("[\r|\n|\r\n]{2,}", "\n\n").replaceAll(" {2,}", " ");
1475 	}
1476 
1477 	/**
1478 	 * @return Any links in the article.
1479 	 */
1480 	public List<Anchor> getArticleLinks() {
1481 		final List<Anchor> anchors = new ArrayList<Anchor>();
1482 		if (articleContent == null)
1483 			return anchors;
1484 
1485 		final NodeList nl = articleContent.getElementsByTagName("a");
1486 		for (int i = 0; i < nl.getLength(); i++) {
1487 			final Element a = (Element) nl.item(i);
1488 
1489 			final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1490 			anchors.add(anchor);
1491 		}
1492 		return anchors;
1493 	}
1494 
1495 	/**
1496 	 * @return Any links in the document.
1497 	 */
1498 	public List<Anchor> getAllLinks() {
1499 		final List<Anchor> anchors = new ArrayList<Anchor>();
1500 
1501 		final NodeList nl = document.getElementsByTagName("a");
1502 		for (int i = 0; i < nl.getLength(); i++) {
1503 			final Element a = (Element) nl.item(i);
1504 			final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1505 			anchors.add(anchor);
1506 		}
1507 		return anchors;
1508 	}
1509 
1510 	/**
1511 	 * @return Any images in the article.
1512 	 */
1513 	public List<String> getArticleImages() {
1514 		final List<String> images = new ArrayList<String>();
1515 		if (articleContent == null)
1516 			return images;
1517 
1518 		final NodeList nl = articleContent.getElementsByTagName("img");
1519 		for (int i = 0; i < nl.getLength(); i++) {
1520 			final Element img = (Element) nl.item(i);
1521 			images.add(img.getAttribute("src"));
1522 		}
1523 		return images;
1524 	}
1525 
1526 	/**
1527 	 * @return Any subheadings in the article.
1528 	 */
1529 	public List<String> getArticleSubheadings() {
1530 		final List<String> subtitles = new ArrayList<String>();
1531 		if (articleContent == null)
1532 			return subtitles;
1533 
1534 		for (int j = 1; j <= 6; j++) {
1535 			final NodeList nl = articleContent.getElementsByTagName("h" + j);
1536 			if (nl.getLength() > 0) {
1537 				for (int i = 0; i < nl.getLength(); i++) {
1538 					subtitles.add(nl.item(i).getTextContent());
1539 				}
1540 				break;
1541 			}
1542 		}
1543 
1544 		if (subtitles.size() == 0) {
1545 			// try looking for other likely-looking elements
1546 
1547 			final NodeList nl = articleContent.getElementsByTagName("*");
1548 			for (int i = 0; i < nl.getLength(); i++) {
1549 				if (nl.item(i) instanceof Element &&
1550 						((Element) nl.item(i)).getAttribute("class") != null &&
1551 						search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1)
1552 					subtitles.add(nl.item(i).getTextContent());
1553 			}
1554 		}
1555 
1556 		return subtitles;
1557 	}
1558 
1559 	protected List<Node> findChildNodesWithName(Node parent, String name) {
1560 		final NodeList children = parent.getChildNodes();
1561 		final List<Node> results = new ArrayList<Node>();
1562 
1563 		for (int i = 0; i < children.getLength(); ++i) {
1564 			final Node child = children.item(i);
1565 			if (child == null)
1566 				continue;
1567 
1568 			final String nodeName = child.getNodeName();
1569 			if (nodeName == null)
1570 				continue;
1571 
1572 			if (nodeName.equals(name)) {
1573 				results.add(child);
1574 			}
1575 		}
1576 		return results;
1577 	}
1578 
1579 	protected int findChildNodeIndex(Node parent, Node childToFind)
1580 	{
1581 		for (int index = 0; index < parent.getChildNodes().getLength(); index++)
1582 			if (parent.getChildNodes().item(index) == childToFind)
1583 				return index;
1584 		return -1;
1585 	}
1586 
1587 	protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException {
1588 		final Node parend = walker.getCurrentNode();
1589 
1590 		if (parend.getNodeType() == Node.TEXT_NODE && parend.getParentNode().getAttributes().getNamedItem("id") != null)
1591 		{
1592 			if (parend.getTextContent().trim().length() > 0)
1593 			{
1594 				final int index = findChildNodeIndex(parend.getParentNode(), parend);
1595 				if (index != -1)
1596 				{
1597 					// square brackets are not valid XML/HTML identifier
1598 					// characters, so we can use them here
1599 					map.add(new MappingNode(
1600 							parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "[" + index + "]",
1601 							parend.getNodeValue()));
1602 
1603 					// System.out.println(
1604 					// "ELEMENT '"+parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue()
1605 					// + "["+index+"]"+"'");
1606 					// System.out.println( "VALUE:  '"+parend.getNodeValue()+"'"
1607 					// );
1608 				}
1609 			}
1610 		}
1611 
1612 		// traverse children:
1613 		for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
1614 			getArticleTextMapping(walker, map);
1615 		}
1616 
1617 		// return position to the current (level up):
1618 		walker.setCurrentNode(parend);
1619 	}
1620 
1621 	protected class MappingNode {
1622 		String id;
1623 		String text;
1624 
1625 		public MappingNode(String id, String text) {
1626 			this.id = id;
1627 			this.text = text;
1628 		}
1629 
1630 		public String getId() {
1631 			return id;
1632 		}
1633 
1634 		public String getText() {
1635 			return text;
1636 		}
1637 
1638 		@Override
1639 		public String toString() {
1640 			return "MappingNode(" + id + " -> " + text + ")";
1641 		}
1642 	}
1643 
1644 	/**
1645 	 * Get the mapping between bits of text in the dom & their xpaths
1646 	 *
1647 	 * @return mapping from xpath to text
1648 	 */
1649 	public List<MappingNode> getArticleTextMapping() {
1650 		if (articleContent == null)
1651 			return null;
1652 
1653 		final List<MappingNode> map = new ArrayList<MappingNode>();
1654 
1655 		final TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT
1656 				| NodeFilter.SHOW_ELEMENT, null, true);
1657 
1658 		getArticleTextMapping(walker, map);
1659 
1660 		return map;
1661 	}
1662 
1663 	/**
1664 	 * Convenience method to build a {@link Readability} instance from an html
1665 	 * string.
1666 	 *
1667 	 * @param html
1668 	 *            The html string
1669 	 * @return new {@link Readability} instance.
1670 	 * @throws SAXException
1671 	 * @throws IOException
1672 	 */
1673 	public static Readability getReadability(String html) throws SAXException, IOException {
1674 		return getReadability(html, false);
1675 	}
1676 
1677 	/**
1678 	 * Convenience method to build a {@link Readability} instance from an html
1679 	 * string.
1680 	 *
1681 	 * @param html
1682 	 *            The html string
1683 	 * @param addTitle
1684 	 *            Should the title be added to the generated article?
1685 	 * @return new {@link Readability} instance.
1686 	 * @throws SAXException
1687 	 * @throws IOException
1688 	 */
1689 	public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
1690 		final DOMParser parser = new DOMParser();
1691 		parser.parse(new InputSource(new StringReader(html)));
1692 
1693 		return new Readability(parser.getDocument(), false, addTitle);
1694 	}
1695 
1696 	/**
1697 	 * Testing
1698 	 *
1699 	 * @param argv
1700 	 * @throws Exception
1701 	 */
1702 	public static void main(String[] argv) throws Exception {
1703 		// URL input = new
1704 		// URL("file:///home/dd/Programming/Readability4J/t.html");
1705 		// URL input = new
1706 		// URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
1707 		final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
1708 		// URL input = new URL("http://euobserver.com/9/30465");
1709 		// URL input = new URL("http://euobserver.com/?aid=23383");
1710 		// URL input = new
1711 		// URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
1712 		// URL input = new URL("file:///Users/jsh2/Desktop/test.html");
1713 		// URL input = new
1714 		// URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
1715 		// URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
1716 		// URL input = new
1717 		// URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
1718 		// URL input = new
1719 		// URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
1720 		// URL input = new
1721 		// URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
1722 		// URL input = new URL("http://www.thebigproject.co.uk/news/");
1723 		// URL input = new
1724 		// URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
1725 		// URL input = new
1726 		// URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");
1727 
1728 		// URL input = new
1729 		// URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
1730 		// URL input = new
1731 		// URL("http://uk.mac.ign.com/articles/573/573319p1.html");
1732 		final DOMParser parser = new DOMParser();
1733 		parser.parse(new InputSource(input.openStream()));
1734 
1735 		final Readability r = new Readability(parser.getDocument(), true, true);
1736 
1737 		// System.out.println(r.getArticleTitle());
1738 		System.out.println(r.getArticleHTML());
1739 		// System.out.println(r.getAllLinks());
1740 		// System.out.println(r.getArticleText());
1741 
1742 		System.out.println();
1743 		System.out.println("***");
1744 		System.out.println();
1745 
1746 		for (final MappingNode s : r.getArticleTextMapping())
1747 			System.out.println(s);
1748 
1749 		// PrintStream out = new PrintStream("news-sites");
1750 		// for (Anchor anchor : r.getAllLinks()) {
1751 		// out.println(anchor.getHref() + "\t" + anchor.getText());
1752 		// }
1753 		// out.close();
1754 
1755 		System.out.println(r.getArticleImages());
1756 		// System.out.println(r.getArticleSubheadings());
1757 		// System.out.println(r.getArticleHTML());
1758 		// System.out.println(r.getArticleHTML_DOM());
1759 
1760 		// System.out.println(r.getArticleDateString());
1761 		// System.out.println(r.getArticleDate());
1762 	}
1763 }