1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  
16  
17  package org.openimaj.web.readability;
18  
19  import java.io.IOException;
20  import java.io.StringReader;
21  import java.net.URL;
22  import java.text.ParseException;
23  import java.text.SimpleDateFormat;
24  import java.util.ArrayList;
25  import java.util.Date;
26  import java.util.EnumSet;
27  import java.util.List;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.cyberneko.html.parsers.DOMFragmentParser;
32  import org.cyberneko.html.parsers.DOMParser;
33  import org.pojava.datetime.DateTime;
34  import org.w3c.dom.DOMException;
35  import org.w3c.dom.Document;
36  import org.w3c.dom.DocumentFragment;
37  import org.w3c.dom.Element;
38  import org.w3c.dom.Node;
39  import org.w3c.dom.NodeList;
40  import org.w3c.dom.bootstrap.DOMImplementationRegistry;
41  import org.w3c.dom.ls.DOMImplementationLS;
42  import org.w3c.dom.ls.LSSerializer;
43  import org.w3c.dom.traversal.DocumentTraversal;
44  import org.w3c.dom.traversal.NodeFilter;
45  import org.w3c.dom.traversal.TreeWalker;
46  import org.xml.sax.InputSource;
47  import org.xml.sax.SAXException;
48  
49  
50  
51  
52  
53  
54  
55  
56  
57  
58  
59  public class Readability
60  {
61  	
62  
63  
64  	protected static class Regexps {
65  
66  		public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner"; 
67  		public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
68  		public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
69  		public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
70  		public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
71  		public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
72  		public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
73  		public static String trimRe = "^\\s+|\\s+$";
74  		public static String normalizeRe = "\\s{2,}";
75  		public static String killBreaksRe = "(<br\\s*\\/?>(\\s| ?)*){1,}";
76  		public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";
77  
78  		public static String titleSeparatorRe = "\\|\\-\\/";
79  
80  		
81  		
82  		public static String likelySubheadCandidateRe = "(?i)cross-head";
83  	}
84  
/**
 * Switches for the extraction heuristics. All flags start enabled; init()
 * removes them one at a time when too little text was extracted.
 */
enum Flag {
	/** While set, grabArticle() drops elements whose class/id look unlike content. */
	FLAG_STRIP_UNLIKELYS,
	/** While set, getClassWeight() scores class/id names; otherwise it returns 0. */
	FLAG_WEIGHT_CLASSES
}
89  
90  	
91  
92  
/** Link density above which cleanHeaders() removes a header. */
public static float LINK_DENSITY_THRESHOLD = 0.33F;

/** The document being processed; it is modified in place. */
protected Document document;
/** Deep copy of the body element taken by init() before the document is altered. */
private Node bodyCache;
/** Heuristics that are currently enabled; starts with every flag set. */
protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);

/** Title chosen by findArticleTitle(). */
protected String articleTitle;
/** Extracted content, or null when too little text was found. */
protected Element articleContent;
/** Raw date text located by findArticleDate(). */
protected String article_date_string;
/** Parsed form of article_date_string; stays null when parsing fails. */
protected Date article_date;
/** Value of the Content-Type meta tag, if one was found. */
protected String article_contentType;

/** When true, dbg() writes diagnostics to System.err. */
protected boolean debug = false;

/** When true, init() prepends an h1 holding the title to the extracted content. */
protected boolean addTitle = false;
109 
110 	
111 
112 
113 
114 
115 
/**
 * Processes the given document with debugging output disabled and without
 * adding the title to the extracted content.
 *
 * @param document the document to process; it is modified in place
 */
public Readability(final Document document) {
	this(document, false, false);
}
119 
120 	
121 
122 
123 
124 
125 
126 
127 
128 
/**
 * Processes the given document without adding the title to the extracted
 * content.
 *
 * @param document the document to process; it is modified in place
 * @param debug    whether diagnostics are written to System.err
 */
public Readability(final Document document, final boolean debug) {
	this(document, debug, false);
}
132 
133 	
134 
135 
136 
137 
138 
139 
140 
141 
142 
143 
144 
/**
 * Stores the settings, gives every element an id via
 * {@link #augmentDocument(Document)} and then runs the extraction.
 *
 * @param document the document to process; it is modified in place
 * @param debug    whether diagnostics are written to System.err
 * @param addTitle whether an h1 with the title is prepended to the content
 */
public Readability(final Document document, final boolean debug, final boolean addTitle) {
	this.document = document;
	this.debug = debug;
	this.addTitle = addTitle;

	augmentDocument(document);
	init();
}
152 
153 	
154 
155 
156 
157 
158 
159 	public static void augmentDocument(Document document) {
160 		final DocumentTraversal traversal = (DocumentTraversal) document;
161 
162 		final TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
163 
164 		traverseLevel(walker, 0);
165 	}
166 
167 	private static int traverseLevel(TreeWalker walker, int counter) {
168 		
169 		final Node parend = walker.getCurrentNode();
170 
171 		if (parend instanceof Element) {
172 			if (((Element) parend).getAttribute("id").length() == 0) {
173 				((Element) parend).setAttribute("id", "gen-id-" + counter);
174 				counter++;
175 			}
176 		}
177 
178 		
179 		for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
180 			counter = traverseLevel(walker, counter);
181 		}
182 
183 		
184 		walker.setCurrentNode(parend);
185 
186 		return counter;
187 	}
188 
189 	protected void dbg(String s) {
190 		if (debug)
191 			System.err.println(s);
192 	}
193 
194 	protected String getTitle() {
195 		final NodeList l = document.getElementsByTagName("title");
196 
197 		if (l.getLength() == 0)
198 			return "";
199 
200 		return l.item(0).getTextContent();
201 	}
202 
203 	
204 
205 
206 
207 
208 
209 
210 	protected String[] match(String input, String regex) {
211 		final Matcher matcher = Pattern.compile(regex).matcher(input);
212 		final List<String> matches = new ArrayList<String>();
213 
214 		while (matcher.find()) {
215 			matches.add(matcher.group(0));
216 		}
217 
218 		return matches.toArray(new String[matches.size()]);
219 	}
220 
221 	
222 
223 
224 	public boolean hasContent() {
225 		return articleContent != null;
226 	}
227 
228 	
229 
230 
231 
232 
233 
234 
235 	protected int search(String input, String regex) {
236 		final Matcher matcher = Pattern.compile(regex).matcher(input);
237 
238 		if (!matcher.find())
239 			return -1;
240 		return matcher.start();
241 	}
242 
243 	protected void findArticleEncoding() {
244 		final NodeList nl = document.getElementsByTagName("meta");
245 		for (int j = 0; j < nl.getLength(); j++) {
246 			if (((Element) nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) {
247 				article_contentType = ((Element) nl.item(j)).getAttribute("content");
248 				return;
249 			}
250 		}
251 
252 	}
253 
254 	protected void findArticleDate() {
255 		
256 		
257 		NodeList nl = document.getElementsByTagName("meta");
258 		for (int j = 0; j < nl.getLength(); j++) {
259 			if (((Element) nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) {
260 				article_date_string = ((Element) nl.item(j)).getAttribute("content");
261 				article_date = DateTime.parse(article_date_string).toDate();
262 				return;
263 			}
264 			if (((Element) nl.item(j)).getAttribute("name").equals("DC.date.issued")) {
265 				article_date_string = ((Element) nl.item(j)).getAttribute("content");
266 				article_date = DateTime.parse(article_date_string).toDate();
267 				return;
268 			}
269 		}
270 
271 		
272 		
273 		nl = document.getElementsByTagName("time");
274 		for (int j = 0; j < nl.getLength(); j++) {
275 			if (((Element) nl.item(j)).getAttributeNode("pubdate") != null) {
276 				article_date_string = ((Element) nl.item(j)).getAttribute("datetime");
277 				article_date = DateTime.parse(article_date_string).toDate();
278 				return;
279 			}
280 		}
281 
282 		
283 		
284 		
285 		nl = document.getElementsByTagName("*");
286 		for (int j = 0; j < nl.getLength(); j++) {
287 			if ((((Element) nl.item(j)).getAttribute("class").contains("date") ||
288 					((Element) nl.item(j)).getAttribute("class").contains("Date")) &&
289 					!(((Element) nl.item(j)).getAttribute("class").contains("update") ||
290 							((Element) nl.item(j)).getAttribute("class").contains("Update")))
291 			{
292 				article_date_string = getInnerTextSep(nl.item(j)).trim();
293 				parseDate();
294 				return;
295 			}
296 		}
297 		for (int j = 0; j < nl.getLength(); j++) {
298 			if ((((Element) nl.item(j)).getAttribute("id").contains("date") ||
299 					((Element) nl.item(j)).getAttribute("id").contains("Date")) &&
300 					!(((Element) nl.item(j)).getAttribute("id").contains("update") ||
301 							((Element) nl.item(j)).getAttribute("id").contains("Update")))
302 			{
303 				article_date_string = getInnerTextSep(nl.item(j)).trim();
304 				parseDate();
305 				return;
306 			}
307 		}
308 
309 		
310 		nl = document.getElementsByTagName("*");
311 		for (int j = 0; j < nl.getLength(); j++) {
312 			final String text = nl.item(j).getTextContent();
313 
314 			if (text == null)
315 				continue;
316 
317 			final Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
318 			final Matcher m = p.matcher(text);
319 			if (m.find()) {
320 				article_date_string = m.group(1);
321 
322 				String cpy = article_date_string.replaceAll("th", "");
323 				cpy = cpy.replaceAll("st", "");
324 				cpy = cpy.replaceAll("nd", "");
325 				cpy = cpy.replaceAll("rd", "");
326 
327 				final SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy");
328 				try {
329 					article_date = sdf.parse(cpy);
330 				} catch (final ParseException e) {
331 				}
332 				return;
333 			}
334 		}
335 	}
336 
337 	@SuppressWarnings("deprecation")
338 	protected void parseDate() {
339 		if (article_date_string == null || article_date_string.trim().isEmpty())
340 			return;
341 
342 		if (article_date_string.contains("Today")) {
343 			try {
344 				final SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z");
345 				article_date = sdf.parse(article_date_string);
346 				final Date now = new Date();
347 				article_date.setDate(now.getDate());
348 				article_date.setMonth(now.getMonth());
349 				article_date.setYear(now.getYear());
350 			} catch (final ParseException e) {
351 			}
352 		} else {
353 			try {
354 				final SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy");
355 				article_date = sdf.parse(article_date_string);
356 			} catch (final ParseException e) {
357 				try {
358 					final SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z");
359 					article_date = sdf.parse(article_date_string);
360 				} catch (final ParseException ee) {
361 					try {
362 						final SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
363 						article_date = sdf.parse(article_date_string);
364 					} catch (final ParseException eee) {
365 						try {
366 							article_date = DateTime.parse(article_date_string).toDate();
367 						} catch (final IllegalArgumentException ie) {
368 						} catch (final java.lang.ArrayIndexOutOfBoundsException ie) {
369 							System.out.println(article_date_string);
370 						}
371 					}
372 				}
373 			}
374 		}
375 	}
376 
377 	
378 
379 
380 
381 
382 	protected String findArticleTitle() {
383 		String curTitle = "", origTitle = "";
384 
385 		curTitle = origTitle = getTitle();
386 
387 		
388 		final List<String> potentialTitles = new ArrayList<String>();
389 		for (int i = 1; i <= 6; i++) {
390 			final NodeList nl = document.getElementsByTagName("h" + i);
391 			if (nl.getLength() > 0) {
392 				for (int j = 0; j < nl.getLength(); j++)
393 					potentialTitles.add(nl.item(j).getTextContent().trim());
394 			}
395 		}
396 
397 		String potentialTitle = null;
398 		int score = 0;
399 		for (final String s : potentialTitles) {
400 			if (s.length() > score && curTitle.contains(s)) {
401 				potentialTitle = s;
402 				score = s.length();
403 			}
404 		}
405 		if (potentialTitle != null)
406 			return potentialTitle;
407 		
408 
409 		if (match(curTitle, " [" + Regexps.titleSeparatorRe + "]+ ").length > 0)
410 		{
411 			curTitle = origTitle.replaceAll("(.*) [" + Regexps.titleSeparatorRe + "]+ .*", "$1");
412 
413 			if (curTitle.split(" ").length < 3) {
414 				curTitle = origTitle.replaceAll("(?i)[^" + Regexps.titleSeparatorRe + "]*[" + Regexps.titleSeparatorRe
415 						+ "]+(.*)", "$1");
416 			}
417 		}
418 		else if (curTitle.indexOf(": ") != -1)
419 		{
420 			curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");
421 
422 			if (curTitle.split(" ").length < 3) {
423 				curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
424 			}
425 		}
426 		else if (curTitle.length() > 150 || curTitle.length() < 15)
427 		{
428 			final NodeList hOnes = document.getElementsByTagName("h1");
429 			if (hOnes.getLength() == 1)
430 			{
431 				curTitle = getInnerText((Element) hOnes.item(0));
432 			}
433 		}
434 
435 		curTitle = curTitle.replaceAll(Regexps.trimRe, "");
436 
437 		if (curTitle.split(" ").length <= 3) {
438 			curTitle = origTitle;
439 		}
440 
441 		return curTitle;
442 	}
443 
444 	
445 
446 
447 
448 
449 	protected Element getBody() {
450 		final NodeList nl = document.getElementsByTagName("body");
451 
452 		if (nl.getLength() == 0)
453 			return null;
454 		else
455 			return (Element) nl.item(0);
456 	}
457 
458 	
459 
460 
461 
462 
463 
464 
465 
466 
467 	protected void init() {
468 		if (getBody() != null && bodyCache == null) {
469 			bodyCache = getBody().cloneNode(true);
470 		}
471 
472 		findArticleDate(); 
473 
474 		findArticleEncoding();
475 
476 		prepDocument();
477 
478 		
479 		articleTitle = findArticleTitle();
480 		articleContent = grabArticle();
481 
482 		
483 
484 
485 
486 
487 
488 
489 		if (getInnerText(articleContent, false).length() < 250)
490 		{
491 			if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
492 				flags.remove(Flag.FLAG_STRIP_UNLIKELYS);
493 				getBody().getParentNode().replaceChild(bodyCache, getBody());
494 				init();
495 				return;
496 			}
497 			else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
498 				flags.remove(Flag.FLAG_WEIGHT_CLASSES);
499 				getBody().getParentNode().replaceChild(bodyCache, getBody());
500 				init();
501 				return;
502 			}
503 			else {
504 				articleContent = null;
505 			}
506 		}
507 
508 		if (addTitle && articleContent != null) {
509 			final Element titleNode = document.createElement("h1");
510 			titleNode.setAttribute("id", "title");
511 			titleNode.appendChild(document.createTextNode(getArticleTitle()));
512 			articleContent.insertBefore(titleNode, articleContent.getFirstChild());
513 		}
514 	}
515 
516 	
517 
518 
519 
520 
521 	protected void prepDocument() {
522 		
523 
524 
525 
526 
527 		if (getBody() == null)
528 		{
529 			final Node body = document.createElement("body");
530 			document.appendChild(body);
531 		}
532 
533 		
534 		
535 		
536 		
537 		
538 		
539 		
540 		
541 		
542 		
543 		
544 		
545 		
546 		
547 		
548 		
549 		
550 		
551 		
552 		
553 		
554 		
555 		
556 		
557 		
558 		
559 		
560 		
561 		
562 		
563 		
564 		
565 		
566 		
567 		
568 		
569 		
570 		
571 		
572 		
573 		
574 
575 		
576 		final NodeList scripts = document.getElementsByTagName("script");
577 		for (int i = scripts.getLength() - 1; i >= 0; i--)
578 		{
579 			scripts.item(i).getParentNode().removeChild(scripts.item(i));
580 		}
581 
582 		
583 		final NodeList styleTags = document.getElementsByTagName("style");
584 		for (int st = styleTags.getLength() - 1; st >= 0; st--) {
585 			styleTags.item(st).getParentNode().removeChild(styleTags.item(st));
586 		}
587 
588 		
589 		final NodeList metaTags = document.getElementsByTagName("meta");
590 		for (int mt = metaTags.getLength() - 1; mt >= 0; mt--) {
591 			metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt));
592 		}
593 
594 		
595 		
596 
597 
598 
599 		
600 		
601 		
602 		final Element body = getBody();
603 		
604 		
605 		
606 		
607 
608 		
609 		final Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(
610 				Regexps.replaceFontsRe, "<$1span>"));
611 		removeChildren(body);
612 		body.appendChild(frag);
613 
614 		
615 		removeComments(document);
616 	}
617 
618 	protected void removeComments(Node n) {
619 		if (n.getNodeType() == Node.COMMENT_NODE) {
620 			n.getParentNode().removeChild(n);
621 		} else {
622 			final NodeList nl = n.getChildNodes();
623 			for (int i = nl.getLength() - 1; i >= 0; i--)
624 				removeComments(nl.item(i));
625 		}
626 	}
627 
628 	
629 
630 
631 
632 
633 
634 
635 
636 	protected void prepArticle(Element articleContent) {
637 		cleanStyles(articleContent);
638 		killBreaks(articleContent);
639 
640 		
641 		clean(articleContent, "form");
642 		clean(articleContent, "object");
643 		clean(articleContent, "h1");
644 		
645 
646 
647 
648 		if (articleContent.getElementsByTagName("h2").getLength() == 1) {
649 			clean(articleContent, "h2");
650 		}
651 		clean(articleContent, "iframe");
652 
653 		cleanHeaders(articleContent);
654 
655 		
656 
657 
658 
659 		cleanConditionally(articleContent, "table");
660 		cleanConditionally(articleContent, "ul");
661 		cleanConditionally(articleContent, "div");
662 
663 		
664 		final NodeList articleParagraphs = articleContent.getElementsByTagName("p");
665 		for (int i = articleParagraphs.getLength() - 1; i >= 0; i--)
666 		{
667 			final int imgCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength();
668 			final int embedCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength();
669 			final int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength();
670 
671 			if (imgCount == 0 && embedCount == 0 && objectCount == 0
672 					&& getInnerText((Element) articleParagraphs.item(i), false) == "")
673 			{
674 				articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i));
675 			}
676 		}
677 
678 		
679 		
680 		final Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
681 		removeChildren(articleContent);
682 		articleContent.appendChild(n);
683 
684 		
685 		final NodeList nl = articleContent.getElementsByTagName("p");
686 		for (int i = nl.getLength() - 1; i >= 0; i--) {
687 			if (nl.item(i).getTextContent().trim().length() == 0)
688 			{
689 				nl.item(i).getParentNode().removeChild(nl.item(i));
690 			} else if (nl.item(i).getChildNodes().getLength() == 1
691 					&& nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE)
692 			{
693 				nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n");
694 			}
695 			else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled"))
696 			{
697 				nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i));
698 			}
699 		}
700 
701 	}
702 
703 	protected void removeChildren(Node n) {
704 		final NodeList nl = n.getChildNodes();
705 		final int nn = nl.getLength();
706 		for (int i = 0; i < nn; i++)
707 			n.removeChild(nl.item(0));
708 	}
709 
710 	
711 
712 
713 
714 
715 
716 	protected void initializeNode(Element node) {
717 		float contentScore = 0;
718 
719 		if (node.getTagName() == "DIV") {
720 			contentScore += 5;
721 		} else if (node.getTagName() == "PRE" || node.getTagName() == "TD" || node.getTagName() == "BLOCKQUOTE") {
722 			contentScore += 3;
723 		} else if (node.getTagName() == "ADDRESS" || node.getTagName() == "OL" || node.getTagName() == "UL"
724 				|| node.getTagName() == "DL" || node.getTagName() == "DD" || node.getTagName() == "DT"
725 				|| node.getTagName() == "LI" || node.getTagName() == "FORM")
726 		{
727 			contentScore -= 3;
728 		} else if (node.getTagName() == "H1" || node.getTagName() == "H2" || node.getTagName() == "H3"
729 				|| node.getTagName() == "H4" || node.getTagName() == "H5" || node.getTagName() == "H6"
730 				|| node.getTagName() == "TH")
731 		{
732 			contentScore -= 5;
733 		}
734 
735 		contentScore += getClassWeight(node);
736 		node.setUserData("readability", contentScore, null);
737 	}
738 
739 	
740 
741 
742 
743 
744 
745 
746 	protected int getClassWeight(Element e) {
747 		if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
748 			return 0;
749 		}
750 
751 		int weight = 0;
752 
753 		
754 		if (e.getAttribute("class") != "")
755 		{
756 			if (search(e.getAttribute("class"), Regexps.negativeRe) != -1) {
757 				weight -= 25;
758 			}
759 
760 			if (search(e.getAttribute("class"), Regexps.positiveRe) != -1) {
761 				weight += 25;
762 			}
763 		}
764 
765 		
766 		if (e.getAttribute("id") != "")
767 		{
768 			if (search(e.getAttribute("id"), Regexps.negativeRe) != -1) {
769 				weight -= 25;
770 			}
771 
772 			if (search(e.getAttribute("id"), Regexps.positiveRe) != -1) {
773 				weight += 25;
774 			}
775 		}
776 
777 		return weight;
778 	}
779 
780 	protected void cleanStyles() {
781 		cleanStyles((Element) document);
782 	}
783 
784 	
785 
786 
787 
788 
789 
790 	protected void cleanStyles(Element e) {
791 		if (e == null)
792 			return;
793 		Node cur = e.getFirstChild();
794 
795 		
796 		if (!e.getAttribute("class").equals("readability-styled"))
797 			e.removeAttribute("style");
798 
799 		
800 		while (cur != null) {
801 			if (cur.getNodeType() == Element.ELEMENT_NODE) {
802 				
803 				if (!((Element) cur).getAttribute("class").equals("readability-styled")) {
804 					((Element) cur).removeAttribute("style");
805 				}
806 				cleanStyles((Element) cur);
807 			}
808 			cur = cur.getNextSibling();
809 		}
810 	}
811 
812 	
813 
814 
815 
816 
817 	protected void killBreaks(Element e) {
818 		
819 		
820 
821 		final Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe, "<BR />"));
822 		removeChildren(e);
823 		e.appendChild(n);
824 	}
825 
826 	
827 
828 
829 
830 
831 
832 
833 
834 	protected void clean(Element e, String tag) {
835 		final NodeList targetList = e.getElementsByTagName(tag);
836 		final boolean isEmbed = (tag.equals("object") || tag.equals("embed"));
837 
838 		for (int y = targetList.getLength() - 1; y >= 0; y--) {
839 			
840 
841 
842 
843 			if (isEmbed) {
844 				String attributeValues = "";
845 				for (int i = 0, il = targetList.item(y).getAttributes().getLength(); i < il; i++) {
846 					attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|";
847 				}
848 
849 				
850 
851 
852 
853 				if (search(attributeValues, Regexps.videoRe) != -1) {
854 					continue;
855 				}
856 
857 				
858 				if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) {
859 					continue;
860 				}
861 			}
862 
863 			targetList.item(y).getParentNode().removeChild(targetList.item(y));
864 		}
865 	}
866 
867 	
868 
869 
870 
871 
872 
873 	protected void cleanHeaders(Element e) {
874 		for (int headerIndex = 1; headerIndex < 7; headerIndex++) {
875 			final NodeList headers = e.getElementsByTagName("h" + headerIndex);
876 			for (int i = headers.getLength() - 1; i >= 0; i--) {
877 				if (getClassWeight((Element) headers.item(i)) < 0
878 						|| getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD)
879 				{
880 					headers.item(i).getParentNode().removeChild(headers.item(i));
881 				}
882 			}
883 		}
884 	}
885 
886 	
887 
888 
889 
890 
891 
892 
893 
894 	protected float getLinkDensity(Element e) {
895 		final NodeList links = e.getElementsByTagName("a");
896 		final int textLength = getInnerText(e).length();
897 		int linkLength = 0;
898 
899 		for (int i = 0, il = links.getLength(); i < il; i++)
900 		{
901 			linkLength += getInnerText((Element) links.item(i)).length();
902 		}
903 
904 		if (linkLength == 0)
905 			return 0;
906 
907 		return (float) linkLength / (float) textLength;
908 	}
909 
910 	
911 
912 
913 
914 
915 	protected void cleanConditionally(Element e, String tag) {
916 		final NodeList tagsList = e.getElementsByTagName(tag);
917 		final int curTagsLength = tagsList.getLength();
918 
919 		
920 
921 
922 
923 
924 
925 
926 		for (int i = curTagsLength - 1; i >= 0; i--) {
927 			final int weight = getClassWeight((Element) tagsList.item(i));
928 			final float contentScore = (tagsList.item(i).getUserData("readability") != null) ? (Float) (tagsList.item(i)
929 					.getUserData("readability")) : 0;
930 
931 			dbg("Cleaning Conditionally "
932 					+ tagsList.item(i)
933 					+ " ("
934 					+ ((Element) tagsList.item(i)).getAttribute("class")
935 					+ ":"
936 					+ ((Element) tagsList.item(i)).getAttribute("id")
937 					+ ")"
938 					+ ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i)
939 							.getUserData("readability")) : ""));
940 
941 			if (weight + contentScore < 0)
942 			{
943 				dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
944 						+ ((Element) tagsList.item(i)).getAttribute("id") + ")");
945 				tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
946 			}
947 			else if (getCharCount((Element) tagsList.item(i), ",") < 10) {
948 				
949 
950 
951 
952 
953 				final int p = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength();
954 				final int img = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength();
955 				final int li = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength() - 100;
956 				final int input = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength();
957 
958 				int embedCount = 0;
959 				final NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed");
960 				for (int ei = 0, il = embeds.getLength(); ei < il; ei++) {
961 					if (search(((Element) embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) {
962 						embedCount++;
963 					}
964 				}
965 
966 				final float linkDensity = getLinkDensity((Element) tagsList.item(i));
967 				final int contentLength = getInnerText((Element) tagsList.item(i)).length();
968 				boolean toRemove = false;
969 
970 				if (img > p) {
971 					toRemove = true;
972 				} else if (li > p && tag != "ul" && tag != "ol") {
973 					toRemove = true;
974 				} else if (input > Math.floor(p / 3)) {
975 					toRemove = true;
976 				} else if (contentLength < 25 && (img == 0 || img > 2)) {
977 					toRemove = true;
978 				} else if (weight < 25 && linkDensity > 0.2) {
979 					toRemove = true;
980 				} else if (weight >= 25 && linkDensity > 0.5) {
981 					toRemove = true;
982 				} else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
983 					toRemove = true;
984 				}
985 
986 				if (img == 1 && p == 0 && contentLength == 0) {
987 					final Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0);
988 
989 					String w = "";
990 					if (theImg.getAttribute("width") != null)
991 						w = theImg.getAttribute("width");
992 
993 					String h = "";
994 					if (theImg.getAttribute("height") != null)
995 						h = theImg.getAttribute("height");
996 
997 					if (!(w.equals("0") || h.equals("0")))
998 						toRemove = false; 
999 					
1000 				}
1001 
1002 				if (toRemove) {
1003 					dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
1004 							+ ((Element) tagsList.item(i)).getAttribute("id") + ")");
1005 					tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
1006 				}
1007 			}
1008 		}
1009 	}
1010 
1011 	
1012 
1013 
1014 
1015 
1016 
1017 
1018 
1019 	protected int getCharCount(Element e, String s) {
1020 		return getInnerText(e).split(s).length - 1;
1021 	}
1022 
1023 	protected int getCharCount(Element e) {
1024 		return getCharCount(e, ",");
1025 	}
1026 
1027 	
1028 
1029 
1030 	public String getArticleTitle() {
1031 		return articleTitle;
1032 	}
1033 
1034 	
1035 
1036 
1037 	public String getArticleContentType() {
1038 		return article_contentType;
1039 	}
1040 
1041 	
1042 
1043 
1044 
1045 
1046 
1047 
1048 	protected Element grabArticle() {
1049 		final boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS);
1050 
1051 		
1052 
1053 
1054 
1055 
1056 
1057 
1058 
1059 
1060 
1061 		Element node = null;
1062 		final List<Element> nodesToScore = new ArrayList<Element>();
1063 		for (int nodeIndex = 0; (node = (Element) document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++)
1064 		{
1065 			
1066 			if (stripUnlikelyCandidates) {
1067 				final String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id");
1068 				if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 &&
1069 						search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 &&
1070 						!node.getTagName().equals("BODY"))
1071 				{
1072 					dbg("Removing unlikely candidate - " + unlikelyMatchString);
1073 					node.getParentNode().removeChild(node);
1074 					nodeIndex--;
1075 					continue;
1076 				}
1077 			}
1078 
1079 			if (node.getTagName().equals("P") || node.getTagName().equals("TD")) {
1080 				nodesToScore.add(node);
1081 			}
1082 
1083 			
1084 
1085 
1086 
1087 			if (node.getTagName().equals("DIV")) {
1088 
1089 				if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) {
1090 					dbg("Altering div to p");
1091 					final Element newNode = document.createElement("P");
1092 
1093 					
1094 					final NodeList nl = node.getChildNodes();
1095 					for (int i = 0; i < nl.getLength(); i++)
1096 						newNode.appendChild(nl.item(i));
1097 
1098 					node.getParentNode().replaceChild(newNode, node);
1099 					nodeIndex--;
1100 				}
1101 				else
1102 				{
1103 					
1104 					for (int i = 0, il = node.getChildNodes().getLength(); i < il; i++) {
1105 						final Node childNode = node.getChildNodes().item(i);
1106 						if (childNode.getNodeType() == Element.TEXT_NODE) {
1107 							dbg("replacing text node with a p tag with the same content.");
1108 							final Element p = document.createElement("p");
1109 							
1110 							p.setNodeValue(childNode.getNodeValue());
1111 							p.setTextContent(childNode.getTextContent());
1112 							
1113 							p.setAttribute("class", "readability-styled");
1114 							childNode.getParentNode().replaceChild(p, childNode);
1115 						}
1116 					}
1117 				}
1118 			}
1119 		}
1120 
1121 		
1122 
1123 
1124 
1125 
1126 
1127 
1128 		final List<Element> candidates = new ArrayList<Element>();
1129 		for (int pt = 0; pt < nodesToScore.size(); pt++) {
1130 			final Element parentNode = (Element) nodesToScore.get(pt).getParentNode();
1131 			final Element grandParentNode = (Element) parentNode.getParentNode();
1132 			final String innerText = getInnerText(nodesToScore.get(pt));
1133 
1134 			
1135 
1136 
1137 
1138 			if (innerText.length() < 25) {
1139 				continue;
1140 			}
1141 
1142 			
1143 			if (parentNode.getUserData("readability") == null)
1144 			{
1145 				initializeNode(parentNode);
1146 				candidates.add(parentNode);
1147 			}
1148 
1149 			
1150 			if (grandParentNode.getUserData("readability") == null)
1151 			{
1152 				initializeNode(grandParentNode);
1153 				candidates.add(grandParentNode);
1154 			}
1155 
1156 			float contentScore = 0;
1157 
1158 			
1159 			contentScore++;
1160 
1161 			
1162 			contentScore += innerText.split(",").length;
1163 
1164 			
1165 
1166 
1167 
1168 			contentScore += Math.min(Math.floor(innerText.length() / 100F), 3F);
1169 
1170 			
1171 			parentNode.setUserData("readability", ((Float) (parentNode.getUserData("readability")) + contentScore), null);
1172 			grandParentNode.setUserData("readability", ((Float) (grandParentNode.getUserData("readability")))
1173 					+ (contentScore / 2F), null);
1174 		}
1175 
1176 		
1177 
1178 
1179 
1180 		Element topCandidate = null;
1181 		for (int c = 0, cl = candidates.size(); c < cl; c++)
1182 		{
1183 			
1184 
1185 
1186 
1187 
1188 
1189 			candidates.get(c).setUserData("readability",
1190 					(Float) (candidates.get(c).getUserData("readability")) * (1F - getLinkDensity(candidates.get(c))),
1191 					null);
1192 
1193 			dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class") + ":"
1194 					+ candidates.get(c).getAttribute("id") + ") with score "
1195 					+ candidates.get(c).getUserData("readability"));
1196 
1197 			if (topCandidate == null
1198 					|| (Float) (candidates.get(c).getUserData("readability")) > ((Float) topCandidate
1199 							.getUserData("readability")))
1200 			{
1201 				topCandidate = candidates.get(c);
1202 			}
1203 		}
1204 
1205 		if (topCandidate != null)
1206 			dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class") + ":"
1207 					+ topCandidate.getAttribute("id") + ") with score " + topCandidate.getUserData("readability"));
1208 
1209 		
1210 
1211 
1212 
1213 
1214 		if (topCandidate == null || topCandidate.getTagName().equals("BODY"))
1215 		{
1216 			topCandidate = document.createElement("DIV");
1217 
1218 			
1219 			final NodeList nl = getBody().getChildNodes();
1220 			for (int i = 0; i < nl.getLength(); i++)
1221 				topCandidate.appendChild(nl.item(i));
1222 			
1223 
1224 			getBody().appendChild(topCandidate);
1225 			initializeNode(topCandidate);
1226 		}
1227 
1228 		
1229 
1230 
1231 
1232 
1233 		final Element articleContent = document.createElement("DIV");
1234 		articleContent.setAttribute("id", "readability-content");
1235 		final float siblingScoreThreshold = Math.max(10F, (Float) topCandidate.getUserData("readability") * 0.2F);
1236 		final NodeList siblingNodes = topCandidate.getParentNode().getChildNodes();
1237 
1238 		for (int s = 0, sl = siblingNodes.getLength(); s < sl; s++)
1239 		{
1240 			final Node siblingNode = siblingNodes.item(s);
1241 			boolean append = false;
1242 
1243 			if (siblingNode instanceof Element)
1244 				dbg("Looking at sibling node: "
1245 						+ siblingNode
1246 						+ " ("
1247 						+ ((Element) siblingNode).getAttribute("class")
1248 						+ ":"
1249 						+ ((Element) siblingNode).getAttribute("id")
1250 						+ ")"
1251 						+ ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode
1252 								.getUserData("readability")) : ""));
1253 			dbg("Sibling has score "
1254 					+ (siblingNode.getUserData("readability") != null ? siblingNode.getUserData("readability")
1255 							: "Unknown"));
1256 
1257 			if (siblingNode == topCandidate)
1258 			{
1259 				append = true;
1260 			}
1261 
1262 			float contentBonus = 0;
1263 			
1264 
1265 
1266 
1267 			if (siblingNode instanceof Element
1268 					&& ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class"))
1269 					&& !topCandidate.getAttribute("class").equals(""))
1270 			{
1271 				contentBonus += (Float) topCandidate.getUserData("readability") * 0.2F;
1272 			}
1273 
1274 			if (siblingNode.getUserData("readability") != null
1275 					&& ((Float) siblingNode.getUserData("readability") + contentBonus) >= siblingScoreThreshold)
1276 			{
1277 				append = true;
1278 			}
1279 
1280 			if (siblingNode.getNodeName().equals("P")) {
1281 				final float linkDensity = getLinkDensity((Element) siblingNode);
1282 				final String nodeContent = getInnerText((Element) siblingNode);
1283 				final int nodeLength = nodeContent.length();
1284 
1285 				if (nodeLength > 80 && linkDensity < 0.25)
1286 				{
1287 					append = true;
1288 				}
1289 				else if (nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1)
1290 				{
1291 					append = true;
1292 				}
1293 			}
1294 
1295 			if (append)
1296 			{
1297 				dbg("Appending node: " + siblingNode);
1298 
1299 				Node nodeToAppend = null;
1300 				if (!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) {
1301 					
1302 
1303 
1304 
1305 
1306 
1307 					dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div.");
1308 					nodeToAppend = document.createElement("div");
1309 					if (siblingNode instanceof Element)
1310 						((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id"));
1311 
1312 					
1313 					final NodeList nl = siblingNode.getChildNodes();
1314 					for (int i = 0; i < nl.getLength(); i++)
1315 						nodeToAppend.appendChild(nl.item(i));
1316 				} else {
1317 					nodeToAppend = siblingNode;
1318 					s--;
1319 					sl--;
1320 				}
1321 
1322 				
1323 
1324 
1325 
1326 				if (nodeToAppend instanceof Element)
1327 					((Element) nodeToAppend).setAttribute("class", "");
1328 
1329 				
1330 
1331 
1332 
1333 				articleContent.appendChild(nodeToAppend);
1334 			}
1335 		}
1336 
1337 		
1338 
1339 
1340 
1341 		prepArticle(articleContent);
1342 
1343 		return articleContent;
1344 	}
1345 
1346 	protected String getInnerHTML(Node n) {
1347 		if (n.getNodeType() == Node.TEXT_NODE)
1348 			return n.getTextContent();
1349 
1350 		String result = "";
1351 		final NodeList nl = n.getChildNodes();
1352 		for (int i = 0; i < nl.getLength(); i++) {
1353 			if (nl.item(i).getNodeType() == Node.TEXT_NODE)
1354 				result += nl.item(i).getTextContent();
1355 			else if (nl.item(i).getNodeType() == Node.COMMENT_NODE)
1356 				result += "<!-- " + nl.item(i).getTextContent() + " -->";
1357 			else
1358 				result += nodeToString(nl.item(i));
1359 		}
1360 
1361 		return result;
1362 	}
1363 
1364 	protected String nodeToString(Node n) {
1365 		return nodeToString(n, false);
1366 	}
1367 
1368 	protected static String nodeToString(Node n, boolean pretty) {
1369 		try {
1370 			final DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
1371 			final DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS");
1372 			final LSSerializer writer = impl.createLSSerializer();
1373 
1374 			writer.getDomConfig().setParameter("xml-declaration", false);
1375 			if (pretty) {
1376 				writer.getDomConfig().setParameter("format-pretty-print", true);
1377 			}
1378 
1379 			return writer.writeToString(n);
1380 		} catch (final Exception e) {
1381 			throw new RuntimeException(e);
1382 		}
1383 	}
1384 
1385 	protected Node stringToNode(String str) {
1386 		try {
1387 			final DOMFragmentParser parser = new DOMFragmentParser();
1388 			final DocumentFragment fragment = document.createDocumentFragment();
1389 			parser.parse(new InputSource(new StringReader(str)), fragment);
1390 			return fragment;
1391 
1392 			
1393 			
1394 			
1395 			
1396 			
1397 
1398 		} catch (final Exception e) {
1399 			throw new RuntimeException(e);
1400 		}
1401 	}
1402 
1403 	
1404 
1405 
1406 
1407 
1408 
1409 
1410 	protected String getInnerText(Element e, boolean normalizeSpaces) {
1411 		String textContent = "";
1412 
1413 		textContent = e.getTextContent().replaceAll(Regexps.trimRe, "");
1414 
1415 		if (normalizeSpaces) {
1416 			return textContent.replaceAll(Regexps.normalizeRe, " ");
1417 		} else {
1418 			return textContent;
1419 		}
1420 	}
1421 
1422 	protected String getInnerTextSep(Node e) {
1423 		if (e.hasChildNodes()) {
1424 			String s = "";
1425 			final NodeList nl = e.getChildNodes();
1426 			for (int i = 0; i < nl.getLength(); i++) {
1427 				if (!nl.item(i).getNodeName().equalsIgnoreCase("script"))
1428 					s += getInnerTextSep(nl.item(i));
1429 			}
1430 			return s;
1431 		} else {
1432 			return e.getTextContent() + " ";
1433 		}
1434 	}
1435 
1436 	protected String getInnerText(Element e) {
1437 		return getInnerText(e, true);
1438 	}
1439 
1440 	
1441 
1442 
1443 	public String getArticleHTML() {
1444 		if (articleContent == null)
1445 			return "";
1446 		return nodeToString(articleContent, true);
1447 	}
1448 
1449 	
1450 
1451 
1452 	public Node getArticleHTML_DOM() {
1453 		return articleContent;
1454 	}
1455 
1456 	protected String getArticleDateString() {
1457 		return article_date_string;
1458 	}
1459 
1460 	
1461 
1462 
1463 	public Date getArticleDate() {
1464 		return article_date;
1465 	}
1466 
1467 	
1468 
1469 
1470 	public String getArticleText() {
1471 		if (articleContent == null)
1472 			return "Unable to find article content";
1473 		
1474 		return articleContent.getTextContent().trim().replaceAll("[\r|\n|\r\n]{2,}", "\n\n").replaceAll(" {2,}", " ");
1475 	}
1476 
1477 	
1478 
1479 
1480 	public List<Anchor> getArticleLinks() {
1481 		final List<Anchor> anchors = new ArrayList<Anchor>();
1482 		if (articleContent == null)
1483 			return anchors;
1484 
1485 		final NodeList nl = articleContent.getElementsByTagName("a");
1486 		for (int i = 0; i < nl.getLength(); i++) {
1487 			final Element a = (Element) nl.item(i);
1488 
1489 			final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1490 			anchors.add(anchor);
1491 		}
1492 		return anchors;
1493 	}
1494 
1495 	
1496 
1497 
1498 	public List<Anchor> getAllLinks() {
1499 		final List<Anchor> anchors = new ArrayList<Anchor>();
1500 
1501 		final NodeList nl = document.getElementsByTagName("a");
1502 		for (int i = 0; i < nl.getLength(); i++) {
1503 			final Element a = (Element) nl.item(i);
1504 			final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1505 			anchors.add(anchor);
1506 		}
1507 		return anchors;
1508 	}
1509 
1510 	
1511 
1512 
1513 	public List<String> getArticleImages() {
1514 		final List<String> images = new ArrayList<String>();
1515 		if (articleContent == null)
1516 			return images;
1517 
1518 		final NodeList nl = articleContent.getElementsByTagName("img");
1519 		for (int i = 0; i < nl.getLength(); i++) {
1520 			final Element img = (Element) nl.item(i);
1521 			images.add(img.getAttribute("src"));
1522 		}
1523 		return images;
1524 	}
1525 
1526 	
1527 
1528 
1529 	public List<String> getArticleSubheadings() {
1530 		final List<String> subtitles = new ArrayList<String>();
1531 		if (articleContent == null)
1532 			return subtitles;
1533 
1534 		for (int j = 1; j <= 6; j++) {
1535 			final NodeList nl = articleContent.getElementsByTagName("h" + j);
1536 			if (nl.getLength() > 0) {
1537 				for (int i = 0; i < nl.getLength(); i++) {
1538 					subtitles.add(nl.item(i).getTextContent());
1539 				}
1540 				break;
1541 			}
1542 		}
1543 
1544 		if (subtitles.size() == 0) {
1545 			
1546 
1547 			final NodeList nl = articleContent.getElementsByTagName("*");
1548 			for (int i = 0; i < nl.getLength(); i++) {
1549 				if (nl.item(i) instanceof Element &&
1550 						((Element) nl.item(i)).getAttribute("class") != null &&
1551 						search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1)
1552 					subtitles.add(nl.item(i).getTextContent());
1553 			}
1554 		}
1555 
1556 		return subtitles;
1557 	}
1558 
1559 	protected List<Node> findChildNodesWithName(Node parent, String name) {
1560 		final NodeList children = parent.getChildNodes();
1561 		final List<Node> results = new ArrayList<Node>();
1562 
1563 		for (int i = 0; i < children.getLength(); ++i) {
1564 			final Node child = children.item(i);
1565 			if (child == null)
1566 				continue;
1567 
1568 			final String nodeName = child.getNodeName();
1569 			if (nodeName == null)
1570 				continue;
1571 
1572 			if (nodeName.equals(name)) {
1573 				results.add(child);
1574 			}
1575 		}
1576 		return results;
1577 	}
1578 
1579 	protected int findChildNodeIndex(Node parent, Node childToFind)
1580 	{
1581 		for (int index = 0; index < parent.getChildNodes().getLength(); index++)
1582 			if (parent.getChildNodes().item(index) == childToFind)
1583 				return index;
1584 		return -1;
1585 	}
1586 
1587 	protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException {
1588 		final Node parend = walker.getCurrentNode();
1589 
1590 		if (parend.getNodeType() == Node.TEXT_NODE && parend.getParentNode().getAttributes().getNamedItem("id") != null)
1591 		{
1592 			if (parend.getTextContent().trim().length() > 0)
1593 			{
1594 				final int index = findChildNodeIndex(parend.getParentNode(), parend);
1595 				if (index != -1)
1596 				{
1597 					
1598 					
1599 					map.add(new MappingNode(
1600 							parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "[" + index + "]",
1601 							parend.getNodeValue()));
1602 
1603 					
1604 					
1605 					
1606 					
1607 					
1608 				}
1609 			}
1610 		}
1611 
1612 		
1613 		for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
1614 			getArticleTextMapping(walker, map);
1615 		}
1616 
1617 		
1618 		walker.setCurrentNode(parend);
1619 	}
1620 
1621 	protected class MappingNode {
1622 		String id;
1623 		String text;
1624 
1625 		public MappingNode(String id, String text) {
1626 			this.id = id;
1627 			this.text = text;
1628 		}
1629 
1630 		public String getId() {
1631 			return id;
1632 		}
1633 
1634 		public String getText() {
1635 			return text;
1636 		}
1637 
1638 		@Override
1639 		public String toString() {
1640 			return "MappingNode(" + id + " -> " + text + ")";
1641 		}
1642 	}
1643 
1644 	
1645 
1646 
1647 
1648 
1649 	public List<MappingNode> getArticleTextMapping() {
1650 		if (articleContent == null)
1651 			return null;
1652 
1653 		final List<MappingNode> map = new ArrayList<MappingNode>();
1654 
1655 		final TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT
1656 				| NodeFilter.SHOW_ELEMENT, null, true);
1657 
1658 		getArticleTextMapping(walker, map);
1659 
1660 		return map;
1661 	}
1662 
1663 	
1664 
1665 
1666 
1667 
1668 
1669 
1670 
1671 
1672 
1673 	public static Readability getReadability(String html) throws SAXException, IOException {
1674 		return getReadability(html, false);
1675 	}
1676 
1677 	
1678 
1679 
1680 
1681 
1682 
1683 
1684 
1685 
1686 
1687 
1688 
1689 	public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
1690 		final DOMParser parser = new DOMParser();
1691 		parser.parse(new InputSource(new StringReader(html)));
1692 
1693 		return new Readability(parser.getDocument(), false, addTitle);
1694 	}
1695 
1696 	
1697 
1698 
1699 
1700 
1701 
1702 	public static void main(String[] argv) throws Exception {
1703 		
1704 		
1705 		
1706 		
1707 		final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
1708 		
1709 		
1710 		
1711 		
1712 		
1713 		
1714 		
1715 		
1716 		
1717 		
1718 		
1719 		
1720 		
1721 		
1722 		
1723 		
1724 		
1725 		
1726 		
1727 
1728 		
1729 		
1730 		
1731 		
1732 		final DOMParser parser = new DOMParser();
1733 		parser.parse(new InputSource(input.openStream()));
1734 
1735 		final Readability r = new Readability(parser.getDocument(), true, true);
1736 
1737 		
1738 		System.out.println(r.getArticleHTML());
1739 		
1740 		
1741 
1742 		System.out.println();
1743 		System.out.println("***");
1744 		System.out.println();
1745 
1746 		for (final MappingNode s : r.getArticleTextMapping())
1747 			System.out.println(s);
1748 
1749 		
1750 		
1751 		
1752 		
1753 		
1754 
1755 		System.out.println(r.getArticleImages());
1756 		
1757 		
1758 		
1759 
1760 		
1761 		
1762 	}
1763 }