1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.openimaj.web.readability;
18
19 import java.io.IOException;
20 import java.io.StringReader;
21 import java.net.URL;
22 import java.text.ParseException;
23 import java.text.SimpleDateFormat;
24 import java.util.ArrayList;
25 import java.util.Date;
26 import java.util.EnumSet;
27 import java.util.List;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.cyberneko.html.parsers.DOMFragmentParser;
32 import org.cyberneko.html.parsers.DOMParser;
33 import org.pojava.datetime.DateTime;
34 import org.w3c.dom.DOMException;
35 import org.w3c.dom.Document;
36 import org.w3c.dom.DocumentFragment;
37 import org.w3c.dom.Element;
38 import org.w3c.dom.Node;
39 import org.w3c.dom.NodeList;
40 import org.w3c.dom.bootstrap.DOMImplementationRegistry;
41 import org.w3c.dom.ls.DOMImplementationLS;
42 import org.w3c.dom.ls.LSSerializer;
43 import org.w3c.dom.traversal.DocumentTraversal;
44 import org.w3c.dom.traversal.NodeFilter;
45 import org.w3c.dom.traversal.TreeWalker;
46 import org.xml.sax.InputSource;
47 import org.xml.sax.SAXException;
48
49
50
51
52
53
54
55
56
57
58
59 public class Readability
60 {
61
62
63
64 protected static class Regexps {
65
66 public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner";
67 public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
68 public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
69 public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
70 public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
71 public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
72 public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
73 public static String trimRe = "^\\s+|\\s+$";
74 public static String normalizeRe = "\\s{2,}";
75 public static String killBreaksRe = "(<br\\s*\\/?>(\\s| ?)*){1,}";
76 public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";
77
78 public static String titleSeparatorRe = "\\|\\-\\/";
79
80
81
82 public static String likelySubheadCandidateRe = "(?i)cross-head";
83 }
84
/**
 * Optional steps of the extraction. init() starts with all of them enabled
 * and removes them one at a time when too little text was extracted.
 */
enum Flag {
    /** Drop elements whose class/id matches the "unlikely candidate" expression. */
    FLAG_STRIP_UNLIKELYS,

    /** Let class and id names contribute to an element's weight. */
    FLAG_WEIGHT_CLASSES;
}
89
90
91
92
93 public static float LINK_DENSITY_THRESHOLD = 0.33F;
94
95
96 protected Document document;
97 private Node bodyCache;
98 protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);
99
100 protected String articleTitle;
101 protected Element articleContent;
102 protected String article_date_string;
103 protected Date article_date;
104 protected String article_contentType;
105
106 protected boolean debug = false;
107
108 protected boolean addTitle = false;
109
110
111
112
113
114
115
/**
 * Creates an extractor with debugging output disabled and no title element
 * added to the result.
 *
 * @param document
 *            the DOM to analyse; it is modified in place
 */
public Readability(Document document) {
    this(document, false);
}
119
120
121
122
123
124
125
126
127
128
/**
 * Creates an extractor that does not add a title element to the result.
 *
 * @param document
 *            the DOM to analyse; it is modified in place
 * @param debug
 *            true to write diagnostics to System.err
 */
public Readability(Document document, boolean debug) {
    this(document, debug, false);
}
132
133
134
135
136
137
138
139
140
141
142
143
144
/**
 * Creates an extractor and immediately runs the extraction: the document is
 * first given generated ids ({@link #augmentDocument(Document)}) and then
 * processed by {@link #init()}.
 *
 * @param document
 *            the DOM to analyse; it is modified in place
 * @param debug
 *            true to write diagnostics to System.err
 * @param addTitle
 *            true to prepend an H1 with the article title to the extracted
 *            content
 */
public Readability(Document document, boolean debug, boolean addTitle) {
    this.document = document;
    this.debug = debug;
    this.addTitle = addTitle;

    augmentDocument(document);
    init();
}
152
153
154
155
156
157
158
159 public static void augmentDocument(Document document) {
160 final DocumentTraversal traversal = (DocumentTraversal) document;
161
162 final TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
163
164 traverseLevel(walker, 0);
165 }
166
167 private static int traverseLevel(TreeWalker walker, int counter) {
168
169 final Node parend = walker.getCurrentNode();
170
171 if (parend instanceof Element) {
172 if (((Element) parend).getAttribute("id").length() == 0) {
173 ((Element) parend).setAttribute("id", "gen-id-" + counter);
174 counter++;
175 }
176 }
177
178
179 for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
180 counter = traverseLevel(walker, counter);
181 }
182
183
184 walker.setCurrentNode(parend);
185
186 return counter;
187 }
188
189 protected void dbg(String s) {
190 if (debug)
191 System.err.println(s);
192 }
193
194 protected String getTitle() {
195 final NodeList l = document.getElementsByTagName("title");
196
197 if (l.getLength() == 0)
198 return "";
199
200 return l.item(0).getTextContent();
201 }
202
203
204
205
206
207
208
209
210 protected String[] match(String input, String regex) {
211 final Matcher matcher = Pattern.compile(regex).matcher(input);
212 final List<String> matches = new ArrayList<String>();
213
214 while (matcher.find()) {
215 matches.add(matcher.group(0));
216 }
217
218 return matches.toArray(new String[matches.size()]);
219 }
220
221
222
223
224 public boolean hasContent() {
225 return articleContent != null;
226 }
227
228
229
230
231
232
233
234
235 protected int search(String input, String regex) {
236 final Matcher matcher = Pattern.compile(regex).matcher(input);
237
238 if (!matcher.find())
239 return -1;
240 return matcher.start();
241 }
242
243 protected void findArticleEncoding() {
244 final NodeList nl = document.getElementsByTagName("meta");
245 for (int j = 0; j < nl.getLength(); j++) {
246 if (((Element) nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) {
247 article_contentType = ((Element) nl.item(j)).getAttribute("content");
248 return;
249 }
250 }
251
252 }
253
254 protected void findArticleDate() {
255
256
257 NodeList nl = document.getElementsByTagName("meta");
258 for (int j = 0; j < nl.getLength(); j++) {
259 if (((Element) nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) {
260 article_date_string = ((Element) nl.item(j)).getAttribute("content");
261 article_date = DateTime.parse(article_date_string).toDate();
262 return;
263 }
264 if (((Element) nl.item(j)).getAttribute("name").equals("DC.date.issued")) {
265 article_date_string = ((Element) nl.item(j)).getAttribute("content");
266 article_date = DateTime.parse(article_date_string).toDate();
267 return;
268 }
269 }
270
271
272
273 nl = document.getElementsByTagName("time");
274 for (int j = 0; j < nl.getLength(); j++) {
275 if (((Element) nl.item(j)).getAttributeNode("pubdate") != null) {
276 article_date_string = ((Element) nl.item(j)).getAttribute("datetime");
277 article_date = DateTime.parse(article_date_string).toDate();
278 return;
279 }
280 }
281
282
283
284
285 nl = document.getElementsByTagName("*");
286 for (int j = 0; j < nl.getLength(); j++) {
287 if ((((Element) nl.item(j)).getAttribute("class").contains("date") ||
288 ((Element) nl.item(j)).getAttribute("class").contains("Date")) &&
289 !(((Element) nl.item(j)).getAttribute("class").contains("update") ||
290 ((Element) nl.item(j)).getAttribute("class").contains("Update")))
291 {
292 article_date_string = getInnerTextSep(nl.item(j)).trim();
293 parseDate();
294 return;
295 }
296 }
297 for (int j = 0; j < nl.getLength(); j++) {
298 if ((((Element) nl.item(j)).getAttribute("id").contains("date") ||
299 ((Element) nl.item(j)).getAttribute("id").contains("Date")) &&
300 !(((Element) nl.item(j)).getAttribute("id").contains("update") ||
301 ((Element) nl.item(j)).getAttribute("id").contains("Update")))
302 {
303 article_date_string = getInnerTextSep(nl.item(j)).trim();
304 parseDate();
305 return;
306 }
307 }
308
309
310 nl = document.getElementsByTagName("*");
311 for (int j = 0; j < nl.getLength(); j++) {
312 final String text = nl.item(j).getTextContent();
313
314 if (text == null)
315 continue;
316
317 final Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
318 final Matcher m = p.matcher(text);
319 if (m.find()) {
320 article_date_string = m.group(1);
321
322 String cpy = article_date_string.replaceAll("th", "");
323 cpy = cpy.replaceAll("st", "");
324 cpy = cpy.replaceAll("nd", "");
325 cpy = cpy.replaceAll("rd", "");
326
327 final SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy");
328 try {
329 article_date = sdf.parse(cpy);
330 } catch (final ParseException e) {
331 }
332 return;
333 }
334 }
335 }
336
337 @SuppressWarnings("deprecation")
338 protected void parseDate() {
339 if (article_date_string == null || article_date_string.trim().isEmpty())
340 return;
341
342 if (article_date_string.contains("Today")) {
343 try {
344 final SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z");
345 article_date = sdf.parse(article_date_string);
346 final Date now = new Date();
347 article_date.setDate(now.getDate());
348 article_date.setMonth(now.getMonth());
349 article_date.setYear(now.getYear());
350 } catch (final ParseException e) {
351 }
352 } else {
353 try {
354 final SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy");
355 article_date = sdf.parse(article_date_string);
356 } catch (final ParseException e) {
357 try {
358 final SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z");
359 article_date = sdf.parse(article_date_string);
360 } catch (final ParseException ee) {
361 try {
362 final SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
363 article_date = sdf.parse(article_date_string);
364 } catch (final ParseException eee) {
365 try {
366 article_date = DateTime.parse(article_date_string).toDate();
367 } catch (final IllegalArgumentException ie) {
368 } catch (final java.lang.ArrayIndexOutOfBoundsException ie) {
369 System.out.println(article_date_string);
370 }
371 }
372 }
373 }
374 }
375 }
376
377
378
379
380
381
382 protected String findArticleTitle() {
383 String curTitle = "", origTitle = "";
384
385 curTitle = origTitle = getTitle();
386
387
388 final List<String> potentialTitles = new ArrayList<String>();
389 for (int i = 1; i <= 6; i++) {
390 final NodeList nl = document.getElementsByTagName("h" + i);
391 if (nl.getLength() > 0) {
392 for (int j = 0; j < nl.getLength(); j++)
393 potentialTitles.add(nl.item(j).getTextContent().trim());
394 }
395 }
396
397 String potentialTitle = null;
398 int score = 0;
399 for (final String s : potentialTitles) {
400 if (s.length() > score && curTitle.contains(s)) {
401 potentialTitle = s;
402 score = s.length();
403 }
404 }
405 if (potentialTitle != null)
406 return potentialTitle;
407
408
409 if (match(curTitle, " [" + Regexps.titleSeparatorRe + "]+ ").length > 0)
410 {
411 curTitle = origTitle.replaceAll("(.*) [" + Regexps.titleSeparatorRe + "]+ .*", "$1");
412
413 if (curTitle.split(" ").length < 3) {
414 curTitle = origTitle.replaceAll("(?i)[^" + Regexps.titleSeparatorRe + "]*[" + Regexps.titleSeparatorRe
415 + "]+(.*)", "$1");
416 }
417 }
418 else if (curTitle.indexOf(": ") != -1)
419 {
420 curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");
421
422 if (curTitle.split(" ").length < 3) {
423 curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
424 }
425 }
426 else if (curTitle.length() > 150 || curTitle.length() < 15)
427 {
428 final NodeList hOnes = document.getElementsByTagName("h1");
429 if (hOnes.getLength() == 1)
430 {
431 curTitle = getInnerText((Element) hOnes.item(0));
432 }
433 }
434
435 curTitle = curTitle.replaceAll(Regexps.trimRe, "");
436
437 if (curTitle.split(" ").length <= 3) {
438 curTitle = origTitle;
439 }
440
441 return curTitle;
442 }
443
444
445
446
447
448
449 protected Element getBody() {
450 final NodeList nl = document.getElementsByTagName("body");
451
452 if (nl.getLength() == 0)
453 return null;
454 else
455 return (Element) nl.item(0);
456 }
457
458
459
460
461
462
463
464
465
466
467 protected void init() {
468 if (getBody() != null && bodyCache == null) {
469 bodyCache = getBody().cloneNode(true);
470 }
471
472 findArticleDate();
473
474 findArticleEncoding();
475
476 prepDocument();
477
478
479 articleTitle = findArticleTitle();
480 articleContent = grabArticle();
481
482
483
484
485
486
487
488
489 if (getInnerText(articleContent, false).length() < 250)
490 {
491 if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
492 flags.remove(Flag.FLAG_STRIP_UNLIKELYS);
493 getBody().getParentNode().replaceChild(bodyCache, getBody());
494 init();
495 return;
496 }
497 else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
498 flags.remove(Flag.FLAG_WEIGHT_CLASSES);
499 getBody().getParentNode().replaceChild(bodyCache, getBody());
500 init();
501 return;
502 }
503 else {
504 articleContent = null;
505 }
506 }
507
508 if (addTitle && articleContent != null) {
509 final Element titleNode = document.createElement("h1");
510 titleNode.setAttribute("id", "title");
511 titleNode.appendChild(document.createTextNode(getArticleTitle()));
512 articleContent.insertBefore(titleNode, articleContent.getFirstChild());
513 }
514 }
515
516
517
518
519
520
521 protected void prepDocument() {
522
523
524
525
526
527 if (getBody() == null)
528 {
529 final Node body = document.createElement("body");
530 document.appendChild(body);
531 }
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576 final NodeList scripts = document.getElementsByTagName("script");
577 for (int i = scripts.getLength() - 1; i >= 0; i--)
578 {
579 scripts.item(i).getParentNode().removeChild(scripts.item(i));
580 }
581
582
583 final NodeList styleTags = document.getElementsByTagName("style");
584 for (int st = styleTags.getLength() - 1; st >= 0; st--) {
585 styleTags.item(st).getParentNode().removeChild(styleTags.item(st));
586 }
587
588
589 final NodeList metaTags = document.getElementsByTagName("meta");
590 for (int mt = metaTags.getLength() - 1; mt >= 0; mt--) {
591 metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt));
592 }
593
594
595
596
597
598
599
600
601
602 final Element body = getBody();
603
604
605
606
607
608
609 final Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(
610 Regexps.replaceFontsRe, "<$1span>"));
611 removeChildren(body);
612 body.appendChild(frag);
613
614
615 removeComments(document);
616 }
617
618 protected void removeComments(Node n) {
619 if (n.getNodeType() == Node.COMMENT_NODE) {
620 n.getParentNode().removeChild(n);
621 } else {
622 final NodeList nl = n.getChildNodes();
623 for (int i = nl.getLength() - 1; i >= 0; i--)
624 removeComments(nl.item(i));
625 }
626 }
627
628
629
630
631
632
633
634
635
636 protected void prepArticle(Element articleContent) {
637 cleanStyles(articleContent);
638 killBreaks(articleContent);
639
640
641 clean(articleContent, "form");
642 clean(articleContent, "object");
643 clean(articleContent, "h1");
644
645
646
647
648 if (articleContent.getElementsByTagName("h2").getLength() == 1) {
649 clean(articleContent, "h2");
650 }
651 clean(articleContent, "iframe");
652
653 cleanHeaders(articleContent);
654
655
656
657
658
659 cleanConditionally(articleContent, "table");
660 cleanConditionally(articleContent, "ul");
661 cleanConditionally(articleContent, "div");
662
663
664 final NodeList articleParagraphs = articleContent.getElementsByTagName("p");
665 for (int i = articleParagraphs.getLength() - 1; i >= 0; i--)
666 {
667 final int imgCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength();
668 final int embedCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength();
669 final int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength();
670
671 if (imgCount == 0 && embedCount == 0 && objectCount == 0
672 && getInnerText((Element) articleParagraphs.item(i), false) == "")
673 {
674 articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i));
675 }
676 }
677
678
679
680 final Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
681 removeChildren(articleContent);
682 articleContent.appendChild(n);
683
684
685 final NodeList nl = articleContent.getElementsByTagName("p");
686 for (int i = nl.getLength() - 1; i >= 0; i--) {
687 if (nl.item(i).getTextContent().trim().length() == 0)
688 {
689 nl.item(i).getParentNode().removeChild(nl.item(i));
690 } else if (nl.item(i).getChildNodes().getLength() == 1
691 && nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE)
692 {
693 nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n");
694 }
695 else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled"))
696 {
697 nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i));
698 }
699 }
700
701 }
702
703 protected void removeChildren(Node n) {
704 final NodeList nl = n.getChildNodes();
705 final int nn = nl.getLength();
706 for (int i = 0; i < nn; i++)
707 n.removeChild(nl.item(0));
708 }
709
710
711
712
713
714
715
716 protected void initializeNode(Element node) {
717 float contentScore = 0;
718
719 if (node.getTagName() == "DIV") {
720 contentScore += 5;
721 } else if (node.getTagName() == "PRE" || node.getTagName() == "TD" || node.getTagName() == "BLOCKQUOTE") {
722 contentScore += 3;
723 } else if (node.getTagName() == "ADDRESS" || node.getTagName() == "OL" || node.getTagName() == "UL"
724 || node.getTagName() == "DL" || node.getTagName() == "DD" || node.getTagName() == "DT"
725 || node.getTagName() == "LI" || node.getTagName() == "FORM")
726 {
727 contentScore -= 3;
728 } else if (node.getTagName() == "H1" || node.getTagName() == "H2" || node.getTagName() == "H3"
729 || node.getTagName() == "H4" || node.getTagName() == "H5" || node.getTagName() == "H6"
730 || node.getTagName() == "TH")
731 {
732 contentScore -= 5;
733 }
734
735 contentScore += getClassWeight(node);
736 node.setUserData("readability", contentScore, null);
737 }
738
739
740
741
742
743
744
745
746 protected int getClassWeight(Element e) {
747 if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
748 return 0;
749 }
750
751 int weight = 0;
752
753
754 if (e.getAttribute("class") != "")
755 {
756 if (search(e.getAttribute("class"), Regexps.negativeRe) != -1) {
757 weight -= 25;
758 }
759
760 if (search(e.getAttribute("class"), Regexps.positiveRe) != -1) {
761 weight += 25;
762 }
763 }
764
765
766 if (e.getAttribute("id") != "")
767 {
768 if (search(e.getAttribute("id"), Regexps.negativeRe) != -1) {
769 weight -= 25;
770 }
771
772 if (search(e.getAttribute("id"), Regexps.positiveRe) != -1) {
773 weight += 25;
774 }
775 }
776
777 return weight;
778 }
779
780 protected void cleanStyles() {
781 cleanStyles((Element) document);
782 }
783
784
785
786
787
788
789
790 protected void cleanStyles(Element e) {
791 if (e == null)
792 return;
793 Node cur = e.getFirstChild();
794
795
796 if (!e.getAttribute("class").equals("readability-styled"))
797 e.removeAttribute("style");
798
799
800 while (cur != null) {
801 if (cur.getNodeType() == Element.ELEMENT_NODE) {
802
803 if (!((Element) cur).getAttribute("class").equals("readability-styled")) {
804 ((Element) cur).removeAttribute("style");
805 }
806 cleanStyles((Element) cur);
807 }
808 cur = cur.getNextSibling();
809 }
810 }
811
812
813
814
815
816
817 protected void killBreaks(Element e) {
818
819
820
821 final Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe, "<BR />"));
822 removeChildren(e);
823 e.appendChild(n);
824 }
825
826
827
828
829
830
831
832
833
834 protected void clean(Element e, String tag) {
835 final NodeList targetList = e.getElementsByTagName(tag);
836 final boolean isEmbed = (tag.equals("object") || tag.equals("embed"));
837
838 for (int y = targetList.getLength() - 1; y >= 0; y--) {
839
840
841
842
843 if (isEmbed) {
844 String attributeValues = "";
845 for (int i = 0, il = targetList.item(y).getAttributes().getLength(); i < il; i++) {
846 attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|";
847 }
848
849
850
851
852
853 if (search(attributeValues, Regexps.videoRe) != -1) {
854 continue;
855 }
856
857
858 if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) {
859 continue;
860 }
861 }
862
863 targetList.item(y).getParentNode().removeChild(targetList.item(y));
864 }
865 }
866
867
868
869
870
871
872
873 protected void cleanHeaders(Element e) {
874 for (int headerIndex = 1; headerIndex < 7; headerIndex++) {
875 final NodeList headers = e.getElementsByTagName("h" + headerIndex);
876 for (int i = headers.getLength() - 1; i >= 0; i--) {
877 if (getClassWeight((Element) headers.item(i)) < 0
878 || getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD)
879 {
880 headers.item(i).getParentNode().removeChild(headers.item(i));
881 }
882 }
883 }
884 }
885
886
887
888
889
890
891
892
893
894 protected float getLinkDensity(Element e) {
895 final NodeList links = e.getElementsByTagName("a");
896 final int textLength = getInnerText(e).length();
897 int linkLength = 0;
898
899 for (int i = 0, il = links.getLength(); i < il; i++)
900 {
901 linkLength += getInnerText((Element) links.item(i)).length();
902 }
903
904 if (linkLength == 0)
905 return 0;
906
907 return (float) linkLength / (float) textLength;
908 }
909
910
911
912
913
914
915 protected void cleanConditionally(Element e, String tag) {
916 final NodeList tagsList = e.getElementsByTagName(tag);
917 final int curTagsLength = tagsList.getLength();
918
919
920
921
922
923
924
925
926 for (int i = curTagsLength - 1; i >= 0; i--) {
927 final int weight = getClassWeight((Element) tagsList.item(i));
928 final float contentScore = (tagsList.item(i).getUserData("readability") != null) ? (Float) (tagsList.item(i)
929 .getUserData("readability")) : 0;
930
931 dbg("Cleaning Conditionally "
932 + tagsList.item(i)
933 + " ("
934 + ((Element) tagsList.item(i)).getAttribute("class")
935 + ":"
936 + ((Element) tagsList.item(i)).getAttribute("id")
937 + ")"
938 + ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i)
939 .getUserData("readability")) : ""));
940
941 if (weight + contentScore < 0)
942 {
943 dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
944 + ((Element) tagsList.item(i)).getAttribute("id") + ")");
945 tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
946 }
947 else if (getCharCount((Element) tagsList.item(i), ",") < 10) {
948
949
950
951
952
953 final int p = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength();
954 final int img = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength();
955 final int li = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength() - 100;
956 final int input = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength();
957
958 int embedCount = 0;
959 final NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed");
960 for (int ei = 0, il = embeds.getLength(); ei < il; ei++) {
961 if (search(((Element) embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) {
962 embedCount++;
963 }
964 }
965
966 final float linkDensity = getLinkDensity((Element) tagsList.item(i));
967 final int contentLength = getInnerText((Element) tagsList.item(i)).length();
968 boolean toRemove = false;
969
970 if (img > p) {
971 toRemove = true;
972 } else if (li > p && tag != "ul" && tag != "ol") {
973 toRemove = true;
974 } else if (input > Math.floor(p / 3)) {
975 toRemove = true;
976 } else if (contentLength < 25 && (img == 0 || img > 2)) {
977 toRemove = true;
978 } else if (weight < 25 && linkDensity > 0.2) {
979 toRemove = true;
980 } else if (weight >= 25 && linkDensity > 0.5) {
981 toRemove = true;
982 } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
983 toRemove = true;
984 }
985
986 if (img == 1 && p == 0 && contentLength == 0) {
987 final Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0);
988
989 String w = "";
990 if (theImg.getAttribute("width") != null)
991 w = theImg.getAttribute("width");
992
993 String h = "";
994 if (theImg.getAttribute("height") != null)
995 h = theImg.getAttribute("height");
996
997 if (!(w.equals("0") || h.equals("0")))
998 toRemove = false;
999
1000 }
1001
1002 if (toRemove) {
1003 dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
1004 + ((Element) tagsList.item(i)).getAttribute("id") + ")");
1005 tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
1006 }
1007 }
1008 }
1009 }
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019 protected int getCharCount(Element e, String s) {
1020 return getInnerText(e).split(s).length - 1;
1021 }
1022
1023 protected int getCharCount(Element e) {
1024 return getCharCount(e, ",");
1025 }
1026
1027
1028
1029
1030 public String getArticleTitle() {
1031 return articleTitle;
1032 }
1033
1034
1035
1036
1037 public String getArticleContentType() {
1038 return article_contentType;
1039 }
1040
1041
1042
1043
1044
1045
1046
1047
1048 protected Element grabArticle() {
1049 final boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS);
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061 Element node = null;
1062 final List<Element> nodesToScore = new ArrayList<Element>();
1063 for (int nodeIndex = 0; (node = (Element) document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++)
1064 {
1065
1066 if (stripUnlikelyCandidates) {
1067 final String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id");
1068 if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 &&
1069 search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 &&
1070 !node.getTagName().equals("BODY"))
1071 {
1072 dbg("Removing unlikely candidate - " + unlikelyMatchString);
1073 node.getParentNode().removeChild(node);
1074 nodeIndex--;
1075 continue;
1076 }
1077 }
1078
1079 if (node.getTagName().equals("P") || node.getTagName().equals("TD")) {
1080 nodesToScore.add(node);
1081 }
1082
1083
1084
1085
1086
1087 if (node.getTagName().equals("DIV")) {
1088
1089 if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) {
1090 dbg("Altering div to p");
1091 final Element newNode = document.createElement("P");
1092
1093
1094 final NodeList nl = node.getChildNodes();
1095 for (int i = 0; i < nl.getLength(); i++)
1096 newNode.appendChild(nl.item(i));
1097
1098 node.getParentNode().replaceChild(newNode, node);
1099 nodeIndex--;
1100 }
1101 else
1102 {
1103
1104 for (int i = 0, il = node.getChildNodes().getLength(); i < il; i++) {
1105 final Node childNode = node.getChildNodes().item(i);
1106 if (childNode.getNodeType() == Element.TEXT_NODE) {
1107 dbg("replacing text node with a p tag with the same content.");
1108 final Element p = document.createElement("p");
1109
1110 p.setNodeValue(childNode.getNodeValue());
1111 p.setTextContent(childNode.getTextContent());
1112
1113 p.setAttribute("class", "readability-styled");
1114 childNode.getParentNode().replaceChild(p, childNode);
1115 }
1116 }
1117 }
1118 }
1119 }
1120
1121
1122
1123
1124
1125
1126
1127
1128 final List<Element> candidates = new ArrayList<Element>();
1129 for (int pt = 0; pt < nodesToScore.size(); pt++) {
1130 final Element parentNode = (Element) nodesToScore.get(pt).getParentNode();
1131 final Element grandParentNode = (Element) parentNode.getParentNode();
1132 final String innerText = getInnerText(nodesToScore.get(pt));
1133
1134
1135
1136
1137
1138 if (innerText.length() < 25) {
1139 continue;
1140 }
1141
1142
1143 if (parentNode.getUserData("readability") == null)
1144 {
1145 initializeNode(parentNode);
1146 candidates.add(parentNode);
1147 }
1148
1149
1150 if (grandParentNode.getUserData("readability") == null)
1151 {
1152 initializeNode(grandParentNode);
1153 candidates.add(grandParentNode);
1154 }
1155
1156 float contentScore = 0;
1157
1158
1159 contentScore++;
1160
1161
1162 contentScore += innerText.split(",").length;
1163
1164
1165
1166
1167
1168 contentScore += Math.min(Math.floor(innerText.length() / 100F), 3F);
1169
1170
1171 parentNode.setUserData("readability", ((Float) (parentNode.getUserData("readability")) + contentScore), null);
1172 grandParentNode.setUserData("readability", ((Float) (grandParentNode.getUserData("readability")))
1173 + (contentScore / 2F), null);
1174 }
1175
1176
1177
1178
1179
1180 Element topCandidate = null;
1181 for (int c = 0, cl = candidates.size(); c < cl; c++)
1182 {
1183
1184
1185
1186
1187
1188
1189 candidates.get(c).setUserData("readability",
1190 (Float) (candidates.get(c).getUserData("readability")) * (1F - getLinkDensity(candidates.get(c))),
1191 null);
1192
1193 dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class") + ":"
1194 + candidates.get(c).getAttribute("id") + ") with score "
1195 + candidates.get(c).getUserData("readability"));
1196
1197 if (topCandidate == null
1198 || (Float) (candidates.get(c).getUserData("readability")) > ((Float) topCandidate
1199 .getUserData("readability")))
1200 {
1201 topCandidate = candidates.get(c);
1202 }
1203 }
1204
1205 if (topCandidate != null)
1206 dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class") + ":"
1207 + topCandidate.getAttribute("id") + ") with score " + topCandidate.getUserData("readability"));
1208
1209
1210
1211
1212
1213
1214 if (topCandidate == null || topCandidate.getTagName().equals("BODY"))
1215 {
1216 topCandidate = document.createElement("DIV");
1217
1218
1219 final NodeList nl = getBody().getChildNodes();
1220 for (int i = 0; i < nl.getLength(); i++)
1221 topCandidate.appendChild(nl.item(i));
1222
1223
1224 getBody().appendChild(topCandidate);
1225 initializeNode(topCandidate);
1226 }
1227
1228
1229
1230
1231
1232
1233 final Element articleContent = document.createElement("DIV");
1234 articleContent.setAttribute("id", "readability-content");
1235 final float siblingScoreThreshold = Math.max(10F, (Float) topCandidate.getUserData("readability") * 0.2F);
1236 final NodeList siblingNodes = topCandidate.getParentNode().getChildNodes();
1237
1238 for (int s = 0, sl = siblingNodes.getLength(); s < sl; s++)
1239 {
1240 final Node siblingNode = siblingNodes.item(s);
1241 boolean append = false;
1242
1243 if (siblingNode instanceof Element)
1244 dbg("Looking at sibling node: "
1245 + siblingNode
1246 + " ("
1247 + ((Element) siblingNode).getAttribute("class")
1248 + ":"
1249 + ((Element) siblingNode).getAttribute("id")
1250 + ")"
1251 + ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode
1252 .getUserData("readability")) : ""));
1253 dbg("Sibling has score "
1254 + (siblingNode.getUserData("readability") != null ? siblingNode.getUserData("readability")
1255 : "Unknown"));
1256
1257 if (siblingNode == topCandidate)
1258 {
1259 append = true;
1260 }
1261
1262 float contentBonus = 0;
1263
1264
1265
1266
1267 if (siblingNode instanceof Element
1268 && ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class"))
1269 && !topCandidate.getAttribute("class").equals(""))
1270 {
1271 contentBonus += (Float) topCandidate.getUserData("readability") * 0.2F;
1272 }
1273
1274 if (siblingNode.getUserData("readability") != null
1275 && ((Float) siblingNode.getUserData("readability") + contentBonus) >= siblingScoreThreshold)
1276 {
1277 append = true;
1278 }
1279
1280 if (siblingNode.getNodeName().equals("P")) {
1281 final float linkDensity = getLinkDensity((Element) siblingNode);
1282 final String nodeContent = getInnerText((Element) siblingNode);
1283 final int nodeLength = nodeContent.length();
1284
1285 if (nodeLength > 80 && linkDensity < 0.25)
1286 {
1287 append = true;
1288 }
1289 else if (nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1)
1290 {
1291 append = true;
1292 }
1293 }
1294
1295 if (append)
1296 {
1297 dbg("Appending node: " + siblingNode);
1298
1299 Node nodeToAppend = null;
1300 if (!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) {
1301
1302
1303
1304
1305
1306
1307 dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div.");
1308 nodeToAppend = document.createElement("div");
1309 if (siblingNode instanceof Element)
1310 ((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id"));
1311
1312
1313 final NodeList nl = siblingNode.getChildNodes();
1314 for (int i = 0; i < nl.getLength(); i++)
1315 nodeToAppend.appendChild(nl.item(i));
1316 } else {
1317 nodeToAppend = siblingNode;
1318 s--;
1319 sl--;
1320 }
1321
1322
1323
1324
1325
1326 if (nodeToAppend instanceof Element)
1327 ((Element) nodeToAppend).setAttribute("class", "");
1328
1329
1330
1331
1332
1333 articleContent.appendChild(nodeToAppend);
1334 }
1335 }
1336
1337
1338
1339
1340
1341 prepArticle(articleContent);
1342
1343 return articleContent;
1344 }
1345
1346 protected String getInnerHTML(Node n) {
1347 if (n.getNodeType() == Node.TEXT_NODE)
1348 return n.getTextContent();
1349
1350 String result = "";
1351 final NodeList nl = n.getChildNodes();
1352 for (int i = 0; i < nl.getLength(); i++) {
1353 if (nl.item(i).getNodeType() == Node.TEXT_NODE)
1354 result += nl.item(i).getTextContent();
1355 else if (nl.item(i).getNodeType() == Node.COMMENT_NODE)
1356 result += "<!-- " + nl.item(i).getTextContent() + " -->";
1357 else
1358 result += nodeToString(nl.item(i));
1359 }
1360
1361 return result;
1362 }
1363
1364 protected String nodeToString(Node n) {
1365 return nodeToString(n, false);
1366 }
1367
1368 protected static String nodeToString(Node n, boolean pretty) {
1369 try {
1370 final DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
1371 final DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS");
1372 final LSSerializer writer = impl.createLSSerializer();
1373
1374 writer.getDomConfig().setParameter("xml-declaration", false);
1375 if (pretty) {
1376 writer.getDomConfig().setParameter("format-pretty-print", true);
1377 }
1378
1379 return writer.writeToString(n);
1380 } catch (final Exception e) {
1381 throw new RuntimeException(e);
1382 }
1383 }
1384
1385 protected Node stringToNode(String str) {
1386 try {
1387 final DOMFragmentParser parser = new DOMFragmentParser();
1388 final DocumentFragment fragment = document.createDocumentFragment();
1389 parser.parse(new InputSource(new StringReader(str)), fragment);
1390 return fragment;
1391
1392
1393
1394
1395
1396
1397
1398 } catch (final Exception e) {
1399 throw new RuntimeException(e);
1400 }
1401 }
1402
1403
1404
1405
1406
1407
1408
1409
1410 protected String getInnerText(Element e, boolean normalizeSpaces) {
1411 String textContent = "";
1412
1413 textContent = e.getTextContent().replaceAll(Regexps.trimRe, "");
1414
1415 if (normalizeSpaces) {
1416 return textContent.replaceAll(Regexps.normalizeRe, " ");
1417 } else {
1418 return textContent;
1419 }
1420 }
1421
1422 protected String getInnerTextSep(Node e) {
1423 if (e.hasChildNodes()) {
1424 String s = "";
1425 final NodeList nl = e.getChildNodes();
1426 for (int i = 0; i < nl.getLength(); i++) {
1427 if (!nl.item(i).getNodeName().equalsIgnoreCase("script"))
1428 s += getInnerTextSep(nl.item(i));
1429 }
1430 return s;
1431 } else {
1432 return e.getTextContent() + " ";
1433 }
1434 }
1435
1436 protected String getInnerText(Element e) {
1437 return getInnerText(e, true);
1438 }
1439
1440
1441
1442
1443 public String getArticleHTML() {
1444 if (articleContent == null)
1445 return "";
1446 return nodeToString(articleContent, true);
1447 }
1448
1449
1450
1451
1452 public Node getArticleHTML_DOM() {
1453 return articleContent;
1454 }
1455
1456 protected String getArticleDateString() {
1457 return article_date_string;
1458 }
1459
1460
1461
1462
1463 public Date getArticleDate() {
1464 return article_date;
1465 }
1466
1467
1468
1469
1470 public String getArticleText() {
1471 if (articleContent == null)
1472 return "Unable to find article content";
1473
1474 return articleContent.getTextContent().trim().replaceAll("[\r|\n|\r\n]{2,}", "\n\n").replaceAll(" {2,}", " ");
1475 }
1476
1477
1478
1479
1480 public List<Anchor> getArticleLinks() {
1481 final List<Anchor> anchors = new ArrayList<Anchor>();
1482 if (articleContent == null)
1483 return anchors;
1484
1485 final NodeList nl = articleContent.getElementsByTagName("a");
1486 for (int i = 0; i < nl.getLength(); i++) {
1487 final Element a = (Element) nl.item(i);
1488
1489 final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1490 anchors.add(anchor);
1491 }
1492 return anchors;
1493 }
1494
1495
1496
1497
1498 public List<Anchor> getAllLinks() {
1499 final List<Anchor> anchors = new ArrayList<Anchor>();
1500
1501 final NodeList nl = document.getElementsByTagName("a");
1502 for (int i = 0; i < nl.getLength(); i++) {
1503 final Element a = (Element) nl.item(i);
1504 final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1505 anchors.add(anchor);
1506 }
1507 return anchors;
1508 }
1509
1510
1511
1512
1513 public List<String> getArticleImages() {
1514 final List<String> images = new ArrayList<String>();
1515 if (articleContent == null)
1516 return images;
1517
1518 final NodeList nl = articleContent.getElementsByTagName("img");
1519 for (int i = 0; i < nl.getLength(); i++) {
1520 final Element img = (Element) nl.item(i);
1521 images.add(img.getAttribute("src"));
1522 }
1523 return images;
1524 }
1525
1526
1527
1528
1529 public List<String> getArticleSubheadings() {
1530 final List<String> subtitles = new ArrayList<String>();
1531 if (articleContent == null)
1532 return subtitles;
1533
1534 for (int j = 1; j <= 6; j++) {
1535 final NodeList nl = articleContent.getElementsByTagName("h" + j);
1536 if (nl.getLength() > 0) {
1537 for (int i = 0; i < nl.getLength(); i++) {
1538 subtitles.add(nl.item(i).getTextContent());
1539 }
1540 break;
1541 }
1542 }
1543
1544 if (subtitles.size() == 0) {
1545
1546
1547 final NodeList nl = articleContent.getElementsByTagName("*");
1548 for (int i = 0; i < nl.getLength(); i++) {
1549 if (nl.item(i) instanceof Element &&
1550 ((Element) nl.item(i)).getAttribute("class") != null &&
1551 search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1)
1552 subtitles.add(nl.item(i).getTextContent());
1553 }
1554 }
1555
1556 return subtitles;
1557 }
1558
1559 protected List<Node> findChildNodesWithName(Node parent, String name) {
1560 final NodeList children = parent.getChildNodes();
1561 final List<Node> results = new ArrayList<Node>();
1562
1563 for (int i = 0; i < children.getLength(); ++i) {
1564 final Node child = children.item(i);
1565 if (child == null)
1566 continue;
1567
1568 final String nodeName = child.getNodeName();
1569 if (nodeName == null)
1570 continue;
1571
1572 if (nodeName.equals(name)) {
1573 results.add(child);
1574 }
1575 }
1576 return results;
1577 }
1578
1579 protected int findChildNodeIndex(Node parent, Node childToFind)
1580 {
1581 for (int index = 0; index < parent.getChildNodes().getLength(); index++)
1582 if (parent.getChildNodes().item(index) == childToFind)
1583 return index;
1584 return -1;
1585 }
1586
1587 protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException {
1588 final Node parend = walker.getCurrentNode();
1589
1590 if (parend.getNodeType() == Node.TEXT_NODE && parend.getParentNode().getAttributes().getNamedItem("id") != null)
1591 {
1592 if (parend.getTextContent().trim().length() > 0)
1593 {
1594 final int index = findChildNodeIndex(parend.getParentNode(), parend);
1595 if (index != -1)
1596 {
1597
1598
1599 map.add(new MappingNode(
1600 parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "[" + index + "]",
1601 parend.getNodeValue()));
1602
1603
1604
1605
1606
1607
1608 }
1609 }
1610 }
1611
1612
1613 for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
1614 getArticleTextMapping(walker, map);
1615 }
1616
1617
1618 walker.setCurrentNode(parend);
1619 }
1620
1621 protected class MappingNode {
1622 String id;
1623 String text;
1624
1625 public MappingNode(String id, String text) {
1626 this.id = id;
1627 this.text = text;
1628 }
1629
1630 public String getId() {
1631 return id;
1632 }
1633
1634 public String getText() {
1635 return text;
1636 }
1637
1638 @Override
1639 public String toString() {
1640 return "MappingNode(" + id + " -> " + text + ")";
1641 }
1642 }
1643
1644
1645
1646
1647
1648
1649 public List<MappingNode> getArticleTextMapping() {
1650 if (articleContent == null)
1651 return null;
1652
1653 final List<MappingNode> map = new ArrayList<MappingNode>();
1654
1655 final TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT
1656 | NodeFilter.SHOW_ELEMENT, null, true);
1657
1658 getArticleTextMapping(walker, map);
1659
1660 return map;
1661 }
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673 public static Readability getReadability(String html) throws SAXException, IOException {
1674 return getReadability(html, false);
1675 }
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689 public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
1690 final DOMParser parser = new DOMParser();
1691 parser.parse(new InputSource(new StringReader(html)));
1692
1693 return new Readability(parser.getDocument(), false, addTitle);
1694 }
1695
1696
1697
1698
1699
1700
1701
1702 public static void main(String[] argv) throws Exception {
1703
1704
1705
1706
1707 final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732 final DOMParser parser = new DOMParser();
1733 parser.parse(new InputSource(input.openStream()));
1734
1735 final Readability r = new Readability(parser.getDocument(), true, true);
1736
1737
1738 System.out.println(r.getArticleHTML());
1739
1740
1741
1742 System.out.println();
1743 System.out.println("***");
1744 System.out.println();
1745
1746 for (final MappingNode s : r.getArticleTextMapping())
1747 System.out.println(s);
1748
1749
1750
1751
1752
1753
1754
1755 System.out.println(r.getArticleImages());
1756
1757
1758
1759
1760
1761
1762 }
1763 }