001/**
002 * Copyright 2010 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.readability;
018
019import java.io.IOException;
020import java.io.StringReader;
021import java.net.URL;
022import java.text.ParseException;
023import java.text.SimpleDateFormat;
024import java.util.ArrayList;
025import java.util.Date;
026import java.util.EnumSet;
027import java.util.List;
028import java.util.regex.Matcher;
029import java.util.regex.Pattern;
030
031import org.cyberneko.html.parsers.DOMFragmentParser;
032import org.cyberneko.html.parsers.DOMParser;
033import org.pojava.datetime.DateTime;
034import org.w3c.dom.DOMException;
035import org.w3c.dom.Document;
036import org.w3c.dom.DocumentFragment;
037import org.w3c.dom.Element;
038import org.w3c.dom.Node;
039import org.w3c.dom.NodeList;
040import org.w3c.dom.bootstrap.DOMImplementationRegistry;
041import org.w3c.dom.ls.DOMImplementationLS;
042import org.w3c.dom.ls.LSSerializer;
043import org.w3c.dom.traversal.DocumentTraversal;
044import org.w3c.dom.traversal.NodeFilter;
045import org.w3c.dom.traversal.TreeWalker;
046import org.xml.sax.InputSource;
047import org.xml.sax.SAXException;
048
049/**
050 * Class for extracting the "content" from web-pages, and ignoring adverts, etc.
051 * Based upon readability.js (http://lab.arc90.com/experiments/readability/) and
052 * modified to behave better for certain sites (and typically better mimic
053 * Safari Reader functionality).
054 *
055 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
056 * @author Michael Matthews (mikemat@yahoo-inc.com)
057 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
058 */
059public class Readability
060{
        /**
         * Regular expressions for different types of content. All patterns are
         * kept as plain strings and compiled where they are used; those starting
         * with {@code (?i)} match case-insensitively.
         */
        protected static class Regexps {

                // Alternation of name fragments (presumably tested against class/id
                // values to discard non-content elements; usage is outside this view).
                public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner"; // caption?
                // Name fragments that presumably override unlikelyCandidatesRe.
                public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
                // Name fragments presumably used to raise / lower an element's score.
                public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
                public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
                // Matches the listed block-level (plus a/img) tag names.
                public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
                // Two or more consecutive <br> tags, optionally separated by whitespace;
                // prepDocument() replaces each such run with a paragraph break.
                public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
                // Opening or closing <font> tag; group 1 captures the optional "/" so
                // prepDocument() can rewrite it to the matching <span> tag.
                public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
                // Leading or trailing whitespace.
                public static String trimRe = "^\\s+|\\s+$";
                // Runs of two or more whitespace characters.
                public static String normalizeRe = "\\s{2,}";
                // One or more <br> tags, each followed by optional whitespace / &nbsp;
                // (note: no (?i) flag, so this one is case-sensitive).
                public static String killBreaksRe = "(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}";
                // http:// URLs on youtube.com or vimeo.com (https is not matched).
                public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";

                // Body of a character class (the characters | - /); findArticleTitle()
                // wraps it in [...] itself, so it is not a complete pattern on its own.
                public static String titleSeparatorRe = "\\|\\-\\/";

                // this is used to try and find elements that represent sub-headings
                // (that are not h1..h6)
                public static String likelySubheadCandidateRe = "(?i)cross-head";
        }
084
        /**
         * Switches for the extraction heuristics. All flags start enabled; when a
         * pass yields too little text, init() removes them one at a time (first
         * FLAG_STRIP_UNLIKELYS, then FLAG_WEIGHT_CLASSES) and retries.
         */
        enum Flag {
                // Removed by init() for the first retry.
                FLAG_STRIP_UNLIKELYS,
                // Removed by init() for the second retry.
                FLAG_WEIGHT_CLASSES
        }
089
        /**
         * Threshold for removing elements with lots of links
         */
        public static float LINK_DENSITY_THRESHOLD = 0.33F;

        // Instance state below

        // The document being processed; it is modified in place.
        protected Document document;
        // Deep clone of the original <body>, taken by init() before any cleaning.
        private Node bodyCache;
        // Heuristics still enabled; init() removes entries when it has to retry.
        protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);

        // Result of findArticleTitle().
        protected String articleTitle;
        // Result of grabArticle(); init() resets it to null when too little text was found.
        protected Element articleContent;
        // Raw date text located by findArticleDate().
        protected String article_date_string;
        // Parsed form of article_date_string, when parsing succeeded.
        protected Date article_date;
        // "content" attribute of the Content-Type <meta> element, if one was found.
        protected String article_contentType;

        // When true, dbg() writes its messages to System.err.
        protected boolean debug = false;

        // When true, init() prepends an <h1 id="title"> to the extracted content.
        protected boolean addTitle = false;
109
        /**
         * Construct with the given document. Debugging is disabled and no title
         * is added to the output (delegates to
         * {@link #Readability(Document, boolean)} with {@code debug = false}).
         *
         * @param document
         *            The document.
         */
        public Readability(Document document) {
                this(document, false);
        }
119
        /**
         * Construct with the given document. The second argument can be used to
         * enable debugging output. The title is not added to the output
         * (delegates to {@link #Readability(Document, boolean, boolean)} with
         * {@code addTitle = false}).
         *
         * @param document
         *            The document.
         * @param debug
         *            Enable debugging output.
         */
        public Readability(Document document, boolean debug) {
                this(document, debug, false);
        }
132
        /**
         * Construct with the given document. The second argument can be used to
         * enable debugging output. The third option controls whether the title
         * should be included in the output.
         * <p>
         * All processing happens here: the document's elements are given ids via
         * {@link #augmentDocument(Document)} and then {@link #init()} runs the
         * extraction, modifying the document in place.
         *
         * @param document
         *            The document.
         * @param debug
         *            Enable debugging output.
         * @param addTitle
         *            Add title to output.
         */
        public Readability(Document document, boolean debug, boolean addTitle) {
                this.debug = debug;
                this.document = document;
                this.addTitle = addTitle;
                // ids must be assigned before init() starts rewriting the tree
                augmentDocument(document);
                // NOTE(review): init() is protected and therefore overridable, yet it
                // is invoked from the constructor.
                init();
        }
152
153        /**
154         * Iterates through all the ELEMENT nodes in a document and gives them ids
155         * if they don't already have them.
156         *
157         * @param document
158         */
159        public static void augmentDocument(Document document) {
160                final DocumentTraversal traversal = (DocumentTraversal) document;
161
162                final TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
163
164                traverseLevel(walker, 0);
165        }
166
167        private static int traverseLevel(TreeWalker walker, int counter) {
168                // describe current node:
169                final Node parend = walker.getCurrentNode();
170
171                if (parend instanceof Element) {
172                        if (((Element) parend).getAttribute("id").length() == 0) {
173                                ((Element) parend).setAttribute("id", "gen-id-" + counter);
174                                counter++;
175                        }
176                }
177
178                // traverse children:
179                for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
180                        counter = traverseLevel(walker, counter);
181                }
182
183                // return position to the current (level up):
184                walker.setCurrentNode(parend);
185
186                return counter;
187        }
188
189        protected void dbg(String s) {
190                if (debug)
191                        System.err.println(s);
192        }
193
194        protected String getTitle() {
195                final NodeList l = document.getElementsByTagName("title");
196
197                if (l.getLength() == 0)
198                        return "";
199
200                return l.item(0).getTextContent();
201        }
202
203        /**
204         * Javascript-like String.match
205         *
206         * @param input
207         * @param regex
208         * @return
209         */
210        protected String[] match(String input, String regex) {
211                final Matcher matcher = Pattern.compile(regex).matcher(input);
212                final List<String> matches = new ArrayList<String>();
213
214                while (matcher.find()) {
215                        matches.add(matcher.group(0));
216                }
217
218                return matches.toArray(new String[matches.size()]);
219        }
220
        /**
         * Reports whether extraction produced an article. The content is null
         * when init() gave up after all retries found fewer than 250 characters
         * of text.
         *
         * @return True if the article has any detected content; false otherwise.
         */
        public boolean hasContent() {
                return articleContent != null;
        }
227
228        /**
229         * Javascript-like String.search
230         *
231         * @param input
232         * @param regex
233         * @return
234         */
235        protected int search(String input, String regex) {
236                final Matcher matcher = Pattern.compile(regex).matcher(input);
237
238                if (!matcher.find())
239                        return -1;
240                return matcher.start();
241        }
242
243        protected void findArticleEncoding() {
244                final NodeList nl = document.getElementsByTagName("meta");
245                for (int j = 0; j < nl.getLength(); j++) {
246                        if (((Element) nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) {
247                                article_contentType = ((Element) nl.item(j)).getAttribute("content");
248                                return;
249                        }
250                }
251
252        }
253
254        protected void findArticleDate() {
255                // <meta name="OriginalPublicationDate" content="2010/07/12 14:08:02"/>
256                // <meta name="DC.date.issued" content="2010-07-12">
257                NodeList nl = document.getElementsByTagName("meta");
258                for (int j = 0; j < nl.getLength(); j++) {
259                        if (((Element) nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) {
260                                article_date_string = ((Element) nl.item(j)).getAttribute("content");
261                                article_date = DateTime.parse(article_date_string).toDate();
262                                return;
263                        }
264                        if (((Element) nl.item(j)).getAttribute("name").equals("DC.date.issued")) {
265                                article_date_string = ((Element) nl.item(j)).getAttribute("content");
266                                article_date = DateTime.parse(article_date_string).toDate();
267                                return;
268                        }
269                }
270
271                // <time datetime="2010-07-12T10:26BST" pubdate>Monday 12 July 2010
272                // 10.26 BST</time>
273                nl = document.getElementsByTagName("time");
274                for (int j = 0; j < nl.getLength(); j++) {
275                        if (((Element) nl.item(j)).getAttributeNode("pubdate") != null) {
276                                article_date_string = ((Element) nl.item(j)).getAttribute("datetime");
277                                article_date = DateTime.parse(article_date_string).toDate();
278                                return;
279                        }
280                }
281
282                // <span class="date">14:08 GMT, Monday, 12 July 2010 15:08 UK</span>
283                // <p class="date">09.07.2010 @ 17:49 CET</p>
284                // <p class="date">Today @ 09:29 CET</p>
285                nl = document.getElementsByTagName("*");
286                for (int j = 0; j < nl.getLength(); j++) {
287                        if ((((Element) nl.item(j)).getAttribute("class").contains("date") ||
288                                        ((Element) nl.item(j)).getAttribute("class").contains("Date")) &&
289                                        !(((Element) nl.item(j)).getAttribute("class").contains("update") ||
290                                                        ((Element) nl.item(j)).getAttribute("class").contains("Update")))
291                        {
292                                article_date_string = getInnerTextSep(nl.item(j)).trim();
293                                parseDate();
294                                return;
295                        }
296                }
297                for (int j = 0; j < nl.getLength(); j++) {
298                        if ((((Element) nl.item(j)).getAttribute("id").contains("date") ||
299                                        ((Element) nl.item(j)).getAttribute("id").contains("Date")) &&
300                                        !(((Element) nl.item(j)).getAttribute("id").contains("update") ||
301                                                        ((Element) nl.item(j)).getAttribute("id").contains("Update")))
302                        {
303                                article_date_string = getInnerTextSep(nl.item(j)).trim();
304                                parseDate();
305                                return;
306                        }
307                }
308
309                // Last updated at 3:05 PM on 12th July 2010
310                nl = document.getElementsByTagName("*");
311                for (int j = 0; j < nl.getLength(); j++) {
312                        final String text = nl.item(j).getTextContent();
313
314                        if (text == null)
315                                continue;
316
317                        final Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
318                        final Matcher m = p.matcher(text);
319                        if (m.find()) {
320                                article_date_string = m.group(1);
321
322                                String cpy = article_date_string.replaceAll("th", "");
323                                cpy = cpy.replaceAll("st", "");
324                                cpy = cpy.replaceAll("nd", "");
325                                cpy = cpy.replaceAll("rd", "");
326
327                                final SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy");
328                                try {
329                                        article_date = sdf.parse(cpy);
330                                } catch (final ParseException e) {
331                                }
332                                return;
333                        }
334                }
335        }
336
337        @SuppressWarnings("deprecation")
338        protected void parseDate() {
339                if (article_date_string == null || article_date_string.trim().isEmpty())
340                        return;
341
342                if (article_date_string.contains("Today")) {
343                        try {
344                                final SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z");
345                                article_date = sdf.parse(article_date_string);
346                                final Date now = new Date();
347                                article_date.setDate(now.getDate());
348                                article_date.setMonth(now.getMonth());
349                                article_date.setYear(now.getYear());
350                        } catch (final ParseException e) {
351                        }
352                } else {
353                        try {
354                                final SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy");
355                                article_date = sdf.parse(article_date_string);
356                        } catch (final ParseException e) {
357                                try {
358                                        final SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z");
359                                        article_date = sdf.parse(article_date_string);
360                                } catch (final ParseException ee) {
361                                        try {
362                                                final SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
363                                                article_date = sdf.parse(article_date_string);
364                                        } catch (final ParseException eee) {
365                                                try {
366                                                        article_date = DateTime.parse(article_date_string).toDate();
367                                                } catch (final IllegalArgumentException ie) {
368                                                } catch (final java.lang.ArrayIndexOutOfBoundsException ie) {
369                                                        System.out.println(article_date_string);
370                                                }
371                                        }
372                                }
373                        }
374                }
375        }
376
377        /**
378         * Get the article title.
379         *
380         * @return void
381         **/
382        protected String findArticleTitle() {
383                String curTitle = "", origTitle = "";
384
385                curTitle = origTitle = getTitle();
386
387                //
388                final List<String> potentialTitles = new ArrayList<String>();
389                for (int i = 1; i <= 6; i++) {
390                        final NodeList nl = document.getElementsByTagName("h" + i);
391                        if (nl.getLength() > 0) {
392                                for (int j = 0; j < nl.getLength(); j++)
393                                        potentialTitles.add(nl.item(j).getTextContent().trim());
394                        }
395                }
396
397                String potentialTitle = null;
398                int score = 0;
399                for (final String s : potentialTitles) {
400                        if (s.length() > score && curTitle.contains(s)) {
401                                potentialTitle = s;
402                                score = s.length();
403                        }
404                }
405                if (potentialTitle != null)
406                        return potentialTitle;
407                //
408
409                if (match(curTitle, " [" + Regexps.titleSeparatorRe + "]+ ").length > 0)
410                {
411                        curTitle = origTitle.replaceAll("(.*) [" + Regexps.titleSeparatorRe + "]+ .*", "$1");
412
413                        if (curTitle.split(" ").length < 3) {
414                                curTitle = origTitle.replaceAll("(?i)[^" + Regexps.titleSeparatorRe + "]*[" + Regexps.titleSeparatorRe
415                                                + "]+(.*)", "$1");
416                        }
417                }
418                else if (curTitle.indexOf(": ") != -1)
419                {
420                        curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");
421
422                        if (curTitle.split(" ").length < 3) {
423                                curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
424                        }
425                }
426                else if (curTitle.length() > 150 || curTitle.length() < 15)
427                {
428                        final NodeList hOnes = document.getElementsByTagName("h1");
429                        if (hOnes.getLength() == 1)
430                        {
431                                curTitle = getInnerText((Element) hOnes.item(0));
432                        }
433                }
434
435                curTitle = curTitle.replaceAll(Regexps.trimRe, "");
436
437                if (curTitle.split(" ").length <= 3) {
438                        curTitle = origTitle;
439                }
440
441                return curTitle;
442        }
443
444        /**
445         * Equivalent to document.body in JS
446         *
447         * @return
448         */
449        protected Element getBody() {
450                final NodeList nl = document.getElementsByTagName("body");
451
452                if (nl.getLength() == 0)
453                        return null;
454                else
455                        return (Element) nl.item(0);
456        }
457
458        /**
459         * Runs readability.
460         *
461         * Workflow: 1. Prep the document by removing script tags, css, etc. 2.
462         * Build readability"s DOM tree. 3. Grab the article content from the
463         * current dom tree. 4. Replace the current DOM tree with the new one. 5.
464         * Read peacefully.
465         *
466         **/
467        protected void init() {
468                if (getBody() != null && bodyCache == null) {
469                        bodyCache = getBody().cloneNode(true);
470                }
471
472                findArticleDate(); // must be done before prepDocument()
473
474                findArticleEncoding();
475
476                prepDocument();
477
478                /* Build readability"s DOM tree */
479                articleTitle = findArticleTitle();
480                articleContent = grabArticle();
481
482                /**
483                 * If we attempted to strip unlikely candidates on the first run
484                 * through, and we ended up with no content, that may mean we stripped
485                 * out the actual content so we couldn"t parse it. So re-run init while
486                 * preserving unlikely candidates to have a better shot at getting our
487                 * content out properly.
488                 **/
489                if (getInnerText(articleContent, false).length() < 250)
490                {
491                        if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
492                                flags.remove(Flag.FLAG_STRIP_UNLIKELYS);
493                                getBody().getParentNode().replaceChild(bodyCache, getBody());
494                                init();
495                                return;
496                        }
497                        else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
498                                flags.remove(Flag.FLAG_WEIGHT_CLASSES);
499                                getBody().getParentNode().replaceChild(bodyCache, getBody());
500                                init();
501                                return;
502                        }
503                        else {
504                                articleContent = null;
505                        }
506                }
507
508                if (addTitle && articleContent != null) {
509                        final Element titleNode = document.createElement("h1");
510                        titleNode.setAttribute("id", "title");
511                        titleNode.appendChild(document.createTextNode(getArticleTitle()));
512                        articleContent.insertBefore(titleNode, articleContent.getFirstChild());
513                }
514        }
515
516        /**
517         * Prepare the HTML document for readability to scrape it. This includes
518         * things like stripping javascript, CSS, and handling terrible markup.
519         *
520         **/
521        protected void prepDocument() {
522                /**
523                 * In some cases a body element can"t be found (if the HTML is totally
524                 * hosed for example) so we create a new body node and append it to the
525                 * document.
526                 */
527                if (getBody() == null)
528                {
529                        final Node body = document.createElement("body");
530                        document.appendChild(body);
531                }
532
533                // frames are not supported in this version!
534                // NodeList frames = document.getElementsByTagName("frame");
535                // if(frames.length > 0)
536                // {
537                // Node bestFrame = null;
538                // int bestFrameSize = 0;
539                // for(int frameIndex = 0; frameIndex < frames.getLength();
540                // frameIndex++)
541                // {
542                // int frameSize = frames.item(frameIndex).offsetWidth +
543                // frames[frameIndex].offsetHeight;
544                // var canAccessFrame = false;
545                // try {
546                // frames[frameIndex].contentWindow.document.body;
547                // canAccessFrame = true;
548                // }
549                // catch(eFrames) {
550                // dbg(eFrames);
551                // }
552                //
553                // if(canAccessFrame && frameSize > bestFrameSize)
554                // {
555                // bestFrame = frames[frameIndex];
556                // bestFrameSize = frameSize;
557                // }
558                // }
559                //
560                // if(bestFrame)
561                // {
562                // var newBody = document.createElement("body");
563                // newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
564                // newBody.style.overflow = "scroll";
565                // document.body = newBody;
566                //
567                // var frameset = document.getElementsByTagName("frameset")[0];
568                // if(frameset) {
569                // frameset.parentNode.removeChild(frameset); }
570                //
571                // readability.frameHack = true;
572                // }
573                // }
574
575                /* remove all scripts that are not readability */
576                final NodeList scripts = document.getElementsByTagName("script");
577                for (int i = scripts.getLength() - 1; i >= 0; i--)
578                {
579                        scripts.item(i).getParentNode().removeChild(scripts.item(i));
580                }
581
582                /* Remove all style tags in head */
583                final NodeList styleTags = document.getElementsByTagName("style");
584                for (int st = styleTags.getLength() - 1; st >= 0; st--) {
585                        styleTags.item(st).getParentNode().removeChild(styleTags.item(st));
586                }
587
588                /* Remove all meta tags */
589                final NodeList metaTags = document.getElementsByTagName("meta");
590                for (int mt = metaTags.getLength() - 1; mt >= 0; mt--) {
591                        metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt));
592                }
593
594                /* Turn all double br's into p's */
595                /*
596                 * Note, this is pretty costly as far as processing goes. Maybe optimize
597                 * later.
598                 */
599                // document.body.innerHTML =
600                // document.body.innerHTML.replace(readability.regexps.replaceBrsRe,
601                // '</p><p>').replace(readability.regexps.replaceFontsRe, '<$1span>');
602                final Element body = getBody();
603                // Node rep =
604                // stringToNode(nodeToString(body).replaceAll(Regexps.replaceBrsRe,
605                // "</P><P>").replaceAll(Regexps.replaceFontsRe, "<$1span>"));
606                // body.getParentNode().replaceChild(rep, body);
607
608                // This is slow!
609                final Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(
610                                Regexps.replaceFontsRe, "<$1span>"));
611                removeChildren(body);
612                body.appendChild(frag);
613
614                /* Remove all comments */
615                removeComments(document);
616        }
617
618        protected void removeComments(Node n) {
619                if (n.getNodeType() == Node.COMMENT_NODE) {
620                        n.getParentNode().removeChild(n);
621                } else {
622                        final NodeList nl = n.getChildNodes();
623                        for (int i = nl.getLength() - 1; i >= 0; i--)
624                                removeComments(nl.item(i));
625                }
626        }
627
628        /**
629         * Prepare the article node for display. Clean out any inline styles,
630         * iframes, forms, strip extraneous
631         * <p>
632         * tags, etc.
633         *
634         * @param Element
635         **/
636        protected void prepArticle(Element articleContent) {
637                cleanStyles(articleContent);
638                killBreaks(articleContent);
639
640                /* Clean out junk from the article content */
641                clean(articleContent, "form");
642                clean(articleContent, "object");
643                clean(articleContent, "h1");
644                /**
645                 * If there is only one h2, they are probably using it as a header and
646                 * not a subheader, so remove it since we already have a header.
647                 ***/
648                if (articleContent.getElementsByTagName("h2").getLength() == 1) {
649                        clean(articleContent, "h2");
650                }
651                clean(articleContent, "iframe");
652
653                cleanHeaders(articleContent);
654
655                /*
656                 * Do these last as the previous stuff may have removed junk that will
657                 * affect these
658                 */
659                cleanConditionally(articleContent, "table");
660                cleanConditionally(articleContent, "ul");
661                cleanConditionally(articleContent, "div");
662
663                /* Remove extra paragraphs */
664                final NodeList articleParagraphs = articleContent.getElementsByTagName("p");
665                for (int i = articleParagraphs.getLength() - 1; i >= 0; i--)
666                {
667                        final int imgCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength();
668                        final int embedCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength();
669                        final int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength();
670
671                        if (imgCount == 0 && embedCount == 0 && objectCount == 0
672                                        && getInnerText((Element) articleParagraphs.item(i), false) == "")
673                        {
674                                articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i));
675                        }
676                }
677
678                // articleContent.innerHTML =
679                // articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, "<p");
680                final Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
681                removeChildren(articleContent);
682                articleContent.appendChild(n);
683
684                // now remove empty p's and tidy up
685                final NodeList nl = articleContent.getElementsByTagName("p");
686                for (int i = nl.getLength() - 1; i >= 0; i--) {
687                        if (nl.item(i).getTextContent().trim().length() == 0)
688                        {
689                                nl.item(i).getParentNode().removeChild(nl.item(i));
690                        } else if (nl.item(i).getChildNodes().getLength() == 1
691                                        && nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE)
692                        {
693                                nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n");
694                        }
695                        else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled"))
696                        {
697                                nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i));
698                        }
699                }
700
701        }
702
703        protected void removeChildren(Node n) {
704                final NodeList nl = n.getChildNodes();
705                final int nn = nl.getLength();
706                for (int i = 0; i < nn; i++)
707                        n.removeChild(nl.item(0));
708        }
709
710        /**
711         * Initialize a node with the readability object. Also checks the
712         * className/id for special names to add to its score.
713         *
714         * @param Element
715         **/
716        protected void initializeNode(Element node) {
717                float contentScore = 0;
718
719                if (node.getTagName() == "DIV") {
720                        contentScore += 5;
721                } else if (node.getTagName() == "PRE" || node.getTagName() == "TD" || node.getTagName() == "BLOCKQUOTE") {
722                        contentScore += 3;
723                } else if (node.getTagName() == "ADDRESS" || node.getTagName() == "OL" || node.getTagName() == "UL"
724                                || node.getTagName() == "DL" || node.getTagName() == "DD" || node.getTagName() == "DT"
725                                || node.getTagName() == "LI" || node.getTagName() == "FORM")
726                {
727                        contentScore -= 3;
728                } else if (node.getTagName() == "H1" || node.getTagName() == "H2" || node.getTagName() == "H3"
729                                || node.getTagName() == "H4" || node.getTagName() == "H5" || node.getTagName() == "H6"
730                                || node.getTagName() == "TH")
731                {
732                        contentScore -= 5;
733                }
734
735                contentScore += getClassWeight(node);
736                node.setUserData("readability", contentScore, null);
737        }
738
739        /**
740         * Get an elements class/id weight. Uses regular expressions to tell if this
741         * element looks good or bad.
742         *
743         * @param Element
744         * @return number (Integer)
745         **/
746        protected int getClassWeight(Element e) {
747                if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
748                        return 0;
749                }
750
751                int weight = 0;
752
753                /* Look for a special classname */
754                if (e.getAttribute("class") != "")
755                {
756                        if (search(e.getAttribute("class"), Regexps.negativeRe) != -1) {
757                                weight -= 25;
758                        }
759
760                        if (search(e.getAttribute("class"), Regexps.positiveRe) != -1) {
761                                weight += 25;
762                        }
763                }
764
765                /* Look for a special ID */
766                if (e.getAttribute("id") != "")
767                {
768                        if (search(e.getAttribute("id"), Regexps.negativeRe) != -1) {
769                                weight -= 25;
770                        }
771
772                        if (search(e.getAttribute("id"), Regexps.positiveRe) != -1) {
773                                weight += 25;
774                        }
775                }
776
777                return weight;
778        }
779
780        protected void cleanStyles() {
781                cleanStyles((Element) document);
782        }
783
784        /**
785         * Remove the style attribute on every e and under. TODO: Test if
786         * getElementsByTagName(*) is faster.
787         *
788         * @param Element
789         **/
790        protected void cleanStyles(Element e) {
791                if (e == null)
792                        return;
793                Node cur = e.getFirstChild();
794
795                // Remove any root styles, if we"re able.
796                if (!e.getAttribute("class").equals("readability-styled"))
797                        e.removeAttribute("style");
798
799                // Go until there are no more child nodes
800                while (cur != null) {
801                        if (cur.getNodeType() == Element.ELEMENT_NODE) {
802                                // Remove style attribute(s) :
803                                if (!((Element) cur).getAttribute("class").equals("readability-styled")) {
804                                        ((Element) cur).removeAttribute("style");
805                                }
806                                cleanStyles((Element) cur);
807                        }
808                        cur = cur.getNextSibling();
809                }
810        }
811
812        /**
813         * Remove extraneous break tags from a node.
814         *
815         * @param Element
816         **/
817        protected void killBreaks(Element e) {
818                // e.innerHTML =
819                // e.innerHTML.replace(readability.regexps.killBreaksRe,"<br />");
820
821                final Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe, "<BR />"));
822                removeChildren(e);
823                e.appendChild(n);
824        }
825
826        /**
827         * Clean a node of all elements of type "tag". (Unless it"s a youtube/vimeo
828         * video. People love movies.)
829         *
830         * @param Element
831         * @param string
832         *            tag to clean
833         **/
834        protected void clean(Element e, String tag) {
835                final NodeList targetList = e.getElementsByTagName(tag);
836                final boolean isEmbed = (tag.equals("object") || tag.equals("embed"));
837
838                for (int y = targetList.getLength() - 1; y >= 0; y--) {
839                        /*
840                         * Allow youtube and vimeo videos through as people usually want to
841                         * see those.
842                         */
843                        if (isEmbed) {
844                                String attributeValues = "";
845                                for (int i = 0, il = targetList.item(y).getAttributes().getLength(); i < il; i++) {
846                                        attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|";
847                                }
848
849                                /*
850                                 * First, check the elements attributes to see if any of them
851                                 * contain youtube or vimeo
852                                 */
853                                if (search(attributeValues, Regexps.videoRe) != -1) {
854                                        continue;
855                                }
856
857                                /* Then check the elements inside this element for the same. */
858                                if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) {
859                                        continue;
860                                }
861                        }
862
863                        targetList.item(y).getParentNode().removeChild(targetList.item(y));
864                }
865        }
866
867        /**
868         * Clean out spurious headers from an Element. Checks things like classnames
869         * and link density.
870         *
871         * @param Element
872         **/
873        protected void cleanHeaders(Element e) {
874                for (int headerIndex = 1; headerIndex < 7; headerIndex++) {
875                        final NodeList headers = e.getElementsByTagName("h" + headerIndex);
876                        for (int i = headers.getLength() - 1; i >= 0; i--) {
877                                if (getClassWeight((Element) headers.item(i)) < 0
878                                                || getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD)
879                                {
880                                        headers.item(i).getParentNode().removeChild(headers.item(i));
881                                }
882                        }
883                }
884        }
885
886        /**
887         * Get the density of links as a percentage of the content This is the
888         * amount of text that is inside a link divided by the total text in the
889         * node.
890         *
891         * @param Element
892         * @return number (float)
893         **/
894        protected float getLinkDensity(Element e) {
895                final NodeList links = e.getElementsByTagName("a");
896                final int textLength = getInnerText(e).length();
897                int linkLength = 0;
898
899                for (int i = 0, il = links.getLength(); i < il; i++)
900                {
901                        linkLength += getInnerText((Element) links.item(i)).length();
902                }
903
904                if (linkLength == 0)
905                        return 0;
906
907                return (float) linkLength / (float) textLength;
908        }
909
910        /**
911         * Clean an element of all tags of type "tag" if they look fishy. "Fishy" is
912         * an algorithm based on content length, classnames, link density, number of
913         * images & embeds, etc.
914         **/
915        protected void cleanConditionally(Element e, String tag) {
916                final NodeList tagsList = e.getElementsByTagName(tag);
917                final int curTagsLength = tagsList.getLength();
918
919                /**
920                 * Gather counts for other typical elements embedded within. Traverse
921                 * backwards so we can remove nodes at the same time without effecting
922                 * the traversal.
923                 *
924                 * Todo: Consider taking into account original contentScore here.
925                 **/
926                for (int i = curTagsLength - 1; i >= 0; i--) {
927                        final int weight = getClassWeight((Element) tagsList.item(i));
928                        final float contentScore = (tagsList.item(i).getUserData("readability") != null) ? (Float) (tagsList.item(i)
929                                        .getUserData("readability")) : 0;
930
931                        dbg("Cleaning Conditionally "
932                                        + tagsList.item(i)
933                                        + " ("
934                                        + ((Element) tagsList.item(i)).getAttribute("class")
935                                        + ":"
936                                        + ((Element) tagsList.item(i)).getAttribute("id")
937                                        + ")"
938                                        + ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i)
939                                                        .getUserData("readability")) : ""));
940
941                        if (weight + contentScore < 0)
942                        {
943                                dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
944                                                + ((Element) tagsList.item(i)).getAttribute("id") + ")");
945                                tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
946                        }
947                        else if (getCharCount((Element) tagsList.item(i), ",") < 10) {
948                                /**
949                                 * If there are not very many commas, and the number of
950                                 * non-paragraph elements is more than paragraphs or other
951                                 * ominous signs, remove the element.
952                                 **/
953                                final int p = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength();
954                                final int img = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength();
955                                final int li = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength() - 100;
956                                final int input = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength();
957
958                                int embedCount = 0;
959                                final NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed");
960                                for (int ei = 0, il = embeds.getLength(); ei < il; ei++) {
961                                        if (search(((Element) embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) {
962                                                embedCount++;
963                                        }
964                                }
965
966                                final float linkDensity = getLinkDensity((Element) tagsList.item(i));
967                                final int contentLength = getInnerText((Element) tagsList.item(i)).length();
968                                boolean toRemove = false;
969
970                                if (img > p) {
971                                        toRemove = true;
972                                } else if (li > p && tag != "ul" && tag != "ol") {
973                                        toRemove = true;
974                                } else if (input > Math.floor(p / 3)) {
975                                        toRemove = true;
976                                } else if (contentLength < 25 && (img == 0 || img > 2)) {
977                                        toRemove = true;
978                                } else if (weight < 25 && linkDensity > 0.2) {
979                                        toRemove = true;
980                                } else if (weight >= 25 && linkDensity > 0.5) {
981                                        toRemove = true;
982                                } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
983                                        toRemove = true;
984                                }
985
986                                if (img == 1 && p == 0 && contentLength == 0) {
987                                        final Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0);
988
989                                        String w = "";
990                                        if (theImg.getAttribute("width") != null)
991                                                w = theImg.getAttribute("width");
992
993                                        String h = "";
994                                        if (theImg.getAttribute("height") != null)
995                                                h = theImg.getAttribute("height");
996
997                                        if (!(w.equals("0") || h.equals("0")))
998                                                toRemove = false; // special case - it's just an inline
999                                        // image
1000                                }
1001
1002                                if (toRemove) {
1003                                        dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class") + ":"
1004                                                        + ((Element) tagsList.item(i)).getAttribute("id") + ")");
1005                                        tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
1006                                }
1007                        }
1008                }
1009        }
1010
1011        /**
1012         * Get the number of times a string s appears in the node e.
1013         *
1014         * @param Element
1015         * @param string
1016         *            - what to split on. Default is ","
1017         * @return number (integer)
1018         **/
1019        protected int getCharCount(Element e, String s) {
1020                return getInnerText(e).split(s).length - 1;
1021        }
1022
        /**
         * Convenience overload of {@link #getCharCount(Element, String)} that
         * passes "," as the string to count.
         *
         * @param e
         *            the element handed to the two-argument overload
         * @return the value returned by the two-argument overload
         */
        protected int getCharCount(Element e) {
                return getCharCount(e, ",");
        }
1026
1027        /**
1028         * @return The article title
1029         */
1030        public String getArticleTitle() {
1031                return articleTitle;
1032        }
1033
1034        /**
1035         * @return The content type of the article
1036         */
1037        public String getArticleContentType() {
1038                return article_contentType;
1039        }
1040
1041        /***
1042         * grabArticle - Using a variety of metrics (content score, classname,
1043         * element types), find the content that is most likely to be the stuff a
1044         * user wants to read. Then return it wrapped up in a div.
1045         *
1046         * @return Element
1047         **/
1048        protected Element grabArticle() {
1049                final boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS);
1050
1051                /**
1052                 * First, node prepping. Trash nodes that look cruddy (like ones with
1053                 * the class name "comment", etc), and turn divs into P tags where they
1054                 * have been used inappropriately (as in, where they contain no other
1055                 * block level elements.)
1056                 *
1057                 * Note: Assignment from index for performance. See
1058                 * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 Todo:
1059                 * Shouldn't this be a reverse traversal?
1060                 **/
1061                Element node = null;
1062                final List<Element> nodesToScore = new ArrayList<Element>();
1063                for (int nodeIndex = 0; (node = (Element) document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++)
1064                {
1065                        /* Remove unlikely candidates */
1066                        if (stripUnlikelyCandidates) {
1067                                final String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id");
1068                                if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 &&
1069                                                search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 &&
1070                                                !node.getTagName().equals("BODY"))
1071                                {
1072                                        dbg("Removing unlikely candidate - " + unlikelyMatchString);
1073                                        node.getParentNode().removeChild(node);
1074                                        nodeIndex--;
1075                                        continue;
1076                                }
1077                        }
1078
1079                        if (node.getTagName().equals("P") || node.getTagName().equals("TD")) {
1080                                nodesToScore.add(node);
1081                        }
1082
1083                        /*
1084                         * Turn all divs that don't have children block level elements into
1085                         * p's
1086                         */
1087                        if (node.getTagName().equals("DIV")) {
1088
1089                                if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) {
1090                                        dbg("Altering div to p");
1091                                        final Element newNode = document.createElement("P");
1092
1093                                        // newNode.innerHTML = node.innerHTML;
1094                                        final NodeList nl = node.getChildNodes();
1095                                        for (int i = 0; i < nl.getLength(); i++)
1096                                                newNode.appendChild(nl.item(i));
1097
1098                                        node.getParentNode().replaceChild(newNode, node);
1099                                        nodeIndex--;
1100                                }
1101                                else
1102                                {
1103                                        /* EXPERIMENTAL */
1104                                        for (int i = 0, il = node.getChildNodes().getLength(); i < il; i++) {
1105                                                final Node childNode = node.getChildNodes().item(i);
1106                                                if (childNode.getNodeType() == Element.TEXT_NODE) {
1107                                                        dbg("replacing text node with a p tag with the same content.");
1108                                                        final Element p = document.createElement("p");
1109                                                        // p.innerHTML = childNode.nodeValue;
1110                                                        p.setNodeValue(childNode.getNodeValue());
1111                                                        p.setTextContent(childNode.getTextContent());
1112                                                        // p.style.display = "inline";
1113                                                        p.setAttribute("class", "readability-styled");
1114                                                        childNode.getParentNode().replaceChild(p, childNode);
1115                                                }
1116                                        }
1117                                }
1118                        }
1119                }
1120
1121                /**
1122                 * Loop through all paragraphs, and assign a score to them based on how
1123                 * content-y they look. Then add their score to their parent node.
1124                 *
1125                 * A score is determined by things like number of commas, class names,
1126                 * etc. Maybe eventually link density.
1127                 **/
1128                final List<Element> candidates = new ArrayList<Element>();
1129                for (int pt = 0; pt < nodesToScore.size(); pt++) {
1130                        final Element parentNode = (Element) nodesToScore.get(pt).getParentNode();
1131                        final Element grandParentNode = (Element) parentNode.getParentNode();
1132                        final String innerText = getInnerText(nodesToScore.get(pt));
1133
1134                        /*
1135                         * If this paragraph is less than 25 characters, don't even count
1136                         * it.
1137                         */
1138                        if (innerText.length() < 25) {
1139                                continue;
1140                        }
1141
1142                        /* Initialize readability data for the parent. */
1143                        if (parentNode.getUserData("readability") == null)
1144                        {
1145                                initializeNode(parentNode);
1146                                candidates.add(parentNode);
1147                        }
1148
1149                        /* Initialize readability data for the grandparent. */
1150                        if (grandParentNode.getUserData("readability") == null)
1151                        {
1152                                initializeNode(grandParentNode);
1153                                candidates.add(grandParentNode);
1154                        }
1155
1156                        float contentScore = 0;
1157
1158                        /* Add a point for the paragraph itself as a base. */
1159                        contentScore++;
1160
1161                        /* Add points for any commas within this paragraph */
1162                        contentScore += innerText.split(",").length;
1163
1164                        /*
1165                         * For every 100 characters in this paragraph, add another point. Up
1166                         * to 3 points.
1167                         */
1168                        contentScore += Math.min(Math.floor(innerText.length() / 100F), 3F);
1169
1170                        /* Add the score to the parent. The grandparent gets half. */
1171                        parentNode.setUserData("readability", ((Float) (parentNode.getUserData("readability")) + contentScore), null);
1172                        grandParentNode.setUserData("readability", ((Float) (grandParentNode.getUserData("readability")))
1173                                        + (contentScore / 2F), null);
1174                }
1175
1176                /**
1177                 * After we've calculated scores, loop through all of the possible
1178                 * candidate nodes we found and find the one with the highest score.
1179                 **/
1180                Element topCandidate = null;
1181                for (int c = 0, cl = candidates.size(); c < cl; c++)
1182                {
1183                        /**
1184                         * Scale the final candidates score based on link density. Good
1185                         * content should have a relatively small link density (5% or less)
1186                         * and be mostly unaffected by this operation.
1187                         **/
1188
1189                        candidates.get(c).setUserData("readability",
1190                                        (Float) (candidates.get(c).getUserData("readability")) * (1F - getLinkDensity(candidates.get(c))),
1191                                        null);
1192
1193                        dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class") + ":"
1194                                        + candidates.get(c).getAttribute("id") + ") with score "
1195                                        + candidates.get(c).getUserData("readability"));
1196
1197                        if (topCandidate == null
1198                                        || (Float) (candidates.get(c).getUserData("readability")) > ((Float) topCandidate
1199                                                        .getUserData("readability")))
1200                        {
1201                                topCandidate = candidates.get(c);
1202                        }
1203                }
1204
1205                if (topCandidate != null)
1206                        dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class") + ":"
1207                                        + topCandidate.getAttribute("id") + ") with score " + topCandidate.getUserData("readability"));
1208
1209                /**
1210                 * If we still have no top candidate, just use the body as a last
1211                 * resort. We also have to copy the body node so it is something we can
1212                 * modify.
1213                 **/
1214                if (topCandidate == null || topCandidate.getTagName().equals("BODY"))
1215                {
1216                        topCandidate = document.createElement("DIV");
1217
1218                        // topCandidate.innerHTML = document.body.innerHTML;
1219                        final NodeList nl = getBody().getChildNodes();
1220                        for (int i = 0; i < nl.getLength(); i++)
1221                                topCandidate.appendChild(nl.item(i));
1222                        // document.body.innerHTML = ""; //should be covered by above
1223
1224                        getBody().appendChild(topCandidate);
1225                        initializeNode(topCandidate);
1226                }
1227
1228                /**
1229                 * Now that we have the top candidate, look through its siblings for
1230                 * content that might also be related. Things like preambles, content
1231                 * split by ads that we removed, etc.
1232                 **/
1233                final Element articleContent = document.createElement("DIV");
1234                articleContent.setAttribute("id", "readability-content");
1235                final float siblingScoreThreshold = Math.max(10F, (Float) topCandidate.getUserData("readability") * 0.2F);
1236                final NodeList siblingNodes = topCandidate.getParentNode().getChildNodes();
1237
1238                for (int s = 0, sl = siblingNodes.getLength(); s < sl; s++)
1239                {
1240                        final Node siblingNode = siblingNodes.item(s);
1241                        boolean append = false;
1242
1243                        if (siblingNode instanceof Element)
1244                                dbg("Looking at sibling node: "
1245                                                + siblingNode
1246                                                + " ("
1247                                                + ((Element) siblingNode).getAttribute("class")
1248                                                + ":"
1249                                                + ((Element) siblingNode).getAttribute("id")
1250                                                + ")"
1251                                                + ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode
1252                                                                .getUserData("readability")) : ""));
1253                        dbg("Sibling has score "
1254                                        + (siblingNode.getUserData("readability") != null ? siblingNode.getUserData("readability")
1255                                                        : "Unknown"));
1256
1257                        if (siblingNode == topCandidate)
1258                        {
1259                                append = true;
1260                        }
1261
1262                        float contentBonus = 0;
1263                        /*
1264                         * Give a bonus if sibling nodes and top candidates have the example
1265                         * same classname
1266                         */
1267                        if (siblingNode instanceof Element
1268                                        && ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class"))
1269                                        && !topCandidate.getAttribute("class").equals(""))
1270                        {
1271                                contentBonus += (Float) topCandidate.getUserData("readability") * 0.2F;
1272                        }
1273
1274                        if (siblingNode.getUserData("readability") != null
1275                                        && ((Float) siblingNode.getUserData("readability") + contentBonus) >= siblingScoreThreshold)
1276                        {
1277                                append = true;
1278                        }
1279
1280                        if (siblingNode.getNodeName().equals("P")) {
1281                                final float linkDensity = getLinkDensity((Element) siblingNode);
1282                                final String nodeContent = getInnerText((Element) siblingNode);
1283                                final int nodeLength = nodeContent.length();
1284
1285                                if (nodeLength > 80 && linkDensity < 0.25)
1286                                {
1287                                        append = true;
1288                                }
1289                                else if (nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1)
1290                                {
1291                                        append = true;
1292                                }
1293                        }
1294
1295                        if (append)
1296                        {
1297                                dbg("Appending node: " + siblingNode);
1298
1299                                Node nodeToAppend = null;
1300                                if (!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) {
1301                                        /*
1302                                         * We have a node that isn't a common block level element,
1303                                         * like a form or td tag. Turn it into a div so it doesn't
1304                                         * get filtered out later by accident.
1305                                         */
1306
1307                                        dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div.");
1308                                        nodeToAppend = document.createElement("div");
1309                                        if (siblingNode instanceof Element)
1310                                                ((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id"));
1311
1312                                        // nodeToAppend.innerHTML = siblingNode.innerHTML;
1313                                        final NodeList nl = siblingNode.getChildNodes();
1314                                        for (int i = 0; i < nl.getLength(); i++)
1315                                                nodeToAppend.appendChild(nl.item(i));
1316                                } else {
1317                                        nodeToAppend = siblingNode;
1318                                        s--;
1319                                        sl--;
1320                                }
1321
1322                                /*
1323                                 * To ensure a node does not interfere with readability styles,
1324                                 * remove its classnames
1325                                 */
1326                                if (nodeToAppend instanceof Element)
1327                                        ((Element) nodeToAppend).setAttribute("class", "");
1328
1329                                /*
1330                                 * Append sibling and subtract from our list because it removes
1331                                 * the node when you append to another node
1332                                 */
1333                                articleContent.appendChild(nodeToAppend);
1334                        }
1335                }
1336
1337                /**
1338                 * So we have all of the content that we need. Now we clean it up for
1339                 * presentation.
1340                 **/
1341                prepArticle(articleContent);
1342
1343                return articleContent;
1344        }
1345
1346        protected String getInnerHTML(Node n) {
1347                if (n.getNodeType() == Node.TEXT_NODE)
1348                        return n.getTextContent();
1349
1350                String result = "";
1351                final NodeList nl = n.getChildNodes();
1352                for (int i = 0; i < nl.getLength(); i++) {
1353                        if (nl.item(i).getNodeType() == Node.TEXT_NODE)
1354                                result += nl.item(i).getTextContent();
1355                        else if (nl.item(i).getNodeType() == Node.COMMENT_NODE)
1356                                result += "<!-- " + nl.item(i).getTextContent() + " -->";
1357                        else
1358                                result += nodeToString(nl.item(i));
1359                }
1360
1361                return result;
1362        }
1363
1364        protected String nodeToString(Node n) {
1365                return nodeToString(n, false);
1366        }
1367
1368        protected static String nodeToString(Node n, boolean pretty) {
1369                try {
1370                        final DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
1371                        final DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS");
1372                        final LSSerializer writer = impl.createLSSerializer();
1373
1374                        writer.getDomConfig().setParameter("xml-declaration", false);
1375                        if (pretty) {
1376                                writer.getDomConfig().setParameter("format-pretty-print", true);
1377                        }
1378
1379                        return writer.writeToString(n);
1380                } catch (final Exception e) {
1381                        throw new RuntimeException(e);
1382                }
1383        }
1384
1385        protected Node stringToNode(String str) {
1386                try {
1387                        final DOMFragmentParser parser = new DOMFragmentParser();
1388                        final DocumentFragment fragment = document.createDocumentFragment();
1389                        parser.parse(new InputSource(new StringReader(str)), fragment);
1390                        return fragment;
1391
1392                        // try and return the element itself if possible...
1393                        // NodeList nl = fragment.getChildNodes();
1394                        // for (int i=0; i<nl.getLength(); i++) if (nl.item(i).getNodeType()
1395                        // == Node.ELEMENT_NODE) return nl.item(i);
1396                        // return fragment;
1397
1398                } catch (final Exception e) {
1399                        throw new RuntimeException(e);
1400                }
1401        }
1402
1403        /**
1404         * Get the inner text of a node - cross browser compatibly. This also strips
1405         * out any excess whitespace to be found.
1406         *
1407         * @param Element
1408         * @return string
1409         **/
1410        protected String getInnerText(Element e, boolean normalizeSpaces) {
1411                String textContent = "";
1412
1413                textContent = e.getTextContent().replaceAll(Regexps.trimRe, "");
1414
1415                if (normalizeSpaces) {
1416                        return textContent.replaceAll(Regexps.normalizeRe, " ");
1417                } else {
1418                        return textContent;
1419                }
1420        }
1421
1422        protected String getInnerTextSep(Node e) {
1423                if (e.hasChildNodes()) {
1424                        String s = "";
1425                        final NodeList nl = e.getChildNodes();
1426                        for (int i = 0; i < nl.getLength(); i++) {
1427                                if (!nl.item(i).getNodeName().equalsIgnoreCase("script"))
1428                                        s += getInnerTextSep(nl.item(i));
1429                        }
1430                        return s;
1431                } else {
1432                        return e.getTextContent() + " ";
1433                }
1434        }
1435
1436        protected String getInnerText(Element e) {
1437                return getInnerText(e, true);
1438        }
1439
1440        /**
1441         * @return The article HTML content as a {@link String}.
1442         */
1443        public String getArticleHTML() {
1444                if (articleContent == null)
1445                        return "";
1446                return nodeToString(articleContent, true);
1447        }
1448
1449        /**
1450         * @return The articles HTML dom node.
1451         */
1452        public Node getArticleHTML_DOM() {
1453                return articleContent;
1454        }
1455
1456        protected String getArticleDateString() {
1457                return article_date_string;
1458        }
1459
1460        /**
1461         * @return The article date.
1462         */
1463        public Date getArticleDate() {
1464                return article_date;
1465        }
1466
1467        /**
1468         * @return The text of the article.
1469         */
1470        public String getArticleText() {
1471                if (articleContent == null)
1472                        return "Unable to find article content";
1473                // return getInnerText(articleContent, false);
1474                return articleContent.getTextContent().trim().replaceAll("[\r|\n|\r\n]{2,}", "\n\n").replaceAll(" {2,}", " ");
1475        }
1476
1477        /**
1478         * @return Any links in the article.
1479         */
1480        public List<Anchor> getArticleLinks() {
1481                final List<Anchor> anchors = new ArrayList<Anchor>();
1482                if (articleContent == null)
1483                        return anchors;
1484
1485                final NodeList nl = articleContent.getElementsByTagName("a");
1486                for (int i = 0; i < nl.getLength(); i++) {
1487                        final Element a = (Element) nl.item(i);
1488
1489                        final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1490                        anchors.add(anchor);
1491                }
1492                return anchors;
1493        }
1494
1495        /**
1496         * @return Any links in the document.
1497         */
1498        public List<Anchor> getAllLinks() {
1499                final List<Anchor> anchors = new ArrayList<Anchor>();
1500
1501                final NodeList nl = document.getElementsByTagName("a");
1502                for (int i = 0; i < nl.getLength(); i++) {
1503                        final Element a = (Element) nl.item(i);
1504                        final Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
1505                        anchors.add(anchor);
1506                }
1507                return anchors;
1508        }
1509
1510        /**
1511         * @return Any images in the article.
1512         */
1513        public List<String> getArticleImages() {
1514                final List<String> images = new ArrayList<String>();
1515                if (articleContent == null)
1516                        return images;
1517
1518                final NodeList nl = articleContent.getElementsByTagName("img");
1519                for (int i = 0; i < nl.getLength(); i++) {
1520                        final Element img = (Element) nl.item(i);
1521                        images.add(img.getAttribute("src"));
1522                }
1523                return images;
1524        }
1525
1526        /**
1527         * @return Any subheadings in the article.
1528         */
1529        public List<String> getArticleSubheadings() {
1530                final List<String> subtitles = new ArrayList<String>();
1531                if (articleContent == null)
1532                        return subtitles;
1533
1534                for (int j = 1; j <= 6; j++) {
1535                        final NodeList nl = articleContent.getElementsByTagName("h" + j);
1536                        if (nl.getLength() > 0) {
1537                                for (int i = 0; i < nl.getLength(); i++) {
1538                                        subtitles.add(nl.item(i).getTextContent());
1539                                }
1540                                break;
1541                        }
1542                }
1543
1544                if (subtitles.size() == 0) {
1545                        // try looking for other likely-looking elements
1546
1547                        final NodeList nl = articleContent.getElementsByTagName("*");
1548                        for (int i = 0; i < nl.getLength(); i++) {
1549                                if (nl.item(i) instanceof Element &&
1550                                                ((Element) nl.item(i)).getAttribute("class") != null &&
1551                                                search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1)
1552                                        subtitles.add(nl.item(i).getTextContent());
1553                        }
1554                }
1555
1556                return subtitles;
1557        }
1558
1559        protected List<Node> findChildNodesWithName(Node parent, String name) {
1560                final NodeList children = parent.getChildNodes();
1561                final List<Node> results = new ArrayList<Node>();
1562
1563                for (int i = 0; i < children.getLength(); ++i) {
1564                        final Node child = children.item(i);
1565                        if (child == null)
1566                                continue;
1567
1568                        final String nodeName = child.getNodeName();
1569                        if (nodeName == null)
1570                                continue;
1571
1572                        if (nodeName.equals(name)) {
1573                                results.add(child);
1574                        }
1575                }
1576                return results;
1577        }
1578
1579        protected int findChildNodeIndex(Node parent, Node childToFind)
1580        {
1581                for (int index = 0; index < parent.getChildNodes().getLength(); index++)
1582                        if (parent.getChildNodes().item(index) == childToFind)
1583                                return index;
1584                return -1;
1585        }
1586
1587        protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException {
1588                final Node parend = walker.getCurrentNode();
1589
1590                if (parend.getNodeType() == Node.TEXT_NODE && parend.getParentNode().getAttributes().getNamedItem("id") != null)
1591                {
1592                        if (parend.getTextContent().trim().length() > 0)
1593                        {
1594                                final int index = findChildNodeIndex(parend.getParentNode(), parend);
1595                                if (index != -1)
1596                                {
1597                                        // square brackets are not valid XML/HTML identifier
1598                                        // characters, so we can use them here
1599                                        map.add(new MappingNode(
1600                                                        parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "[" + index + "]",
1601                                                        parend.getNodeValue()));
1602
1603                                        // System.out.println(
1604                                        // "ELEMENT '"+parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue()
1605                                        // + "["+index+"]"+"'");
1606                                        // System.out.println( "VALUE:  '"+parend.getNodeValue()+"'"
1607                                        // );
1608                                }
1609                        }
1610                }
1611
1612                // traverse children:
1613                for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
1614                        getArticleTextMapping(walker, map);
1615                }
1616
1617                // return position to the current (level up):
1618                walker.setCurrentNode(parend);
1619        }
1620
1621        protected class MappingNode {
1622                String id;
1623                String text;
1624
1625                public MappingNode(String id, String text) {
1626                        this.id = id;
1627                        this.text = text;
1628                }
1629
1630                public String getId() {
1631                        return id;
1632                }
1633
1634                public String getText() {
1635                        return text;
1636                }
1637
1638                @Override
1639                public String toString() {
1640                        return "MappingNode(" + id + " -> " + text + ")";
1641                }
1642        }
1643
1644        /**
1645         * Get the mapping between bits of text in the dom & their xpaths
1646         *
1647         * @return mapping from xpath to text
1648         */
1649        public List<MappingNode> getArticleTextMapping() {
1650                if (articleContent == null)
1651                        return null;
1652
1653                final List<MappingNode> map = new ArrayList<MappingNode>();
1654
1655                final TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT
1656                                | NodeFilter.SHOW_ELEMENT, null, true);
1657
1658                getArticleTextMapping(walker, map);
1659
1660                return map;
1661        }
1662
1663        /**
1664         * Convenience method to build a {@link Readability} instance from an html
1665         * string.
1666         *
1667         * @param html
1668         *            The html string
1669         * @return new {@link Readability} instance.
1670         * @throws SAXException
1671         * @throws IOException
1672         */
1673        public static Readability getReadability(String html) throws SAXException, IOException {
1674                return getReadability(html, false);
1675        }
1676
1677        /**
1678         * Convenience method to build a {@link Readability} instance from an html
1679         * string.
1680         *
1681         * @param html
1682         *            The html string
1683         * @param addTitle
1684         *            Should the title be added to the generated article?
1685         * @return new {@link Readability} instance.
1686         * @throws SAXException
1687         * @throws IOException
1688         */
1689        public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
1690                final DOMParser parser = new DOMParser();
1691                parser.parse(new InputSource(new StringReader(html)));
1692
1693                return new Readability(parser.getDocument(), false, addTitle);
1694        }
1695
1696        /**
1697         * Testing
1698         *
1699         * @param argv
1700         * @throws Exception
1701         */
1702        public static void main(String[] argv) throws Exception {
1703                // URL input = new
1704                // URL("file:///home/dd/Programming/Readability4J/t.html");
1705                // URL input = new
1706                // URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
1707                final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
1708                // URL input = new URL("http://euobserver.com/9/30465");
1709                // URL input = new URL("http://euobserver.com/?aid=23383");
1710                // URL input = new
1711                // URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
1712                // URL input = new URL("file:///Users/jsh2/Desktop/test.html");
1713                // URL input = new
1714                // URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
1715                // URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
1716                // URL input = new
1717                // URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
1718                // URL input = new
1719                // URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
1720                // URL input = new
1721                // URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
1722                // URL input = new URL("http://www.thebigproject.co.uk/news/");
1723                // URL input = new
1724                // URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
1725                // URL input = new
1726                // URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");
1727
1728                // URL input = new
1729                // URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
1730                // URL input = new
1731                // URL("http://uk.mac.ign.com/articles/573/573319p1.html");
1732                final DOMParser parser = new DOMParser();
1733                parser.parse(new InputSource(input.openStream()));
1734
1735                final Readability r = new Readability(parser.getDocument(), true, true);
1736
1737                // System.out.println(r.getArticleTitle());
1738                System.out.println(r.getArticleHTML());
1739                // System.out.println(r.getAllLinks());
1740                // System.out.println(r.getArticleText());
1741
1742                System.out.println();
1743                System.out.println("***");
1744                System.out.println();
1745
1746                for (final MappingNode s : r.getArticleTextMapping())
1747                        System.out.println(s);
1748
1749                // PrintStream out = new PrintStream("news-sites");
1750                // for (Anchor anchor : r.getAllLinks()) {
1751                // out.println(anchor.getHref() + "\t" + anchor.getText());
1752                // }
1753                // out.close();
1754
1755                System.out.println(r.getArticleImages());
1756                // System.out.println(r.getArticleSubheadings());
1757                // System.out.println(r.getArticleHTML());
1758                // System.out.println(r.getArticleHTML_DOM());
1759
1760                // System.out.println(r.getArticleDateString());
1761                // System.out.println(r.getArticleDate());
1762        }
1763}