001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.web;
031
032import java.io.File;
033import java.io.FileInputStream;
034import java.io.IOException;
035import java.io.PrintStream;
036import java.net.MalformedURLException;
037import java.net.URL;
038
039import org.cyberneko.html.parsers.DOMParser;
040import org.kohsuke.args4j.CmdLineException;
041import org.kohsuke.args4j.CmdLineParser;
042import org.openimaj.web.readability.Anchor;
043import org.openimaj.web.readability.Readability;
044import org.xml.sax.InputSource;
045import org.xml.sax.SAXException;
046
047/**
048 * Command-line driver for the readability4j engine.
049 * 
050 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
051 *
052 */
053public class Reader {
054        /**
055         * Main method.
056         * @param args
057         * @throws MalformedURLException
058         * @throws IOException
059         * @throws SAXException
060         */
061        public static void main(String [] args) throws MalformedURLException, IOException, SAXException {
062                ReaderOptions options = new ReaderOptions();
063                CmdLineParser parser = new CmdLineParser(options);
064                
065                PrintStream out = new PrintStream(System.out, true, "UTF-8");
066                
067            try {
068                    parser.parseArgument(args);
069                    options.validate();
070                } catch(CmdLineException e) {
071                    System.err.println(e.getMessage());
072                    System.err.println("Usage: java -jar Readability4J.jar [options...] files_or_urls");
073                    parser.printUsage(System.err);
074                    return;
075                }
076                
077                for (String document : options.getDocuments()) {
078                        InputSource is = null;
079                        
080                        if (document.contains("://")) {
081                                is = new InputSource(new URL(document).openStream());
082                        } else {
083                                is = new InputSource(new FileInputStream(new File(document)));
084                        }
085                        
086                        DOMParser domparser = new DOMParser();
087                        domparser.parse(is);
088                        
089                        Readability r = new Readability(domparser.getDocument(), options.isDebug());
090                        
091                        if (options.isMultiDocument()) {
092                                //print document location if parsing multiple
093                                out.println("*** Document: " + document + " ***");
094                        }
095                        
096                        if (options.isTitle()) {
097                                if (options.isMultiMode()) 
098                                        out.println("* TITLE *");
099                                out.println(r.getArticleTitle());
100                        }
101                        
102                        if (options.isSubhead()) {
103                                if (options.isMultiMode()) 
104                                        out.println("* SUB-HEADINGS *");
105                                
106                                for (String heading : r.getArticleSubheadings()) {
107                                        out.println(heading);
108                                }
109                        }
110
111                        if (options.isDate()) {
112                                if (options.isMultiMode()) 
113                                        out.println("* DATE *");
114
115                                out.println(r.getArticleDate());
116                        }
117                        
118                        if (options.isHtml()) {
119                                if (options.isMultiMode()) 
120                                        out.println("* HTML *");
121                                out.println(r.getArticleHTML());
122                        }
123                        
124                        if (options.isText()) {
125                                if (options.isMultiMode()) 
126                                        out.println("* TEXT *");
127                                out.println(r.getArticleText());
128                        }
129                        
130                        if (options.isLinks()) {
131                                if (options.isMultiMode()) 
132                                        out.println("* LINKS *");
133                                
134                                for (Anchor a : r.getArticleLinks()) {
135                                        out.println(a.getHref() + "\t" + a.getText());
136                                }
137                        }
138                        
139                        if (options.isImages()) {
140                                if (options.isMultiMode()) 
141                                        out.println("* IMAGES *");
142                                
143                                for (String img : r.getArticleImages()) {
144                                        out.println(img);
145                                }
146                        }
147                }
148        }
149}