public class Readability extends Object
Modifier and Type | Class and Description |
---|---|
protected class |
Readability.MappingNode |
protected static class |
Readability.Regexps
Regular expressions for different types of content
|
Modifier and Type | Field and Description |
---|---|
protected boolean |
addTitle |
protected String |
article_contentType |
protected Date |
article_date |
protected String |
article_date_string |
protected Element |
articleContent |
protected String |
articleTitle |
protected boolean |
debug |
protected Document |
document |
protected EnumSet<org.openimaj.web.readability.Readability.Flag> |
flags |
static float |
LINK_DENSITY_THRESHOLD
Threshold for removing elements with lots of links
|
Constructor and Description |
---|
Readability(Document document)
Construct with the given document.
|
Readability(Document document,
boolean debug)
Construct with the given document.
|
Readability(Document document,
boolean debug,
boolean addTitle)
Construct with the given document.
|
Modifier and Type | Method and Description |
---|---|
static void |
augmentDocument(Document document)
Iterates through all the ELEMENT nodes in a document and gives them ids
if they don't already have them.
|
protected void |
clean(Element e,
String tag)
Clean a node of all elements of type "tag".
|
protected void |
cleanConditionally(Element e,
String tag)
Clean an element of all tags of type "tag" if they look fishy.
|
protected void |
cleanHeaders(Element e)
Clean out spurious headers from an Element.
|
protected void |
cleanStyles() |
protected void |
cleanStyles(Element e)
Remove the style attribute on every e and under.
|
protected void |
dbg(String s) |
protected void |
findArticleDate() |
protected void |
findArticleEncoding() |
protected String |
findArticleTitle()
Get the article title.
|
protected int |
findChildNodeIndex(Node parent,
Node childToFind) |
protected List<Node> |
findChildNodesWithName(Node parent,
String name) |
List<Anchor> |
getAllLinks() |
String |
getArticleContentType() |
Date |
getArticleDate() |
protected String |
getArticleDateString() |
Node |
getArticleHTML_DOM() |
String |
getArticleHTML() |
List<String> |
getArticleImages() |
List<Anchor> |
getArticleLinks() |
List<String> |
getArticleSubheadings() |
String |
getArticleText() |
List<Readability.MappingNode> |
getArticleTextMapping()
Get the mapping between pieces of text in the DOM and their XPaths.
|
protected void |
getArticleTextMapping(org.w3c.dom.traversal.TreeWalker walker,
List<Readability.MappingNode> map) |
String |
getArticleTitle() |
protected Element |
getBody()
Equivalent to document.body in JS
|
protected int |
getCharCount(Element e) |
protected int |
getCharCount(Element e,
String s)
Get the number of times a string s appears in the node e.
|
protected int |
getClassWeight(Element e)
Get an element's class/id weight.
|
protected String |
getInnerHTML(Node n) |
protected String |
getInnerText(Element e) |
protected String |
getInnerText(Element e,
boolean normalizeSpaces)
Get the inner text of a node in a cross-browser compatible way.
|
protected String |
getInnerTextSep(Node e) |
protected float |
getLinkDensity(Element e)
Get the density of links as a percentage of the content. This is the
amount of text that is inside a link divided by the total text in the
node.
|
static Readability |
getReadability(String html)
Convenience method to build a
Readability instance from an html
string. |
static Readability |
getReadability(String html,
boolean addTitle)
Convenience method to build a
Readability instance from an html
string. |
protected String |
getTitle() |
protected Element |
grabArticle()
grabArticle - Using a variety of metrics (content score, classname,
element types), find the content that is most likely to be the stuff a
user wants to read.
|
boolean |
hasContent() |
protected void |
init()
Runs readability.
|
protected void |
initializeNode(Element node)
Initialize a node with the readability object.
|
protected void |
killBreaks(Element e)
Remove extraneous break tags from a node.
|
static void |
main(String[] argv)
Testing
|
protected String[] |
match(String input,
String regex)
Javascript-like String.match
|
protected String |
nodeToString(Node n) |
protected static String |
nodeToString(Node n,
boolean pretty) |
protected void |
parseDate() |
protected void |
prepArticle(Element articleContent)
Prepare the article node for display.
|
protected void |
prepDocument()
Prepare the HTML document for readability to scrape it.
|
protected void |
removeChildren(Node n) |
protected void |
removeComments(Node n) |
protected int |
search(String input,
String regex)
Javascript-like String.search
|
protected Node |
stringToNode(String str) |
public static float LINK_DENSITY_THRESHOLD
protected String articleTitle
protected Element articleContent
protected String article_date_string
protected Date article_date
protected String article_contentType
protected boolean debug
protected boolean addTitle
public Readability(Document document)
document
- The document.
public Readability(Document document, boolean debug)
document
- The document.
debug
- Enable debugging output.
public Readability(Document document, boolean debug, boolean addTitle)
document
- The document.
debug
- Enable debugging output.
addTitle
- Add title to output.
public static void augmentDocument(Document document)
document
-
protected String[] match(String input, String regex)
input
-
regex
-
public boolean hasContent()
protected int search(String input, String regex)
input
-
regex
-
protected void findArticleEncoding()
protected void findArticleDate()
protected void parseDate()
protected String findArticleTitle()
protected void init()
protected void prepDocument()
protected void removeComments(Node n)
protected void prepArticle(Element articleContent)
tags, etc.
Element
-
protected void removeChildren(Node n)
protected void initializeNode(Element node)
Element
-
protected int getClassWeight(Element e)
Element
-
protected void cleanStyles()
protected void cleanStyles(Element e)
Element
-
protected void killBreaks(Element e)
Element
-
protected void clean(Element e, String tag)
Element
-
string
- tag to clean
protected void cleanHeaders(Element e)
Element
-
protected float getLinkDensity(Element e)
Element
-
protected void cleanConditionally(Element e, String tag)
protected int getCharCount(Element e, String s)
Element
-
string
- what to split on. Default is ","
protected int getCharCount(Element e)
public String getArticleTitle()
public String getArticleContentType()
protected Element grabArticle()
protected String getInnerHTML(Node n)
protected String nodeToString(Node n)
protected static String nodeToString(Node n, boolean pretty)
protected Node stringToNode(String str)
protected String getInnerText(Element e, boolean normalizeSpaces)
Element
-
protected String getInnerTextSep(Node e)
protected String getInnerText(Element e)
public String getArticleHTML()
String.
public Node getArticleHTML_DOM()
protected String getArticleDateString()
public Date getArticleDate()
public String getArticleText()
public List<Anchor> getArticleLinks()
public List<Anchor> getAllLinks()
public List<String> getArticleImages()
public List<String> getArticleSubheadings()
protected List<Node> findChildNodesWithName(Node parent, String name)
protected int findChildNodeIndex(Node parent, Node childToFind)
protected void getArticleTextMapping(org.w3c.dom.traversal.TreeWalker walker, List<Readability.MappingNode> map) throws DOMException
DOMException
public List<Readability.MappingNode> getArticleTextMapping()
public static Readability getReadability(String html) throws SAXException, IOException
Convenience method to build a
Readability
instance from an html
string.
html
- The html string
Readability
instance.
SAXException
IOException
public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException
Convenience method to build a
Readability
instance from an html
string.
html
- The html string
addTitle
- Should the title be added to the generated article?
Readability
instance.
SAXException
IOException