http://get.ftqq.com/122.get
https://github.com/wuman/JReadability/blob/master/src/main/java/com/wuman/jreadability/Readability.java
/**
 * Extracts the main article content from {@code mDocument} using the
 * Arc90 Readability heuristic: strip "unlikely" nodes, convert
 * block-less divs to paragraphs, score paragraphs by text and comma
 * density, pick the highest-scoring candidate, then pull in related
 * sibling content.
 *
 * @param preserveUnlikelyCandidates when {@code true}, skip the
 *            class/id based removal of unlikely nodes (comments,
 *            sidebars, footers, etc.)
 * @return a new {@code <div id="readability-content">} element holding
 *         the extracted article, already cleaned by {@code prepArticle}
 */
protected Element grabArticle(boolean preserveUnlikelyCandidates) {
    /*
     * First, node prepping. Trash nodes that look cruddy (like ones
     * with the class name "comment", etc), and turn divs into P tags
     * where they have been used inappropriately (as in, where they
     * contain no other block level elements).
     */
    for (Element node : mDocument.getAllElements()) {
        /* Remove unlikely candidates. */
        if (!preserveUnlikelyCandidates) {
            String unlikelyMatchString = node.className() + node.id();
            Matcher unlikelyCandidatesMatcher = Patterns.get(
                    Patterns.RegEx.UNLIKELY_CANDIDATES).matcher(
                    unlikelyMatchString);
            Matcher maybeCandidateMatcher = Patterns.get(
                    Patterns.RegEx.OK_MAYBE_ITS_A_CANDIDATE).matcher(
                    unlikelyMatchString);
            /*
             * FIX: remove a node only when it matches the "unlikely"
             * pattern and does NOT match the "ok, maybe it's a
             * candidate" pattern. The previous code required BOTH
             * patterns to match, inverting the reference JS logic
             * (search(unlikelyCandidates) !== -1 &&
             * search(okMaybeItsACandidate) === -1), which removed
             * legitimate content and kept junk.
             */
            if (unlikelyCandidatesMatcher.find()
                    && !maybeCandidateMatcher.find()
                    && !"body".equalsIgnoreCase(node.tagName())) {
                dbg("Removing unlikely candidate - " + unlikelyMatchString);
                node.remove();
                continue;
            }
        }
        /*
         * Turn all divs that don't have child block-level elements
         * into p's, so they participate in paragraph scoring below.
         */
        if ("div".equalsIgnoreCase(node.tagName())) {
            Matcher matcher = Patterns
                    .get(Patterns.RegEx.DIV_TO_P_ELEMENTS).matcher(
                            node.html());
            if (!matcher.find()) {
                dbg("Altering div to p: " + node);
                try {
                    node.tagName("p");
                } catch (Exception e) {
                    dbg("Could not alter div to p, probably an IE restriction, reverting back to div.",
                            e);
                }
            }
        }
    }
    /*
     * Loop through all paragraphs and assign a score to them based on
     * how content-y they look (commas, text length). The score is added
     * to the parent node; the grandparent gets half.
     */
    Elements allParagraphs = mDocument.getElementsByTag("p");
    ArrayList<Element> candidates = new ArrayList<Element>();
    for (Element node : allParagraphs) {
        Element parentNode = node.parent();
        /* FIX: guard against orphan paragraphs, mirroring the JS
         * reference which skips nodes without a usable parent. */
        if (parentNode == null) {
            continue;
        }
        Element grandParentNode = parentNode.parent();
        String innerText = getInnerText(node, true);
        /* Paragraphs under 25 characters don't count at all. */
        if (innerText.length() < 25) {
            continue;
        }
        /* Initialize readability data for the parent. */
        if (!parentNode.hasAttr("readabilityContentScore")) {
            initializeNode(parentNode);
            candidates.add(parentNode);
        }
        /* Initialize readability data for the grandparent (may be
         * absent, e.g. a paragraph directly under a root element). */
        if (grandParentNode != null
                && !grandParentNode.hasAttr("readabilityContentScore")) {
            initializeNode(grandParentNode);
            candidates.add(grandParentNode);
        }
        /* One base point for the paragraph itself. */
        int contentScore = 1;
        /* One point per comma within this paragraph. */
        contentScore += innerText.split(",").length;
        /* One point per 100 characters, capped at 3 points.
         * (int division already floors; Math.floor was redundant.) */
        contentScore += Math.min(innerText.length() / 100, 3);
        /* Parent gets the full score; grandparent gets half
         * (integer half, truncated — the JS original keeps the
         * fraction; kept as-is to preserve this port's scoring). */
        incrementContentScore(parentNode, contentScore);
        if (grandParentNode != null) {
            incrementContentScore(grandParentNode, contentScore / 2);
        }
    }
    /*
     * After scoring, find the candidate with the highest score. Each
     * score is first scaled by (1 - link density): good content has a
     * small link density (5% or less) and is mostly unaffected.
     */
    Element topCandidate = null;
    for (Element candidate : candidates) {
        scaleContentScore(candidate, 1 - getLinkDensity(candidate));
        dbg("Candidate: (" + candidate.className() + ":" + candidate.id()
                + ") with score " + getContentScore(candidate));
        if (topCandidate == null
                || getContentScore(candidate) > getContentScore(topCandidate)) {
            topCandidate = candidate;
        }
    }
    /*
     * If we still have no top candidate (or it is the body itself),
     * wrap the whole body content in a fresh div we can safely modify.
     */
    if (topCandidate == null
            || "body".equalsIgnoreCase(topCandidate.tagName())) {
        topCandidate = mDocument.createElement("div");
        topCandidate.html(mDocument.body().html());
        mDocument.body().html("");
        mDocument.body().appendChild(topCandidate);
        initializeNode(topCandidate);
    }
    /*
     * Look through the top candidate's siblings for content that might
     * also be related: preambles, content split by ads we removed, etc.
     */
    Element articleContent = mDocument.createElement("div");
    articleContent.attr("id", "readability-content");
    int siblingScoreThreshold = Math.max(10,
            (int) (getContentScore(topCandidate) * 0.2f));
    Elements siblingNodes = topCandidate.parent().children();
    for (Element siblingNode : siblingNodes) {
        boolean append = false;
        dbg("Looking at sibling node: (" + siblingNode.className() + ":"
                + siblingNode.id() + ")" + " with score "
                + getContentScore(siblingNode));
        if (siblingNode == topCandidate) {
            append = true;
        }
        if (getContentScore(siblingNode) >= siblingScoreThreshold) {
            append = true;
        }
        /* Paragraph siblings qualify on their own merits: long with
         * few links, or short, link-free and sentence-terminated. */
        if ("p".equalsIgnoreCase(siblingNode.tagName())) {
            float linkDensity = getLinkDensity(siblingNode);
            String nodeContent = getInnerText(siblingNode, true);
            int nodeLength = nodeContent.length();
            if (nodeLength > 80 && linkDensity < 0.25f) {
                append = true;
            } else if (nodeLength < 80 && linkDensity == 0.0f
                    && nodeContent.matches(".*\\.( |$).*")) {
                append = true;
            }
        }
        if (append) {
            dbg("Appending node: " + siblingNode);
            /*
             * appendChild moves the node out of its old parent, so it
             * also disappears from the original DOM position.
             * (Elements is a snapshot list, so iteration stays safe.)
             */
            articleContent.appendChild(siblingNode);
        }
    }
    /*
     * We have all of the content that we need; now clean it up for
     * presentation.
     */
    prepArticle(articleContent);
    return articleContent;
}
JS版本的Readability是最好用的,它可以直接在浏览器完成分析,于是用户还可以人工对分析出来的内容进行修改和校正。
首先,它定义了一系列正则:
regexps: {
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
replaceFonts: /<(\/?)font[^>]*>/gi,
trim: /^\s+|\s+$/g,
normalize: /\s{2,}/g,
killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g,
videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
prevLink: /(prev|earl|old|new|<|«)/i
},
可以看到,标签和文字都有加权或降权分组。整个内容分析是通过grabArticle函数来实现的。
首先开始遍历节点
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1)
然后将不像内容的元素去掉
if (stripUnlikelyCandidates)
{
var unlikelyMatchString = node.className + node.id;
if (
(
unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
node.tagName !== "BODY"
)
)
{
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
nodeIndex-=1;
continue;
}
}
将DIV替换为P标签后,再对目标节点进行遍历,进行计分:
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
var parentNode = nodesToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = readability.getInnerText(nodesToScore[pt]);
if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
continue;
}
/* If this paragraph is less than 25 characters, don't even count it. */
if(innerText.length < 25) {
continue; }
/* Initialize readability data for the parent. */
if(typeof parentNode.readability === 'undefined') {
readability.initializeNode(parentNode);
candidates.push(parentNode);
}
/* Initialize readability data for the grandparent. */
if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
readability.initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore+=1;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
grandParentNode.readability.contentScore += contentScore/2;
}
}
最后根据分值,重新拼接内容
var articleContent = document.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
var siblingNode = siblingNodes[s];
var append = false;
/**
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
* Example of error visible here: http://www.esquire.com/features/honesty0707
**/
if(!siblingNode) {
continue;
}
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if(siblingNode === topCandidate)
{
append = true;
}
var contentBonus = 0;
/* Give a bonus if sibling nodes and top candidates have the example same classname */
if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
{
append = true;
}
if(siblingNode.nodeName === "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
}
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
{
append = true;
}
}
if(append) {
dbg("Appending node: " + siblingNode);
var nodeToAppend = null;
if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
nodeToAppend.id = siblingNode.id;
nodeToAppend.innerHTML = siblingNode.innerHTML;
}
catch(er) {
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
} else {
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(nodeToAppend);
}
}
可以看到,里边用到了很多取巧的启发式技巧,比如25字以下的段落不计分。
整个读下来,还是很有趣的。
由于Readability解决的需求很通用,于是其他语言的程序员纷纷移植了该算法。
/**
 * Extracts the main article content from {@code mDocument} using the
 * Arc90 Readability heuristic: strip "unlikely" nodes, convert
 * block-less divs to paragraphs, score paragraphs by text and comma
 * density, pick the highest-scoring candidate, then pull in related
 * sibling content.
 *
 * @param preserveUnlikelyCandidates when {@code true}, skip the
 *            class/id based removal of unlikely nodes (comments,
 *            sidebars, footers, etc.)
 * @return a new {@code <div id="readability-content">} element holding
 *         the extracted article, already cleaned by {@code prepArticle}
 */
protected Element grabArticle(boolean preserveUnlikelyCandidates) {
    /*
     * First, node prepping. Trash nodes that look cruddy (like ones
     * with the class name "comment", etc), and turn divs into P tags
     * where they have been used inappropriately (as in, where they
     * contain no other block level elements).
     */
    for (Element node : mDocument.getAllElements()) {
        /* Remove unlikely candidates. */
        if (!preserveUnlikelyCandidates) {
            String unlikelyMatchString = node.className() + node.id();
            Matcher unlikelyCandidatesMatcher = Patterns.get(
                    Patterns.RegEx.UNLIKELY_CANDIDATES).matcher(
                    unlikelyMatchString);
            Matcher maybeCandidateMatcher = Patterns.get(
                    Patterns.RegEx.OK_MAYBE_ITS_A_CANDIDATE).matcher(
                    unlikelyMatchString);
            /*
             * FIX: remove a node only when it matches the "unlikely"
             * pattern and does NOT match the "ok, maybe it's a
             * candidate" pattern. The previous code required BOTH
             * patterns to match, inverting the reference JS logic
             * (search(unlikelyCandidates) !== -1 &&
             * search(okMaybeItsACandidate) === -1), which removed
             * legitimate content and kept junk.
             */
            if (unlikelyCandidatesMatcher.find()
                    && !maybeCandidateMatcher.find()
                    && !"body".equalsIgnoreCase(node.tagName())) {
                dbg("Removing unlikely candidate - " + unlikelyMatchString);
                node.remove();
                continue;
            }
        }
        /*
         * Turn all divs that don't have child block-level elements
         * into p's, so they participate in paragraph scoring below.
         */
        if ("div".equalsIgnoreCase(node.tagName())) {
            Matcher matcher = Patterns
                    .get(Patterns.RegEx.DIV_TO_P_ELEMENTS).matcher(
                            node.html());
            if (!matcher.find()) {
                dbg("Altering div to p: " + node);
                try {
                    node.tagName("p");
                } catch (Exception e) {
                    dbg("Could not alter div to p, probably an IE restriction, reverting back to div.",
                            e);
                }
            }
        }
    }
    /*
     * Loop through all paragraphs and assign a score to them based on
     * how content-y they look (commas, text length). The score is added
     * to the parent node; the grandparent gets half.
     */
    Elements allParagraphs = mDocument.getElementsByTag("p");
    ArrayList<Element> candidates = new ArrayList<Element>();
    for (Element node : allParagraphs) {
        Element parentNode = node.parent();
        /* FIX: guard against orphan paragraphs, mirroring the JS
         * reference which skips nodes without a usable parent. */
        if (parentNode == null) {
            continue;
        }
        Element grandParentNode = parentNode.parent();
        String innerText = getInnerText(node, true);
        /* Paragraphs under 25 characters don't count at all. */
        if (innerText.length() < 25) {
            continue;
        }
        /* Initialize readability data for the parent. */
        if (!parentNode.hasAttr("readabilityContentScore")) {
            initializeNode(parentNode);
            candidates.add(parentNode);
        }
        /* Initialize readability data for the grandparent (may be
         * absent, e.g. a paragraph directly under a root element). */
        if (grandParentNode != null
                && !grandParentNode.hasAttr("readabilityContentScore")) {
            initializeNode(grandParentNode);
            candidates.add(grandParentNode);
        }
        /* One base point for the paragraph itself. */
        int contentScore = 1;
        /* One point per comma within this paragraph. */
        contentScore += innerText.split(",").length;
        /* One point per 100 characters, capped at 3 points.
         * (int division already floors; Math.floor was redundant.) */
        contentScore += Math.min(innerText.length() / 100, 3);
        /* Parent gets the full score; grandparent gets half
         * (integer half, truncated — the JS original keeps the
         * fraction; kept as-is to preserve this port's scoring). */
        incrementContentScore(parentNode, contentScore);
        if (grandParentNode != null) {
            incrementContentScore(grandParentNode, contentScore / 2);
        }
    }
    /*
     * After scoring, find the candidate with the highest score. Each
     * score is first scaled by (1 - link density): good content has a
     * small link density (5% or less) and is mostly unaffected.
     */
    Element topCandidate = null;
    for (Element candidate : candidates) {
        scaleContentScore(candidate, 1 - getLinkDensity(candidate));
        dbg("Candidate: (" + candidate.className() + ":" + candidate.id()
                + ") with score " + getContentScore(candidate));
        if (topCandidate == null
                || getContentScore(candidate) > getContentScore(topCandidate)) {
            topCandidate = candidate;
        }
    }
    /*
     * If we still have no top candidate (or it is the body itself),
     * wrap the whole body content in a fresh div we can safely modify.
     */
    if (topCandidate == null
            || "body".equalsIgnoreCase(topCandidate.tagName())) {
        topCandidate = mDocument.createElement("div");
        topCandidate.html(mDocument.body().html());
        mDocument.body().html("");
        mDocument.body().appendChild(topCandidate);
        initializeNode(topCandidate);
    }
    /*
     * Look through the top candidate's siblings for content that might
     * also be related: preambles, content split by ads we removed, etc.
     */
    Element articleContent = mDocument.createElement("div");
    articleContent.attr("id", "readability-content");
    int siblingScoreThreshold = Math.max(10,
            (int) (getContentScore(topCandidate) * 0.2f));
    Elements siblingNodes = topCandidate.parent().children();
    for (Element siblingNode : siblingNodes) {
        boolean append = false;
        dbg("Looking at sibling node: (" + siblingNode.className() + ":"
                + siblingNode.id() + ")" + " with score "
                + getContentScore(siblingNode));
        if (siblingNode == topCandidate) {
            append = true;
        }
        if (getContentScore(siblingNode) >= siblingScoreThreshold) {
            append = true;
        }
        /* Paragraph siblings qualify on their own merits: long with
         * few links, or short, link-free and sentence-terminated. */
        if ("p".equalsIgnoreCase(siblingNode.tagName())) {
            float linkDensity = getLinkDensity(siblingNode);
            String nodeContent = getInnerText(siblingNode, true);
            int nodeLength = nodeContent.length();
            if (nodeLength > 80 && linkDensity < 0.25f) {
                append = true;
            } else if (nodeLength < 80 && linkDensity == 0.0f
                    && nodeContent.matches(".*\\.( |$).*")) {
                append = true;
            }
        }
        if (append) {
            dbg("Appending node: " + siblingNode);
            /*
             * appendChild moves the node out of its old parent, so it
             * also disappears from the original DOM position.
             * (Elements is a snapshot list, so iteration stays safe.)
             */
            articleContent.appendChild(siblingNode);
        }
    }
    /*
     * We have all of the content that we need; now clean it up for
     * presentation.
     */
    prepArticle(articleContent);
    return articleContent;
}