Thursday, December 10, 2015

Readability内容分析算法,和它的那些多语言实现



http://get.ftqq.com/122.get
JS版本的Readability是最好用的,它可以直接在浏览器完成分析,于是用户还可以人工对分析出来的内容进行修改和校正。
首先,它定义了一系列正则:
regexps: {
        unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
        okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
        positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
        negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
        extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
        divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
        replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
        replaceFonts:          /<(\/?)font[^>]*>/gi,
        trim:                  /^\s+|\s+$/g,
        normalize:             /\s{2,}/g,
        killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
        videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
        skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
        nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
        prevLink:              /(prev|earl|old|new|<|«)/i
    },
可以看到,标签和文字都有加权或降权分组。整个内容分析是通过grabArticle函数来实现的。
首先开始遍历节点
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) 
然后将不像内容的元素去掉
if (stripUnlikelyCandidates) 
{
    var unlikelyMatchString = node.className + node.id;
    if (
        (
            unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
            unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
            node.tagName !== "BODY"
        )
    )
    {
        dbg("Removing unlikely candidate - " + unlikelyMatchString);
        node.parentNode.removeChild(node);
        nodeIndex-=1;
        continue;
    }               
}
将DIV替换为P标签后,再对目标节点进行遍历,进行计分:
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
    var parentNode      = nodesToScore[pt].parentNode;
    var grandParentNode = parentNode ? parentNode.parentNode : null;
    var innerText       = readability.getInnerText(nodesToScore[pt]);

    if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
        continue;
    }

    /* If this paragraph is less than 25 characters, don't even count it. */
    if(innerText.length < 25) {
        continue; }

    /* Initialize readability data for the parent. */
    if(typeof parentNode.readability === 'undefined') {
        readability.initializeNode(parentNode);
        candidates.push(parentNode);
    }

    /* Initialize readability data for the grandparent. */
    if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
        readability.initializeNode(grandParentNode);
        candidates.push(grandParentNode);
    }

    var contentScore = 0;

    /* Add a point for the paragraph itself as a base. */
    contentScore+=1;

    /* Add points for any commas within this paragraph */
    contentScore += innerText.split(',').length;
    
    /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
    contentScore += Math.min(Math.floor(innerText.length / 100), 3);
    
    /* Add the score to the parent. The grandparent gets half. */
    parentNode.readability.contentScore += contentScore;

    if(grandParentNode) {
        grandParentNode.readability.contentScore += contentScore/2;             
    }
}
最后根据分值,重新拼接内容
var articleContent        = document.createElement("DIV");
if (isPaging) {
    articleContent.id     = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes          = topCandidate.parentNode.childNodes;


for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
    var siblingNode = siblingNodes[s];
    var append      = false;

    /**
     * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
     * Example of error visible here: http://www.esquire.com/features/honesty0707
    **/
    if(!siblingNode) {
        continue;
    }

    dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
    dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

    if(siblingNode === topCandidate)
    {
        append = true;
    }

    var contentBonus = 0;
    /* Give a bonus if sibling nodes and top candidates have the example same classname */
    if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
        contentBonus += topCandidate.readability.contentScore * 0.2;
    }

    if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
    {
        append = true;
    }
    
    if(siblingNode.nodeName === "P") {
        var linkDensity = readability.getLinkDensity(siblingNode);
        var nodeContent = readability.getInnerText(siblingNode);
        var nodeLength  = nodeContent.length;
        
        if(nodeLength > 80 && linkDensity < 0.25)
        {
            append = true;
        }
        else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
        {
            append = true;
        }
    }

    if(append) {
        dbg("Appending node: " + siblingNode);

        var nodeToAppend = null;
        if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
            /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
            
            dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
            nodeToAppend = document.createElement("DIV");
            try {
                nodeToAppend.id = siblingNode.id;
                nodeToAppend.innerHTML = siblingNode.innerHTML;
            }
            catch(er) {
                dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
                nodeToAppend = siblingNode;
                s-=1;
                sl-=1;
            }
        } else {
            nodeToAppend = siblingNode;
            s-=1;
            sl-=1;
        }
        
        /* To ensure a node does not interfere with readability styles, remove its classnames */
        nodeToAppend.className = "";

        /* Append sibling and subtract from our list because it removes the node when you append to another node */
        articleContent.appendChild(nodeToAppend);
    }
}
可以看到,里边用到了很多很trick的技巧,比如25字以下的段落不计分。
整个读下来,还是很有趣的。
由于Readability解决的需求很通用,于是其他语言的程序员纷纷移植了该算法。
  1. PHP版本 https://github.com/feelinglucky/php-readability
  2. Java版本 https://github.com/wuman/JReadability
  3. 当然会有Node版本了 https://www.npmjs.org/package/node-readability
https://github.com/wuman/JReadability/blob/master/src/main/java/com/wuman/jreadability/Readability.java
    protected Element grabArticle(boolean preserveUnlikelyCandidates) {
        /**
         * First, node prepping. Trash nodes that look cruddy (like ones with
         * the class name "comment", etc), and turn divs into P tags where they
         * have been used inappropriately (as in, where they contain no other
         * block level elements.)
         *
         * Note: Assignment from index for performance. See
         * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO:
         * Shouldn't this be a reverse traversal?
         **/
        for (Element node : mDocument.getAllElements()) {
            /* Remove unlikely candidates */
            if (!preserveUnlikelyCandidates) {
                String unlikelyMatchString = node.className() + node.id();
                Matcher unlikelyCandidatesMatcher = Patterns.get(
                        Patterns.RegEx.UNLIKELY_CANDIDATES).matcher(
                        unlikelyMatchString);
                Matcher maybeCandidateMatcher = Patterns.get(
                        Patterns.RegEx.OK_MAYBE_ITS_A_CANDIDATE).matcher(
                        unlikelyMatchString);
                if (unlikelyCandidatesMatcher.find()
                        && maybeCandidateMatcher.find()
                        && !"body".equalsIgnoreCase(node.tagName())) {
                    node.remove();
                    dbg("Removing unlikely candidate - " + unlikelyMatchString);
                    continue;
                }
            }

            /*
             * Turn all divs that don't have children block level elements into
             * p's
             */
            if ("div".equalsIgnoreCase(node.tagName())) {
                Matcher matcher = Patterns
                        .get(Patterns.RegEx.DIV_TO_P_ELEMENTS).matcher(
                                node.html());
                if (!matcher.find()) {
                    dbg("Alternating div to p: " + node);
                    try {
                        node.tagName("p");
                    } catch (Exception e) {
                        dbg("Could not alter div to p, probably an IE restriction, reverting back to div.",
                                e);
                    }
                }
            }
        }

        /**
         * Loop through all paragraphs, and assign a score to them based on how
         * content-y they look. Then add their score to their parent node.
         *
         * A score is determined by things like number of commas, class names,
         * etc. Maybe eventually link density.
         **/
        Elements allParagraphs = mDocument.getElementsByTag("p");
        ArrayList<Element> candidates = new ArrayList<Element>();

        for (Element node : allParagraphs) {
            Element parentNode = node.parent();
            Element grandParentNode = parentNode.parent();
            String innerText = getInnerText(node, true);

            /*
             * If this paragraph is less than 25 characters, don't even count
             * it.
             */
            if (innerText.length() < 25) {
                continue;
            }

            /* Initialize readability data for the parent. */
            if (!parentNode.hasAttr("readabilityContentScore")) {
                initializeNode(parentNode);
                candidates.add(parentNode);
            }

            /* Initialize readability data for the grandparent. */
            if (!grandParentNode.hasAttr("readabilityContentScore")) {
                initializeNode(grandParentNode);
                candidates.add(grandParentNode);
            }

            int contentScore = 0;

            /* Add a point for the paragraph itself as a base. */
            contentScore++;

            /* Add points for any commas within this paragraph */
            contentScore += innerText.split(",").length;

            /*
             * For every 100 characters in this paragraph, add another point. Up
             * to 3 points.
             */
            contentScore += Math.min(Math.floor(innerText.length() / 100), 3);

            /* Add the score to the parent. The grandparent gets half. */
            incrementContentScore(parentNode, contentScore);
            incrementContentScore(grandParentNode, contentScore / 2);
        }

        /**
         * After we've calculated scores, loop through all of the possible
         * candidate nodes we found and find the one with the highest score.
         */
        Element topCandidate = null;
        for (Element candidate : candidates) {
            /**
             * Scale the final candidates score based on link density. Good
             * content should have a relatively small link density (5% or less)
             * and be mostly unaffected by this operation.
             */
            scaleContentScore(candidate, 1 - getLinkDensity(candidate));

            dbg("Candidate: (" + candidate.className() + ":" + candidate.id()
                    + ") with score " + getContentScore(candidate));

            if (topCandidate == null
                    || getContentScore(candidate) > getContentScore(topCandidate)) {
                topCandidate = candidate;
            }
        }

        /**
         * If we still have no top candidate, just use the body as a last
         * resort. We also have to copy the body node so it is something we can
         * modify.
         */
        if (topCandidate == null
                || "body".equalsIgnoreCase(topCandidate.tagName())) {
            topCandidate = mDocument.createElement("div");
            topCandidate.html(mDocument.body().html());
            mDocument.body().html("");
            mDocument.body().appendChild(topCandidate);
            initializeNode(topCandidate);
        }

        /**
         * Now that we have the top candidate, look through its siblings for
         * content that might also be related. Things like preambles, content
         * split by ads that we removed, etc.
         */
        Element articleContent = mDocument.createElement("div");
        articleContent.attr("id", "readability-content");
        int siblingScoreThreshold = Math.max(10,
                (int) (getContentScore(topCandidate) * 0.2f));
        Elements siblingNodes = topCandidate.parent().children();
        for (Element siblingNode : siblingNodes) {
            boolean append = false;

            dbg("Looking at sibling node: (" + siblingNode.className() + ":"
                    + siblingNode.id() + ")" + " with score "
                    + getContentScore(siblingNode));

            if (siblingNode == topCandidate) {
                append = true;
            }

            if (getContentScore(siblingNode) >= siblingScoreThreshold) {
                append = true;
            }

            if ("p".equalsIgnoreCase(siblingNode.tagName())) {
                float linkDensity = getLinkDensity(siblingNode);
                String nodeContent = getInnerText(siblingNode, true);
                int nodeLength = nodeContent.length();

                if (nodeLength > 80 && linkDensity < 0.25f) {
                    append = true;
                } else if (nodeLength < 80 && linkDensity == 0.0f
                        && nodeContent.matches(".*\\.( |$).*")) {
                    append = true;
                }
            }

            if (append) {
                dbg("Appending node: " + siblingNode);

                /*
                 * Append sibling and subtract from our list because it removes
                 * the node when you append to another node
                 */
                articleContent.appendChild(siblingNode);
                continue;
            }
        }

        /**
         * So we have all of the content that we need. Now we clean it up for
         * presentation.
         */
        prepArticle(articleContent);

        return articleContent;
    }

Labels

Review (572) System Design (334) System Design - Review (198) Java (189) Coding (75) Interview-System Design (65) Interview (63) Book Notes (59) Coding - Review (59) to-do (45) Linux (43) Knowledge (39) Interview-Java (35) Knowledge - Review (32) Database (31) Design Patterns (31) Big Data (29) Product Architecture (28) MultiThread (27) Soft Skills (27) Concurrency (26) Cracking Code Interview (26) Miscs (25) Distributed (24) OOD Design (24) Google (23) Career (22) Interview - Review (21) Java - Code (21) Operating System (21) Interview Q&A (20) System Design - Practice (20) Tips (19) Algorithm (17) Company - Facebook (17) Security (17) How to Ace Interview (16) Brain Teaser (14) Linux - Shell (14) Redis (14) Testing (14) Tools (14) Code Quality (13) Search (13) Spark (13) Spring (13) Company - LinkedIn (12) How to (12) Interview-Database (12) Interview-Operating System (12) Solr (12) Architecture Principles (11) Resource (10) Amazon (9) Cache (9) Git (9) Interview - MultiThread (9) Scalability (9) Trouble Shooting (9) Web Dev (9) Architecture Model (8) Better Programmer (8) Cassandra (8) Company - Uber (8) Java67 (8) Math (8) OO Design principles (8) SOLID (8) Design (7) Interview Corner (7) JVM (7) Java Basics (7) Kafka (7) Mac (7) Machine Learning (7) NoSQL (7) C++ (6) Chrome (6) File System (6) Highscalability (6) How to Better (6) Network (6) Restful (6) CareerCup (5) Code Review (5) Hash (5) How to Interview (5) JDK Source Code (5) JavaScript (5) Leetcode (5) Must Known (5) Python (5)

Popular Posts