Readability\Readability::grabArticle PHP Method

grabArticle() protected method

Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
protected grabArticle ( DOMElement $page = null ) : DOMElement | boolean
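$page DOMElement Optional node to analyse; when omitted, the whole document ($this->dom) is used.
return DOMElement | boolean The extracted content wrapped in a div, or false if no meaningful content was found.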
    /**
     * Using a variety of metrics (content score, classname, element types), find the content that is
     * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
     *
     * The method works in several passes:
     *  1. collect paragraph-like nodes and normalise misused div/article/section elements,
     *  2. score each collected node and credit its parent and grandparent,
     *  3. (documents only) strip unlikely candidates and pick the best scored candidate,
     *  4. gather the top candidate together with its related siblings into a new div,
     *  5. if the result is too short, drop one flag and retry on $this->body.
     *
     * @param \DOMDocument|\DOMElement|null $page Node to analyse; $this->dom is used when empty
     *
     * @return \DOMElement|false The "readability-content" div, or false when every retry produced
     *                           less than MIN_ARTICLE_LENGTH characters of text
     */
    protected function grabArticle($page = null)
    {
        if (!$page) {
            $page = $this->dom;
        }

        $xpath = null;
        $nodesToScore = array();

        // The XPath driven steps below (stripping, candidate ranking) only run for a whole document.
        if ($page instanceof \DOMDocument && isset($page->documentElement)) {
            $xpath = new \DOMXPath($page);
        }

        $allElements = $page->getElementsByTagName('*');

        // $allElements is re-queried by index on every iteration, so replacing a node requires
        // stepping the index back (see --$nodeIndex below).
        for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); ++$nodeIndex) {
            $tagName = $node->tagName;

            // Some well known site uses sections as paragraphs.
            if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) {
                $nodesToScore[] = $node;
            }

            // Turn divs into P tags where they have been used inappropriately
            //  (as in, where they contain no other block level elements).
            if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
                if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                    $newNode = $this->dom->createElement('p');

                    try {
                        $newNode->innerHTML = $node->innerHTML;
                        $node->parentNode->replaceChild($newNode, $node);
                        // Since we just replaced the element, go back one step so the new node is revisited.
                        // NOTE(review): the revisit appears to queue the new <p> a second time through the
                        // paragraph check above, so it is scored twice - confirm whether that is intended.
                        --$nodeIndex;
                        $nodesToScore[] = $newNode;
                    } catch (\Exception $e) {
                        $this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
                    }
                } else {
                    // Will change these P elements back to text nodes after processing.
                    // Work on a snapshot: childNodes is a live list, so removing a child while indexing
                    // into it would skip the following sibling and eventually read past the end.
                    foreach (iterator_to_array($node->childNodes, false) as $childNode) {
                        // executable tags (<?php or <?xml) warning
                        if ($childNode instanceof \DOMProcessingInstruction) {
                            $childNode->parentNode->removeChild($childNode);
                            continue;
                        }

                        if ($childNode->nodeType === XML_TEXT_NODE) {
                            $p = $this->dom->createElement('p');
                            $p->innerHTML = $childNode->nodeValue;
                            $p->setAttribute('data-readability-styled', 'true');
                            $childNode->parentNode->replaceChild($p, $childNode);
                        }
                    }
                }
            }
        }

        /*
         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
         * Then add their score to their parent node.
         *
         * A score is determined by things like number of commas, class names, etc.
         * Maybe eventually link density.
         */
        for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
            $parentNode = $nodesToScore[$pt]->parentNode;

            // No parent element (detached node, or the parent is the document itself)? Move on...
            // Only elements can carry the "readability" attribute used below.
            if (!($parentNode instanceof \DOMElement)) {
                continue;
            }

            $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
            $innerText = $this->getInnerText($nodesToScore[$pt]);

            // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
            if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
                continue;
            }

            // Initialize readability data for the parent.
            if (!$parentNode->hasAttribute('readability')) {
                $this->initializeNode($parentNode);
                $parentNode->setAttribute('data-candidate', 'true');
            }

            // Initialize readability data for the grandparent.
            if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
                $this->initializeNode($grandParentNode);
                $grandParentNode->setAttribute('data-candidate', 'true');
            }

            // Add a point for the paragraph itself as a base.
            $contentScore = 1;
            // Add points for any commas within this paragraph.
            $contentScore += $this->getCommaCount($innerText);
            // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
            $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
            // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
            $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);

            // The experiment below is disabled: the escaped terminator on its first line keeps the
            // block comment open until the marker that follows its last statement.
            /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
               $up = $nodesToScore[$pt];
               $score = 0;
               while ($up->parentNode instanceof \DOMElement) {
                   $up = $up->parentNode;
                   if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
                       $score += 0.5;
                   } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
                       $score -= 0.5;
                   }
               }
               $score = floor($score);
               $contentScore += max(min($score, 3), -3);/**/

            // Add the score to the parent. The grandparent gets half.
            $parentNode->getAttributeNode('readability')->value += $contentScore;
            if ($grandParentNode) {
                $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
            }
        }

        /*
         * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
         * This is faster to do before scoring but safer after.
         */
        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
            $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);

            // Walk backwards so descendants are handled before the ancestors that contain them.
            for ($c = $candidates->length - 1; $c >= 0; --$c) {
                $node = $candidates->item($c);

                // node should be readable but not inside of an article otherwise it's probably non-readable block
                if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
                    $this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
                    $node->parentNode->removeChild($node);
                }
            }

            $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);

            for ($c = $candidates->length - 1; $c >= 0; --$c) {
                $node = $candidates->item($c);

                // Remove unlikely candidates
                $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');

                if (mb_strlen($unlikelyMatchString) > 3 && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)) {
                    $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
                    $node->parentNode->removeChild($node);
                }
            }

            unset($candidates);
        }

        /*
         * After we've calculated scores, loop through all of the possible candidate nodes we found
         * and find the one with the highest score.
         */
        $topCandidate = null;

        if ($xpath) {
            // Using array of DOMElements after deletion is a path to DOOMElement.
            $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);

            for ($c = $candidates->length - 1; $c >= 0; --$c) {
                $item = $candidates->item($c);

                // Scale the final candidates score based on link density. Good content should have a
                // relatively small link density (5% or less) and be mostly unaffected by this operation.
                // If not for this we would have used XPath to find maximum @readability.
                $readability = $item->getAttributeNode('readability');
                $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);

                if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
                    $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
                    $topCandidate = $item;
                }
            }

            unset($candidates);
        }

        /*
         * If we still have no top candidate, just use the body as a last resort.
         * We also have to copy the body node so it is something we can modify.
         */
        if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
            $topCandidate = $this->dom->createElement('div');

            if ($page instanceof \DOMDocument) {
                if (!isset($page->documentElement)) {
                    // we don't have a body either? what a mess! :)
                    // The new div stays detached in this case, so it has no parent and no siblings.
                    $this->logger->debug('The page has no body!');
                } else {
                    $this->logger->debug('Setting body to a raw HTML of original page!');
                    $topCandidate->innerHTML = $page->documentElement->innerHTML;
                    $page->documentElement->innerHTML = '';
                    $this->reinitBody();
                    $page->documentElement->appendChild($topCandidate);
                }
            } else {
                $topCandidate->innerHTML = $page->innerHTML;
                $page->innerHTML = '';
                $page->appendChild($topCandidate);
            }

            $this->initializeNode($topCandidate);
        }

        // Set table as the main node if resulted data is table element.
        $tagName = $topCandidate->tagName;
        if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
            $up = $topCandidate;

            if ($up->parentNode instanceof \DOMElement) {
                $up = $up->parentNode;

                if (strcasecmp($up->tagName, 'table') === 0) {
                    $topCandidate = $up;
                }
            }
        }

        $this->logger->debug('Top candidate: ' . $topCandidate->getNodePath());

        /*
         * Now that we have the top candidate, look through its siblings for content that might also be related.
         * Things like preambles, content split by ads that we removed, etc.
         */
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('class', 'readability-content');
        $siblingScoreThreshold = max(10, (int) $topCandidate->getAttribute('readability') * 0.2);

        // A detached top candidate has no parent; treat that as "no siblings" instead of reading a
        // property from null and substituting a placeholder object.
        $topCandidateParent = $topCandidate->parentNode;
        $siblingNodes = $topCandidateParent ? $topCandidateParent->childNodes : null;
        $sl = $siblingNodes !== null ? $siblingNodes->length : 0;

        for ($s = 0; $s < $sl; ++$s) {
            $siblingNode = $siblingNodes->item($s);
            $siblingNodeName = $siblingNode->nodeName;
            $append = false;
            $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') ? ' with score ' . $siblingNode->getAttribute('readability') : ''));

            if ($siblingNode->isSameNode($topCandidate)) {
                $append = true;
            }

            $contentBonus = 0;
            // Give a bonus if sibling nodes and top candidates have the same classname.
            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                $contentBonus += (int) $topCandidate->getAttribute('readability') * 0.2;
            }

            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (int) $siblingNode->getAttribute('readability') + $contentBonus >= $siblingScoreThreshold) {
                $append = true;
            }

            if (strcasecmp($siblingNodeName, 'p') === 0) {
                $linkDensity = $this->getLinkDensity($siblingNode);
                $nodeContent = $this->getInnerText($siblingNode, true, true);
                $nodeLength = mb_strlen($nodeContent);

                // Long paragraphs need a low link density; short ones must be link-free and look like a sentence.
                // The density is compared as a float: a strict "=== 0" would never match a float result such as 0.0.
                $isLongEnough = $nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY;
                $isShortSentence = $nodeLength < self::MIN_NODE_LENGTH && (float) $linkDensity === 0.0 && preg_match('/\\.( |$)/', $nodeContent);

                if ($isLongEnough || $isShortSentence) {
                    $append = true;
                }
            }

            if ($append) {
                $this->logger->debug('Appending node: ' . $siblingNode->getNodePath());

                if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
                    // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
                    $this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
                    $nodeToAppend = $this->dom->createElement('div');

                    try {
                        $nodeToAppend->setAttribute('alt', $siblingNodeName);
                        $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                    } catch (\Exception $e) {
                        $this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
                        // The original sibling itself gets moved, which shrinks the live sibling list.
                        $nodeToAppend = $siblingNode;
                        --$s;
                        --$sl;
                    }
                } else {
                    // Moving the sibling out of its parent shrinks the live sibling list by one.
                    $nodeToAppend = $siblingNode;
                    --$s;
                    --$sl;
                }

                // To ensure a node does not interfere with readability styles, remove its classnames & ids.
                // Now done via RegExp post_filter.
                //$nodeToAppend->removeAttribute('class');
                //$nodeToAppend->removeAttribute('id');
                // Append sibling and subtract from our list as appending removes a node.
                $articleContent->appendChild($nodeToAppend);
            }
        }

        unset($xpath);

        // So we have all of the content that we need. Now we clean it up for presentation.
        $this->prepArticle($articleContent);

        /*
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */
        if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
            $this->reinitBody();

            // Drop exactly one flag per retry, in order of increasing permissiveness.
            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");

                return $this->grabArticle($this->body);
            } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
                $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");

                return $this->grabArticle($this->body);
            } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
                $this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");

                return $this->grabArticle($this->body);
            }

            // Every flag has been exhausted and the content is still too short.
            return false;
        }

        return $articleContent;
    }