protected function grabArticle($page = null)
{
if (!$page) {
$page = $this->dom;
}
$xpath = null;
$nodesToScore = array();
if ($page instanceof \DOMDocument && isset($page->documentElement)) {
$xpath = new \DOMXPath($page);
}
$allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); ++$nodeIndex) {
$tagName = $node->tagName;
// Some well known site uses sections as paragraphs.
if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'pre') === 0 || strcasecmp($tagName, 'section') === 0) {
$nodesToScore[] = $node;
}
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
$newNode = $this->dom->createElement('p');
try {
$newNode->innerHTML = $node->innerHTML;
$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;
$nodesToScore[] = $newNode;
} catch (\Exception $e) {
$this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
}
} else {
// Will change these P elements back to text nodes after processing.
for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
$childNode = $node->childNodes->item($i);
// executable tags (<?php or <?xml) warning
if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') {
$childNode->parentNode->removeChild($childNode);
continue;
}
if ($childNode->nodeType === XML_TEXT_NODE) {
$p = $this->dom->createElement('p');
$p->innerHTML = $childNode->nodeValue;
$p->setAttribute('data-readability-styled', 'true');
$childNode->parentNode->replaceChild($p, $childNode);
}
}
}
}
}
/*
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density.
*/
for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
$parentNode = $nodesToScore[$pt]->parentNode;
// No parent node? Move on...
if (!$parentNode) {
continue;
}
$grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
$innerText = $this->getInnerText($nodesToScore[$pt]);
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
continue;
}
// Initialize readability data for the parent.
if (!$parentNode->hasAttribute('readability')) {
$this->initializeNode($parentNode);
$parentNode->setAttribute('data-candidate', 'true');
}
// Initialize readability data for the grandparent.
if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
$this->initializeNode($grandParentNode);
$grandParentNode->setAttribute('data-candidate', 'true');
}
// Add a point for the paragraph itself as a base.
$contentScore = 1;
// Add points for any commas within this paragraph.
$contentScore += $this->getCommaCount($innerText);
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
/* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
$up = $nodesToScore[$pt];
$score = 0;
while ($up->parentNode instanceof \DOMElement) {
$up = $up->parentNode;
if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
$score += 0.5;
} elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
$score -= 0.5;
}
}
$score = floor($score);
$contentScore += max(min($score, 3), -3);/**/
// Add the score to the parent. The grandparent gets half.
$parentNode->getAttributeNode('readability')->value += $contentScore;
if ($grandParentNode) {
$grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
}
}
/*
* Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
* This is faster to do before scoring but safer after.
*/
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
$node = null;
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node);
}
}
$candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
$node = null;
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
// Remove unlikely candidates
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
if (mb_strlen($unlikelyMatchString) > 3 && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)) {
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node);
--$nodeIndex;
}
}
unset($candidates);
}
/*
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
*/
$topCandidate = null;
if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement.
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
for ($c = $candidates->length - 1; $c >= 0; --$c) {
$item = $candidates->item($c);
// Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability.
$readability = $item->getAttributeNode('readability');
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
$this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
$topCandidate = $item;
}
}
unset($candidates);
}
/*
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
*/
if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
$topCandidate = $this->dom->createElement('div');
if ($page instanceof \DOMDocument) {
if (!isset($page->documentElement)) {
// we don't have a body either? what a mess! :)
$this->logger->debug('The page has no body!');
} else {
$this->logger->debug('Setting body to a raw HTML of original page!');
$topCandidate->innerHTML = $page->documentElement->innerHTML;
$page->documentElement->innerHTML = '';
$this->reinitBody();
$page->documentElement->appendChild($topCandidate);
}
} else {
$topCandidate->innerHTML = $page->innerHTML;
$page->innerHTML = '';
$page->appendChild($topCandidate);
}
$this->initializeNode($topCandidate);
}
// Set table as the main node if resulted data is table element.
$tagName = $topCandidate->tagName;
if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
$up = $topCandidate;
if ($up->parentNode instanceof \DOMElement) {
$up = $up->parentNode;
if (strcasecmp($up->tagName, 'table') === 0) {
$topCandidate = $up;
}
}
}
$this->logger->debug('Top candidate: ' . $topCandidate->getNodePath());
/*
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
*/
$articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content');
$siblingScoreThreshold = max(10, (int) $topCandidate->getAttribute('readability') * 0.2);
$siblingNodes = $topCandidate->parentNode->childNodes;
if (!isset($siblingNodes)) {
$siblingNodes = new stdClass();
$siblingNodes->length = 0;
}
for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
$siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName;
$append = false;
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') ? ' with score ' . $siblingNode->getAttribute('readability') : ''));
if ($siblingNode->isSameNode($topCandidate)) {
$append = true;
}
$contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the same classname.
if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
$contentBonus += (int) $topCandidate->getAttribute('readability') * 0.2;
}
if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (int) $siblingNode->getAttribute('readability') + $contentBonus >= $siblingScoreThreshold) {
$append = true;
}
if (strcasecmp($siblingNodeName, 'p') === 0) {
$linkDensity = $this->getLinkDensity($siblingNode);
$nodeContent = $this->getInnerText($siblingNode, true, true);
$nodeLength = mb_strlen($nodeContent);
if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY || $nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\\.( |$)/', $nodeContent)) {
$append = true;
}
}
if ($append) {
$this->logger->debug('Appending node: ' . $siblingNode->getNodePath());
if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
// We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
$this->logger->debug('Altering siblingNode "' . $siblingNodeName . '" to "div".');
$nodeToAppend = $this->dom->createElement('div');
try {
$nodeToAppend->setAttribute('alt', $siblingNodeName);
$nodeToAppend->innerHTML = $siblingNode->innerHTML;
} catch (\Exception $e) {
$this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
$nodeToAppend = $siblingNode;
--$s;
--$sl;
}
} else {
$nodeToAppend = $siblingNode;
--$s;
--$sl;
}
// To ensure a node does not interfere with readability styles, remove its classnames & ids.
// Now done via RegExp post_filter.
//$nodeToAppend->removeAttribute('class');
//$nodeToAppend->removeAttribute('id');
// Append sibling and subtract from our list as appending removes a node.
$articleContent->appendChild($nodeToAppend);
}
}
unset($xpath);
// So we have all of the content that we need. Now we clean it up for presentation.
$this->prepArticle($articleContent);
/*
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* finding the -right- content.
*/
if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
$this->reinitBody();
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");
return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
$this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
$this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");
return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
$this->logger->debug('...content is shorter than ' . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");
return $this->grabArticle($this->body);
}
return false;
}
return $articleContent;
}