/**
 * Conditionally remove descendants of $e with tag name $tag.
 *
 * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. Each matching
 * descendant is removed when either:
 *  - its weight plus its "readability" attribute score is negative, or
 *  - its inner text has fewer than MIN_COMMAS_IN_PARAGRAPH commas AND one
 *    of the heuristics below fires (a different rule set is used depending
 *    on $this->lightClean).
 *
 * TODO: Consider taking into account original contentScore here.
 *
 * @param \DOMElement $e   node whose descendants are examined (it only needs
 *                         to expose getElementsByTagName())
 * @param string      $tag tag name of the descendants to examine
 *
 * @return void
 */
public function cleanConditionally($e, $tag)
{
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /*
     * Traverse backwards so we can remove nodes at the same time
     * without affecting the traversal.
     */
    for ($i = $tagsList->length - 1; $i >= 0; --$i) {
        $node = $tagsList->item($i);
        $weight = $this->getWeight($node);

        // Read the "readability" attribute once; it feeds both the score and the log line.
        $hasScore = $node->hasAttribute('readability');
        $rawScore = $hasScore ? $node->getAttribute('readability') : '';
        $contentScore = $hasScore ? (int) $rawScore : 0;

        $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . ($hasScore ? ' with score ' . $rawScore : ''));

        if ($weight + $contentScore < 0) {
            $this->logger->debug('Removing...');
            $node->parentNode->removeChild($node);

            continue;
        }

        // Reused below for the content-length check so the text is only extracted once.
        $innerText = $this->getInnerText($node);

        if ($this->getCommaCount($innerText) >= self::MIN_COMMAS_IN_PARAGRAPH) {
            // Enough commas: the heuristics below are not applied to this node.
            continue;
        }

        /*
         * If there are not very many commas, and the number of
         * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
         */
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        // The -100 offset means the <li> rule only fires when the <li> count exceeds $p by more than 100.
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;
        $a = $node->getElementsByTagName('a')->length;
        $embedCount = $this->countMediaEmbeds($node);
        $linkDensity = $this->getLinkDensity($node, true);
        $contentLength = mb_strlen($innerText);
        $toRemove = false;

        if ($this->lightClean) {
            if ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
                $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
                $toRemove = true;
            } elseif ($input > floor($p / 3)) {
                $this->logger->debug(' too many <input> elements');
                $toRemove = true;
            } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
                $toRemove = true;
            } elseif ($weight < 25 && $linkDensity > 0.25) {
                $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
                $toRemove = true;
            } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                $this->logger->debug(' more than 2 links and weight is ' . $weight . ' >= 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
                $toRemove = true;
            } elseif ($embedCount > 3) {
                $this->logger->debug(' more than 3 embeds');
                $toRemove = true;
            }
        } else {
            if ($img > $p) {
                $this->logger->debug(' more image elements than paragraph elements');
                $toRemove = true;
            } elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
                $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
                $toRemove = true;
            } elseif ($input > floor($p / 3)) {
                $this->logger->debug(' too many <input> elements');
                $toRemove = true;
            } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
                $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
                $toRemove = true;
            } elseif ($weight < 25 && $linkDensity > 0.2) {
                // The message now reports the threshold that is actually tested (25).
                $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
                $toRemove = true;
            } elseif ($weight >= 25 && $linkDensity > 0.5) {
                $this->logger->debug(' weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
                $toRemove = true;
            } elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) {
                $this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed');
                $toRemove = true;
            }
        }

        if ($toRemove) {
            $this->logger->debug('Removing...');
            $node->parentNode->removeChild($node);
        }
    }
}

/**
 * Count the <embed> and <iframe> descendants of $node whose "src"
 * attribute matches the 'media' regular expression.
 *
 * @param \DOMElement $node element whose descendants are scanned
 *
 * @return int number of matching <embed> and <iframe> elements
 */
private function countMediaEmbeds($node)
{
    $count = 0;

    foreach (['embed', 'iframe'] as $embedTag) {
        foreach ($node->getElementsByTagName($embedTag) as $embed) {
            if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
                ++$count;
            }
        }
    }

    return $count;
}