Readability\Readability::cleanConditionally PHP Method

cleanConditionally() public method

Cleans an element of all descendant tags of type $tag if they look "fishy". "Fishy" is decided by a heuristic based on content length, class names, link density, number of images & embeds, etc.
public cleanConditionally ( DOMElement $e, string $tag )
$e DOMElement
$tag string
    /**
     * Remove descendants of $e with tag name $tag when they look "fishy".
     *
     * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. Each matching
     * descendant is removed from its parent when either:
     *  - its weight (getWeight()) plus the integer value of its `readability`
     *    attribute is negative, or
     *  - its inner text has fewer than MIN_COMMAS_IN_PARAGRAPH commas and one of
     *    the structural rules below matches (element counts, content length,
     *    link density, media embeds). Which rule set applies depends on
     *    $this->lightClean.
     *
     * @param \DOMElement $e   Element whose descendants are examined
     * @param string      $tag Tag name of the descendants to examine
     */
    public function cleanConditionally($e, $tag)
    {
        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            return;
        }

        $tagsList = $e->getElementsByTagName($tag);

        /*
         * Gather counts for other typical elements embedded within.
         * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
         *
         * TODO: Consider taking into account original contentScore here.
         */
        for ($i = $tagsList->length - 1; $i >= 0; --$i) {
            $node = $tagsList->item($i);
            $weight = $this->getWeight($node);
            // NOTE(review): the cast truncates any fractional score stored in the attribute.
            $contentScore = $node->hasAttribute('readability') ? (int) $node->getAttribute('readability') : 0;
            $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . ($node->hasAttribute('readability') ? ' with score ' . $node->getAttribute('readability') : ''));

            if ($weight + $contentScore < 0) {
                $this->logger->debug('Removing...');
                $node->parentNode->removeChild($node);

                continue;
            }

            // Computed once and reused for both the comma count and the content length.
            $innerText = $this->getInnerText($node);

            if ($this->getCommaCount($innerText) >= self::MIN_COMMAS_IN_PARAGRAPH) {
                // Enough commas: the structural rules below are not applied to this node.
                continue;
            }

            /*
             * If there are not very many commas, and the number of
             * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
             */
            $p = $node->getElementsByTagName('p')->length;
            $img = $node->getElementsByTagName('img')->length;
            // The -100 offset means the "<li> > <p>" rule only fires when list items
            // outnumber paragraphs by more than 100.
            $li = $node->getElementsByTagName('li')->length - 100;
            $input = $node->getElementsByTagName('input')->length;
            $a = $node->getElementsByTagName('a')->length;

            // Count only <embed>/<iframe> elements whose src matches the 'media' regexp.
            $embedCount = 0;
            foreach (['embed', 'iframe'] as $embedTag) {
                $embeds = $node->getElementsByTagName($embedTag);
                for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
                    if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
                        ++$embedCount;
                    }
                }
            }

            $linkDensity = $this->getLinkDensity($node, true);
            $contentLength = mb_strlen($innerText);
            $toRemove = false;

            if ($this->lightClean) {
                // Light mode: no "images > paragraphs" rule and looser thresholds.
                if ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
                    $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
                    $toRemove = true;
                } elseif ($input > floor($p / 3)) {
                    $this->logger->debug(' too many <input> elements');
                    $toRemove = true;
                } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                    $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
                    $toRemove = true;
                } elseif ($weight < 25 && $linkDensity > 0.25) {
                    $this->logger->debug(' weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.25');
                    $toRemove = true;
                } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                    $this->logger->debug('  more than 2 links and weight is ' . $weight . ' >= 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
                    $toRemove = true;
                } elseif ($embedCount > 3) {
                    $this->logger->debug(' more than 3 embeds');
                    $toRemove = true;
                }
            } else {
                if ($img > $p) {
                    $this->logger->debug(' more image elements than paragraph elements');
                    $toRemove = true;
                } elseif ($li > $p && $tag !== 'ul' && $tag !== 'ol') {
                    $this->logger->debug('  too many <li> elements, and parent is not <ul> or <ol>');
                    $toRemove = true;
                } elseif ($input > floor($p / 3)) {
                    $this->logger->debug('  too many <input> elements');
                    $toRemove = true;
                } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
                    $this->logger->debug('  content length less than 10 chars and 0 images, or more than 2 images');
                    $toRemove = true;
                } elseif ($weight < 25 && $linkDensity > 0.2) {
                    $this->logger->debug('  weight is ' . $weight . ' < 25 and link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.2');
                    $toRemove = true;
                } elseif ($weight >= 25 && $linkDensity > 0.5) {
                    $this->logger->debug('  weight above 25 but link density is ' . sprintf('%.2f', $linkDensity) . ' > 0.5');
                    $toRemove = true;
                } elseif ($embedCount === 1 && $contentLength < 75 || $embedCount > 1) {
                    $this->logger->debug('  1 embed and content length smaller than 75 chars, or more than one embed');
                    $toRemove = true;
                }
            }

            if ($toRemove) {
                $this->logger->debug('Removing...');
                $node->parentNode->removeChild($node);
            }
        }
    }