Readability\Readability::prepArticle PHP Method

prepArticle() public method

Prepare the article node for display. Clean out any inline styles, iframes, and forms, strip extraneous tags, etc.

public prepArticle ( DOMElement $articleContent )
$articleContent DOMElement
    /**
     * Prepare the article node for display.
     *
     * Works on the given node in place: runs the style/break cleaners, strips
     * service attributes and unwanted tags, conditionally removes forms,
     * tables, lists and divs, drops empty paragraphs and finally applies the
     * configured post filters to the node's HTML (unless disabled by flag).
     *
     * @param \DOMElement $articleContent Article node to clean; modified in place
     */
    public function prepArticle($articleContent)
    {
        $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');
        $this->cleanStyles($articleContent);
        $this->killBreaks($articleContent);
        $xpath = new \DOMXPath($articleContent->ownerDocument);
        if ($this->revertForcedParagraphElements) {
            /*
             * Reverts P elements carrying the 'data-readability-styled' attribute
             * back to plain text nodes holding the paragraph's text content.
             */
            $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent);
            // Walk backwards so replacing a node never shifts the ones still to visit.
            for ($i = $elems->length - 1; $i >= 0; --$i) {
                $e = $elems->item($i);
                $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
            }
        }
        // Remove service data-candidate attribute.
        $elems = $xpath->query('.//*[@data-candidate]', $articleContent);
        for ($i = $elems->length - 1; $i >= 0; --$i) {
            $elems->item($i)->removeAttribute('data-candidate');
        }
        // Clean out junk from the article content.
        $this->clean($articleContent, 'input');
        $this->clean($articleContent, 'button');
        $this->clean($articleContent, 'nav');
        $this->clean($articleContent, 'object');
        $this->clean($articleContent, 'iframe');
        $this->clean($articleContent, 'canvas');
        $this->clean($articleContent, 'h1');
        /*
         * If there is only one short h2, it is probably being used as the main header,
         * so remove it since we already have a header.
         */
        $h2s = $articleContent->getElementsByTagName('h2');
        if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
            $this->clean($articleContent, 'h2');
        }
        $this->cleanHeaders($articleContent);
        // Do these last as the previous stuff may have removed junk that will affect these.
        $this->cleanConditionally($articleContent, 'form');
        $this->cleanConditionally($articleContent, 'table');
        $this->cleanConditionally($articleContent, 'ul');
        $this->cleanConditionally($articleContent, 'div');
        // Remove extra paragraphs. The node list is live, so iterate from the end.
        $articleParagraphs = $articleContent->getElementsByTagName('p');
        for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) {
            $item = $articleParagraphs->item($i);
            $imgCount = $item->getElementsByTagName('img')->length;
            $embedCount = $item->getElementsByTagName('embed')->length;
            $objectCount = $item->getElementsByTagName('object')->length;
            $videoCount = $item->getElementsByTagName('video')->length;
            $audioCount = $item->getElementsByTagName('audio')->length;
            $iframeCount = $item->getElementsByTagName('iframe')->length;
            // A paragraph with no media and only whitespace text is considered empty.
            if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
                $item->parentNode->removeChild($item);

                // Nothing left to do for a removed paragraph (it had no iframe anyway).
                continue;
            }
            // add extra text to iframe tag to avoid an auto-closing iframe and then break the html code
            if ($iframeCount) {
                $iframe = $item->getElementsByTagName('iframe');
                $iframe->item(0)->nodeValue = ' ';
                // The paragraph itself is replaced by its first iframe.
                $item->parentNode->replaceChild($iframe->item(0), $item);
            }
        }
        if (!$this->flagIsActive(self::FLAG_DISABLE_POSTFILTER)) {
            try {
                foreach ($this->post_filters as $search => $replace) {
                    $filtered = preg_replace($search, $replace, $articleContent->innerHTML);
                    /*
                     * preg_replace() reports failure by returning null instead of throwing.
                     * Writing that null back would wipe the article, so skip the filter.
                     */
                    if (null === $filtered) {
                        $this->logger->error('Cleaning output HTML failed. Ignoring: preg_replace() error code ' . preg_last_error() . ' for filter ' . $search);

                        continue;
                    }
                    $articleContent->innerHTML = $filtered;
                }
                unset($search, $replace);
            } catch (\Exception $e) {
                $this->logger->error('Cleaning output HTML failed. Ignoring: ' . $e->getMessage());
            }
        }
    }