/**
 * Post-processes the extracted article node in place: strips styling, junk
 * elements, redundant headers and empty paragraphs, then applies the
 * configured post filters to the resulting HTML.
 *
 * Nothing is returned; $articleContent itself is mutated.
 *
 * @param \DOMElement $articleContent Root node of the article. It must also
 *                                    expose a readable/writable `innerHTML`
 *                                    property, which plain \DOMElement does not
 *                                    provide (NOTE(review): presumably a
 *                                    project subclass; confirm against callers).
 */
public function prepArticle($articleContent)
{
    $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');

    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);

    $document = $articleContent->ownerDocument;
    $xpath = new \DOMXPath($document);

    if ($this->revertForcedParagraphElements) {
        /*
         * Reverts P elements carrying the 'data-readability-styled' attribute
         * back to text nodes, which is what they were before.
         */
        $styled = $xpath->query('.//p[@data-readability-styled]', $articleContent);
        // Walk backwards so replacing a node never shifts an index still to be visited.
        for ($i = $styled->length - 1; $i >= 0; --$i) {
            $paragraph = $styled->item($i);
            $paragraph->parentNode->replaceChild($document->createTextNode($paragraph->textContent), $paragraph);
        }
    }

    // Remove service data-candidate attribute.
    $candidates = $xpath->query('.//*[@data-candidate]', $articleContent);
    for ($i = $candidates->length - 1; $i >= 0; --$i) {
        $candidates->item($i)->removeAttribute('data-candidate');
    }

    // Clean out junk from the article content (order preserved from the original call sequence).
    foreach (['input', 'button', 'nav', 'object', 'iframe', 'canvas', 'h1'] as $tag) {
        $this->clean($articleContent, $tag);
    }

    /*
     * If there is only one h2 (with fewer than 100 characters of text), they are
     * probably using it as a main header, so remove it since we already have a header.
     */
    $h2s = $articleContent->getElementsByTagName('h2');
    if ($h2s->length === 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
        $this->clean($articleContent, 'h2');
    }

    $this->cleanHeaders($articleContent);

    // Do these last as the previous stuff may have removed junk that will affect these.
    foreach (['form', 'table', 'ul', 'div'] as $tag) {
        $this->cleanConditionally($articleContent, $tag);
    }

    // Remove extra paragraphs. The node list is live, so iterate from the end.
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) {
        $item = $articleParagraphs->item($i);

        $iframes = $item->getElementsByTagName('iframe');
        if ($iframes->length > 0) {
            // Add extra text to the iframe tag to avoid an auto-closing iframe that would break the html code.
            // The whole paragraph is then replaced by its first iframe; any other
            // content the paragraph held is dropped along with it.
            $iframe = $iframes->item(0);
            $iframe->nodeValue = ' ';
            $item->parentNode->replaceChild($iframe, $item);

            continue;
        }

        // A paragraph holding any media element is kept even when it has no text.
        $hasMedia = false;
        foreach (['img', 'embed', 'object', 'video', 'audio'] as $tag) {
            if ($item->getElementsByTagName($tag)->length > 0) {
                $hasMedia = true;

                break;
            }
        }

        // Drop paragraphs whose text is empty once all whitespace is stripped.
        if (!$hasMedia && mb_strlen(preg_replace('/\\s+/is', '', $this->getInnerText($item, false, false))) === 0) {
            $item->parentNode->removeChild($item);
        }
    }

    if (!$this->flagIsActive(self::FLAG_DISABLE_POSTFILTER)) {
        try {
            foreach ($this->post_filters as $search => $replace) {
                $filtered = preg_replace($search, $replace, $articleContent->innerHTML);

                // preg_replace() signals a PCRE failure by returning null instead of
                // throwing; writing that null to innerHTML would discard the article,
                // so skip the offending filter and keep the current markup.
                if ($filtered === null) {
                    $this->logger->error('Cleaning output HTML failed for filter ' . $search . ' (PCRE error code ' . preg_last_error() . '). Ignoring.');

                    continue;
                }

                $articleContent->innerHTML = $filtered;
            }
            unset($search, $replace);
        } catch (\Exception $e) {
            $this->logger->error('Cleaning output HTML failed. Ignoring: ' . $e->getMessage());
        }
    }
}