/**
 * Prepare the raw markup held in $this->html and parse it into $this->dom.
 *
 * Steps, in order:
 *  - keep an untouched copy of the markup in $this->original_html;
 *  - when a URL is set, derive $this->domainRegExp from its host name;
 *  - force the mbstring internal/output/regex encodings to UTF-8;
 *  - apply the regex pre-filters unless FLAG_DISABLE_PREFILTER is active;
 *  - substitute a minimal document when the markup is empty;
 *  - optionally run the markup through tidy;
 *  - turn non-ASCII characters into numeric HTML entities;
 *  - parse with html5lib when selected (and it yields a truthy result),
 *    otherwise with DOMDocument;
 *  - register Readability\JSLikeHTMLElement as the DOMElement node class.
 *
 * Side effects: changes the global mbstring encodings for the rest of the request.
 */
private function loadHtml()
{
    $this->original_html = $this->html;
    $this->logger->debug('Parsing URL: ' . $this->url);

    if ($this->url) {
        // Cast to string so a URL without a host does not feed null/false to the
        // string functions below (deprecated since PHP 8.1).
        $host = (string) parse_url($this->url, PHP_URL_HOST);
        $host = (string) preg_replace('/www\\d*\\./', '', $host);
        // preg_quote() escapes every regex metacharacter (and the delimiter),
        // not only the dots.
        $this->domainRegExp = '/' . preg_quote($host, '/') . '/';
    }

    mb_internal_encoding('UTF-8');
    mb_http_output('UTF-8');
    mb_regex_encoding('UTF-8');

    // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
    if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
        foreach ($this->pre_filters as $search => $replace) {
            $filtered = preg_replace($search, $replace, $this->html);
            // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
            // keep the previous markup instead of wiping the whole document.
            if (null !== $filtered) {
                $this->html = $filtered;
            }
        }
        unset($search, $replace, $filtered);
    }

    if ('' === trim((string) $this->html)) {
        $this->html = '<html></html>';
    }

    /*
     * Use tidy (if it exists).
     * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
     * Although sometimes it makes matters worse, which is why there is an option to disable it.
     */
    if ($this->useTidy) {
        $this->logger->debug('Tidying document');
        $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
        // tidy_parse_string() returns false on failure; only repair a real tidy instance.
        if (false !== $tidy && tidy_clean_repair($tidy)) {
            $this->tidied = true;
            $this->html = $tidy->value;
            // Collapse runs of CR/LF into a single newline.
            $collapsed = preg_replace('/[\\r\\n]+/is', "\n", $this->html);
            if (null !== $collapsed) {
                $this->html = $collapsed;
            }
            unset($collapsed);
        }
        unset($tidy);
    }

    // Encode every code point >= 0x80 as a numeric entity so the parser does not
    // have to guess the document encoding. This replaces
    // mb_convert_encoding(..., 'HTML-ENTITIES', ...), which is deprecated since PHP 8.2.
    $this->html = mb_encode_numericentity($this->html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    if (!('html5lib' === $this->parser && ($this->dom = \HTML5_Parser::parse($this->html)))) {
        // Collect libxml errors internally while parsing (the HTML we get is rarely valid),
        // remembering the caller's setting so it can be restored afterwards.
        $previousLibxmlSetting = libxml_use_internal_errors(true);

        $this->dom = new \DOMDocument();
        $this->dom->preserveWhiteSpace = false;

        // The $options argument of loadHTML() is only available from PHP 5.4.
        if (PHP_VERSION_ID >= 50400) {
            $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
        } else {
            $this->dom->loadHTML($this->html);
        }

        // Drop whatever was collected so the error buffer does not grow across calls,
        // then hand the global setting back as we found it.
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlSetting);
    }

    $this->dom->registerNodeClass('DOMElement', 'Readability\\JSLikeHTMLElement');
}