Readability\Readability::loadHtml PHP Méthode

Readability Class Documentation Afficher le fichier Open project: j0k3r/php-readability

loadHtml() private méthode

Applies the pre-filters, optionally cleans up the HTML with Tidy, then parses the result into a DOM document.

private loadHtml ( )

    /**
     * Prepare the raw HTML held in $this->html and parse it into $this->dom.
     *
     * Steps, in order:
     *  1. Keep an untouched copy of the input in $this->original_html.
     *  2. Build $this->domainRegExp from the host of $this->url (leading "www", "www2", ... removed).
     *  3. Run the regex pre-filters unless FLAG_DISABLE_PREFILTER is active.
     *  4. Optionally run the markup through Tidy ($this->useTidy).
     *  5. Encode non-ASCII characters as numeric entities and parse the markup with
     *     html5lib (when selected and successful) or DOMDocument.
     *  6. Register Readability\JSLikeHTMLElement as the element class of the document.
     *
     * The method mutates the object and the process-wide mbstring encoding settings;
     * it returns nothing.
     */
    private function loadHtml()
    {
        $this->original_html = $this->html;
        $this->logger->debug('Parsing URL: ' . $this->url);

        if ($this->url) {
            // parse_url() yields null/false when no host can be extracted; cast so the
            // string functions below always receive a string.
            $host = (string) parse_url($this->url, PHP_URL_HOST);
            $host = (string) preg_replace('/www\\d*\\./', '', $host);
            // preg_quote() escapes every regex metacharacter (not only "."), so unusual
            // hosts such as "[::1]" cannot produce an invalid pattern.
            $this->domainRegExp = '/' . preg_quote($host, '/') . '/';
        }

        mb_internal_encoding('UTF-8');
        mb_http_output('UTF-8');
        mb_regex_encoding('UTF-8');

        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
            foreach ($this->pre_filters as $search => $replace) {
                $filtered = preg_replace($search, $replace, $this->html);

                // preg_replace() returns null on a PCRE error (bad pattern, backtrack limit...).
                // Keep the current markup instead of silently wiping the whole document.
                if (null === $filtered) {
                    $this->logger->debug('Pre-filter failed, skipping: ' . $search);
                    continue;
                }

                $this->html = $filtered;
            }
            unset($search, $replace, $filtered);
        }

        // Give the parsers a minimal document when there is nothing to parse.
        if (trim((string) $this->html) === '') {
            $this->html = '<html></html>';
        }

        /*
         * Use tidy (if it exists).
         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
         * Although sometimes it makes matters worse, which is why there is an option to disable it.
         */
        if ($this->useTidy) {
            $this->logger->debug('Tidying document');
            $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');

            // tidy_parse_string() returns false on failure; only repair a real tidy object.
            if (false !== $tidy && tidy_clean_repair($tidy)) {
                $this->tidied = true;
                $this->html = $tidy->value;
                // Collapse every run of CR/LF characters into a single newline.
                $this->html = preg_replace('/[\\r\\n]+/is', "\n", $this->html);
            }
            unset($tidy);
        }

        // Turn every non-ASCII character into a numeric entity so the parser is not
        // affected by charset detection. This replaces the 'HTML-ENTITIES' pseudo-encoding
        // of mb_convert_encoding(), which is deprecated since PHP 8.2.
        $this->html = mb_encode_numericentity(
            (string) $this->html,
            array(0x80, 0x10FFFF, 0, 0x1FFFFF),
            'UTF-8'
        );

        // Prefer html5lib when it is selected and returns a document; otherwise fall back to DOMDocument.
        if (!($this->parser === 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) {
            // Remember the caller's libxml error mode so it can be restored afterwards
            // instead of being forced to "off".
            $previousErrorMode = libxml_use_internal_errors(true);

            $this->dom = new \DOMDocument();
            $this->dom->preserveWhiteSpace = false;

            if (PHP_VERSION_ID >= 50400) {
                // The options argument of loadHTML() is only passed on PHP 5.4 and later.
                $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
            } else {
                $this->dom->loadHTML($this->html);
            }

            libxml_use_internal_errors($previousErrorMode);
        }

        $this->dom->registerNodeClass('DOMElement', 'Readability\\JSLikeHTMLElement');
    }

Readability

__construct

addFlag

addFootnotes

addPostFilter

addPreFilter

clean

cleanConditionally

cleanHeaders

cleanStyles

dbg

dump_dbg