Zend_Search_Lucene_Document_Html::__construct PHP Method

__construct() private method

Object constructor
private __construct ( string $data, boolean $isFile, boolean $storeContent, string $defaultEncoding = '' )
$data string HTML string (may be an HTML fragment), or the name of the file to read the HTML from when $isFile is true
$isFile boolean Whether $data is a file name to read the HTML from rather than the HTML itself
$storeContent boolean Whether the body text is added as a Text field (true) or as an UnStored field (false)
$defaultEncoding string HTML encoding; used if no encoding is specified with a Content-type HTTP-EQUIV meta tag.
    /**
     * Object constructor.
     *
     * Parses the given HTML with DOMDocument and fills this document with a
     * 'title' field, one field per <meta name="..."> tag found in the head and
     * a 'body' field. Link targets of <a>/<area> elements are collected into
     * $this->_links, and those of <link> elements in the head into
     * $this->_headerLinks.
     *
     * @param string  $data            HTML string (may be an HTML fragment), or a file name when $isFile is true
     * @param boolean $isFile          Whether $data is a file name to read the HTML from
     * @param boolean $storeContent    Body text is added as a Text field when true, as an UnStored field otherwise
     * @param string  $defaultEncoding HTML encoding, used if it's not specified using Content-type HTTP-EQUIV meta tag
     */
    private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
            if ($htmlData === false) {
                // Unreadable file: file_get_contents() has already raised a warning.
                // Continue with empty content instead of handing a boolean to the parser.
                $htmlData = '';
            }
        } else {
            $htmlData = (string) $data;
        }

        // Parser warnings about malformed markup are suppressed on purpose.
        // An empty string is never passed to loadHTML(): it is rejected as invalid input
        // (a warning on older PHP versions, a ValueError on PHP 8).
        if ($htmlData !== '') {
            @$this->_doc->loadHTML($htmlData);
        }

        if ($this->_doc->encoding === null) {
            // Document encoding is not recognized

            /** @todo improve HTML vs HTML fragment recognition */
            if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
                // It's an HTML document
                // Add additional HEAD section right after the opening <html> tag and recognize document
                $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

                @$this->_doc->loadHTML(
                    iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                    . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                    . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset))
                );

                // Remove additional HEAD section (guarded: the query may return no node)
                $xpath = new DOMXPath($this->_doc);
                $head  = $xpath->query('/html/head')->item(0);
                if ($head !== null && $head->parentNode !== null) {
                    $head->parentNode->removeChild($head);
                }
            } else {
                // It's an HTML fragment: wrap it into a complete UTF-8 document
                @$this->_doc->loadHTML(
                    '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                    . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                    . '</body></html>'
                );
            }
        }

        /** @todo Add correction of wrong HTML encoding recognition processing
         * The case is:
         * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
         * even $this->_doc->encoding demonstrates another recognized encoding
         */

        $xpath = new DOMXPath($this->_doc);

        // Title field
        $docTitle = '';
        foreach ($xpath->query('/html/head/title') as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

        // One field per named meta tag
        foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text(
                $metaNode->getAttribute('name'),
                $metaNode->getAttribute('content'),
                'UTF-8'
            ));
        }

        // Body field
        $docBody = '';
        foreach ($xpath->query('/html/body') as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
        }

        // Document links: <a> elements first, then <area> elements
        foreach (array('a', 'area') as $tagName) {
            foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
                $href = $linkNode->getAttribute('href');
                if ($href === '') {
                    continue;
                }

                if (self::$_excludeNoFollowLinks) {
                    // rel holds a whitespace-separated token list, so "external nofollow"
                    // must be excluded just like a bare "nofollow"
                    $relTokens = preg_split(
                        '/\s+/',
                        strtolower($linkNode->getAttribute('rel')),
                        -1,
                        PREG_SPLIT_NO_EMPTY
                    );
                    if (in_array('nofollow', $relTokens, true)) {
                        continue;
                    }
                }

                $this->_links[] = $href;
            }
        }
        $this->_links = array_unique($this->_links);

        // Header links
        foreach ($xpath->query('/html/head/link') as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href !== '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }