/**
 * Parses HTML and populates this document's fields and link lists.
 *
 * The HTML is loaded into a DOMDocument. When DOM does not report an encoding
 * for the document, the data is converted from $defaultEncoding to UTF-8 and
 * re-parsed with an injected UTF-8 Content-type META tag.
 *
 * Fields added: 'title' (text of /html/head/title nodes), one field per
 * /html/head/meta[@name] node (named after the meta name), and 'body'.
 * Collected as well: href values of A and AREA elements ($this->_links) and of
 * /html/head/link elements ($this->_headerLinks), both de-duplicated.
 *
 * @param string  $data            HTML string, or a file name when $isFile is true
 * @param boolean $isFile          Whether $data is a file name to read the HTML from
 * @param boolean $storeContent    If true the body is added as a Text field,
 *                                 otherwise as an UnStored field
 * @param string  $defaultEncoding Source encoding passed to iconv() when the
 *                                 document encoding is not recognized
 */
private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
{
    $this->_doc = new DOMDocument();
    $this->_doc->substituteEntities = true;

    $htmlData = $isFile ? file_get_contents($data) : $data;
    if ($htmlData === false || $htmlData === null) {
        // Unreadable file / missing data: keep the best-effort behaviour and
        // index it as an empty document instead of passing a non-string on.
        $htmlData = '';
    }

    // loadHTML() rejects an empty source (ValueError since PHP 8), so skip the
    // first parse for empty input; the fragment branch below still builds a
    // valid (empty) document.
    if ($htmlData !== '') {
        @$this->_doc->loadHTML($htmlData);
    }

    if ($this->_doc->encoding === null) {
        // Document encoding is not recognized

        /** @todo improve HTML vs HTML fragment recognition */
        if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
            // It's an HTML document
            // Add additional HEAD section right after the <html ...> tag and recognize document
            $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

            $beforeInjectedHead = iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset));
            $afterInjectedHead  = iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset));

            @$this->_doc->loadHTML(
                $beforeInjectedHead
                . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                . $afterInjectedHead
            );

            // Remove additional HEAD section
            $xpath = new DOMXPath($this->_doc);
            $head  = $xpath->query('/html/head')->item(0);
            // item(0) is null when the re-parse produced no /html/head node
            if ($head !== null && $head->parentNode !== null) {
                $head->parentNode->removeChild($head);
            }
        } else {
            // It's an HTML fragment
            @$this->_doc->loadHTML(
                '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                . '</body></html>'
            );
        }
    }

    /** @todo Add correction of wrong HTML encoding recognition processing
     * The case is:
     * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
     * even $this->_doc->encoding demonstrates another recognized encoding
     */

    $xpath = new DOMXPath($this->_doc);

    $docTitle = '';
    $titleNodes = $xpath->query('/html/head/title');
    foreach ($titleNodes as $titleNode) {
        // title should always have only one entry, but we process all nodeset entries
        $docTitle .= $titleNode->nodeValue . ' ';
    }
    $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

    $metaNodes = $xpath->query('/html/head/meta[@name]');
    foreach ($metaNodes as $metaNode) {
        $this->addField(Zend_Search_Lucene_Field::Text(
            $metaNode->getAttribute('name'),
            $metaNode->getAttribute('content'),
            'UTF-8'
        ));
    }

    $docBody = '';
    $bodyNodes = $xpath->query('/html/body');
    foreach ($bodyNodes as $bodyNode) {
        // body should always have only one entry, but we process all nodeset entries
        $this->_retrieveNodeText($bodyNode, $docBody);
    }
    if ($storeContent) {
        $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
    } else {
        $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
    }

    // Collect hyperlinks: all A elements first, then all AREA elements
    foreach (array('a', 'area') as $tagName) {
        foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href === '') {
                continue;
            }

            if (self::$_excludeNoFollowLinks) {
                // rel is a space-separated token list, so "external nofollow"
                // must be excluded as well as a bare "nofollow"
                $relTokens = preg_split(
                    '/\s+/',
                    strtolower($linkNode->getAttribute('rel')),
                    -1,
                    PREG_SPLIT_NO_EMPTY
                );
                if (in_array('nofollow', $relTokens, true)) {
                    continue;
                }
            }

            $this->_links[] = $href;
        }
    }
    $this->_links = array_unique($this->_links);

    $linkNodes = $xpath->query('/html/head/link');
    foreach ($linkNodes as $linkNode) {
        $href = $linkNode->getAttribute('href');
        if ($href !== '') {
            $this->_headerLinks[] = $href;
        }
    }
    $this->_headerLinks = array_unique($this->_headerLinks);
}