public addHtmlContent ( string $content, string $charset = 'UTF-8' ) | ||
$content | string | The HTML content to parse |
$charset | string | The character set of the content (defaults to 'UTF-8') |
public function addHtmlContent($content, $charset = 'UTF-8')
{
$internalErrors = libxml_use_internal_errors(true);
$disableEntities = libxml_disable_entity_loader(true);
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
set_error_handler(function () {
throw new \Exception();
});
try {
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
} catch (\Exception $e) {
}
restore_error_handler();
if ('' !== trim($content)) {
@$dom->loadHTML($content);
}
libxml_use_internal_errors($internalErrors);
libxml_disable_entity_loader($disableEntities);
$this->addDocument($dom);
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
$baseHref = current($base);
if (count($base) && !empty($baseHref)) {
if ($this->baseHref) {
$linkNode = $dom->createElement('a');
$linkNode->setAttribute('href', $baseHref);
$link = new Link($linkNode, $this->baseHref);
$this->baseHref = $link->getUri();
} else {
$this->baseHref = $baseHref;
}
}
}
/** * Process the DOM * * @return array * @throws Exception */ public function process() { // Check if HTML content is already set $this->checkIfContentIsEmpty($this->html); $items = []; $total = 0; $prepareItems = function (Crawler $nodeCrawler, $i) use(&$items, &$total) { $title = $nodeCrawler->filter('h3 > a'); $link = $nodeCrawler->filter('h3 > a')->attr('href'); $price = $nodeCrawler->filter('p.pricePerUnit')->text(); $descriptionPage = $this->fetch($link); //prepare items array $items[$i]['title'] = trim($title->text()); $items[$i]['size'] = $this->sizeOf($descriptionPage); $items[$i]['unit_price'] = $this->format($price); $items[$i]['description'] = $this->getDescriptionFor($descriptionPage); $total += $items[$i]['unit_price']; }; // bind the closure to the object context // so we can access the object inside the closure $prepareItems->bindTo($this); $this->domCrawler->addHtmlContent($this->html); $this->domCrawler->filter('ul.productLister > li')->each($prepareItems); $this->items = $items; $this->total = number_format($total, 2); unset($items); unset($total); return ['items' => $this->items, 'total' => $this->total]; }