/**
 * Crawl and process a single URL, then recursively crawl the URLs found on it.
 *
 * Nothing is fetched when the URL is empty, is already present in
 * $this->crawled, or starts with "#".
 *
 * @param string      $url       URL to fetch.
 * @param string|null $parentUrl URL of the page on which $url was found (the
 *                               recursive call passes the current URL); it is
 *                               forwarded to validate().
 * @return null Always null; the processed page is handed to $this->add().
 */
protected function crawlUrl($url, $parentUrl = null)
{
    if (!$url || $this->crawled->search($url) !== false || Str::startsWith($url, "#")) {
        return null;
    }

    $this->log("Crawling URL: " . $url);

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT      => $this->agent,
        // Include the response headers in the returned string so they can be
        // split off below using CURLINFO_HEADER_SIZE.
        CURLOPT_HEADER         => 1,
    ]);

    $response = curl_exec($ch);
    if ($response === false) {
        // Transport-level failure: curl_exec() returned false, not a string.
        // Report the reason and fall through with an empty response so that
        // validate() still sees the URL and it is still marked as crawled.
        $this->log("Failed to fetch URL: " . $url . " (" . curl_error($ch) . ")");
        $rawHeader = '';
        $body = '';
    } else {
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $rawHeader = substr($response, 0, $headerSize);
        $body = substr($response, $headerSize);
    }
    $header = $this->parseHeader($rawHeader);
    curl_close($ch);

    // Record the URL before validating so a rejected page is not fetched again.
    $this->crawled->push($url);

    if (!$this->validate($header, $body, $url, $parentUrl)) {
        return null;
    }

    // NOTE(review): some HtmlDomParser implementations return false from
    // str_get_html() for empty or oversized input — confirm validate() rules
    // those cases out before this point.
    $processed = $this->processHtml($url, HtmlDomParser::str_get_html($body));
    $this->add($processed);

    // Recursively crawl other URLs that were found, with this page as parent.
    foreach ($processed['urls'] as $href) {
        $this->crawlUrl($href, $url);
    }
}