/**
 * Collect information about every <a> element found in a page.
 *
 * Links whose normalized form is already present in $this->links are skipped.
 * Each remaining link is stored under its normalized form and carries:
 *  - original_urls: the raw href values seen for that link
 *  - links_text:    the trimmed anchor texts seen for that link
 *  - crawlable links additionally get absolute_url, external_link,
 *    visited (false) and frequency (occurrences within this page)
 *  - non-crawlable links get dont_visit = true and external_link = false
 *
 * Relative hrefs are resolved against $this->baseUrl, which is expected to be
 * an absolute URL. Resolution is deliberately simple: "../" and "./" segments
 * are appended as-is, not collapsed.
 *
 * @param DomCrawler $crawler Crawler holding the parsed page
 * @param string     $url     Identifier of the page being processed
 *
 * @return array Link information keyed by normalized link
 */
protected function extractLinksInfo(DomCrawler $crawler, $url)
{
    $childLinks = array();

    // Split the base URL once so relative hrefs are resolved against its real
    // components. Slicing the raw string at its last "/" breaks when the base
    // has no path ("http://example.com") or has a "/" inside its query string.
    $base = parse_url($this->baseUrl);
    if (is_array($base) === false) {
        $base = array();
    }
    $scheme = isset($base['scheme']) ? $base['scheme'] : '';
    $host = isset($base['host']) ? $base['host'] : '';
    // Keep a non-default port, otherwise links on e.g. localhost:8080 point to the wrong server
    $port = isset($base['port']) ? ':' . $base['port'] : '';
    $origin = $scheme . '://' . $host . $port;

    // Directory part of the base path, always ending with "/"
    $basePath = isset($base['path']) ? $base['path'] : '/';
    $lastSlash = strrpos($basePath, '/');
    $baseDir = ($lastSlash === false) ? '/' : substr($basePath, 0, $lastSlash + 1);

    $crawler->filter('a')->each(function (DomCrawler $node) use (&$childLinks, $scheme, $origin, $baseDir) {
        $node_text = trim($node->text());
        // attr() yields null for anchors without an href; work with a string from here on
        $node_url = (string) $node->attr('href');
        $node_url_is_crawlable = $this->checkIfCrawlable($node_url);
        $hash = $this->normalizeLink($node_url);

        if (isset($this->links[$hash]) === true) {
            // Already known to the crawler
            return;
        }

        $childLinks[$hash]['original_urls'][$node_url] = $node_url;
        $childLinks[$hash]['links_text'][$node_text] = $node_text;

        if ($node_url_is_crawlable !== true) {
            $childLinks[$hash]['dont_visit'] = true;
            $childLinks[$hash]['external_link'] = false;
            return;
        }

        // Ensure URL is formatted as absolute
        if (preg_match('@^https?://@i', $node_url) === 1) {
            // Already absolute. Requiring "://" keeps relative files such as
            // "http-guide.html" from being mistaken for absolute URLs.
            $absolute_url = $node_url;
        } elseif (strpos($node_url, '//') === 0) {
            // Protocol-relative ("//host/path"): inherit only the base scheme
            $absolute_url = $scheme . ':' . $node_url;
        } elseif (strpos($node_url, '/') === 0) {
            // Root-relative: replace the whole base path
            $absolute_url = $origin . $node_url;
        } else {
            // Document-relative: append to the base directory
            $absolute_url = $origin . $baseDir . $node_url;
        }
        $childLinks[$hash]['absolute_url'] = $absolute_url;

        // Is this an external URL?
        $childLinks[$hash]['external_link'] = $this->checkIfExternal($absolute_url);

        // Additional metadata
        $childLinks[$hash]['visited'] = false;
        $childLinks[$hash]['frequency'] = isset($childLinks[$hash]['frequency'])
            ? $childLinks[$hash]['frequency'] + 1
            : 1;
    });

    // Avoid cyclic loops with pages that link to themselves.
    // NOTE(review): keys are normalizeLink() results while $url is used as given;
    // this only matches if the caller passes $url in the same normalized form - confirm.
    if (isset($childLinks[$url]) === true) {
        $childLinks[$url]['visited'] = true;
    }

    return $childLinks;
}