Arachnid\Crawler::extractLinksInfo PHP Method

extractLinksInfo() protected method

Extracts information about the links (`<a>` elements) found in the crawled document for the given URL.
protected extractLinksInfo ( Crawler $crawler, string $url ) : array
$crawler Symfony\Component\DomCrawler\Crawler
$url string
return array
    /**
     * Extract information about every anchor (<a>) element in the given document.
     *
     * Each link is keyed by the value of $this->normalizeLink() for its href.
     * Links whose key is already present in $this->links are skipped entirely.
     * For every remaining link the result records the raw href(s) and link
     * text(s); links accepted by $this->checkIfCrawlable() additionally get an
     * absolute URL, the result of $this->checkIfExternal() for that URL, a
     * "visited" flag and an occurrence counter. Other links are flagged with
     * "dont_visit".
     *
     * Absolute URL resolution (all relative forms are resolved against
     * $this->baseUrl):
     *  - "http://..." / "https://..." (case-insensitive) is used as is;
     *  - "//host/path" inherits the scheme of the base URL;
     *  - "/path" is appended to scheme://host[:port] of the base URL;
     *  - anything else is appended to the directory part of the base URL.
     *
     * @param DomCrawler $crawler Document whose anchors are inspected
     * @param string     $url     Key that is marked as visited when it appears in the result
     *
     * @return array Link information keyed by normalized link
     */
    protected function extractLinksInfo(DomCrawler $crawler, $url)
    {
        $childLinks = array();

        // The base URL does not change while iterating, so split it only once.
        $parsedBase = parse_url($this->baseUrl);
        $baseScheme = 'http';
        $baseOrigin = '';
        if (is_array($parsedBase) === true && isset($parsedBase['scheme'], $parsedBase['host']) === true) {
            $baseScheme = $parsedBase['scheme'];
            $baseOrigin = $baseScheme . '://' . $parsedBase['host'];
            // Keep a non-default port, otherwise root-relative links point at the wrong server.
            if (isset($parsedBase['port']) === true) {
                $baseOrigin .= ':' . $parsedBase['port'];
            }
        }

        // Directory part of the base URL (without trailing slash). The search for
        // the last "/" starts after "scheme://" so that a base URL without a path
        // (e.g. "http://example.com") is not cut inside the scheme separator.
        $authorityMarker = strpos($this->baseUrl, '://');
        $searchFrom = $authorityMarker === false ? 0 : $authorityMarker + 3;
        $lastSlash = strrpos($this->baseUrl, '/', $searchFrom);
        $baseDirectory = $lastSlash === false ? $this->baseUrl : substr($this->baseUrl, 0, $lastSlash);

        $crawler->filter('a')->each(function (DomCrawler $node) use (&$childLinks, $baseScheme, $baseOrigin, $baseDirectory) {
            $nodeText = trim($node->text());
            // attr() returns null for anchors without an href; normalise to an
            // empty string so the string functions below never receive null.
            $nodeUrl = (string) $node->attr('href');
            $isCrawlable = $this->checkIfCrawlable($nodeUrl);
            $hash = $this->normalizeLink($nodeUrl);

            // Links that are already registered are not reported again.
            if (isset($this->links[$hash]) === true) {
                return;
            }

            $childLinks[$hash]['original_urls'][$nodeUrl] = $nodeUrl;
            $childLinks[$hash]['links_text'][$nodeText] = $nodeText;

            if ($isCrawlable !== true) {
                $childLinks[$hash]['dont_visit'] = true;
                $childLinks[$hash]['external_link'] = false;

                return;
            }

            // Ensure URL is formatted as absolute.
            // NOTE(review): relative links are resolved against $this->baseUrl, not
            // against the page they were found on - confirm this is intended.
            if (preg_match('@^https?://@i', $nodeUrl) === 1) {
                $absoluteUrl = $nodeUrl;
            } elseif (strpos($nodeUrl, '//') === 0) {
                // Protocol-relative link: only the scheme is missing.
                $absoluteUrl = $baseScheme . ':' . $nodeUrl;
            } elseif (strpos($nodeUrl, '/') === 0) {
                $absoluteUrl = $baseOrigin . $nodeUrl;
            } else {
                $absoluteUrl = $baseDirectory . '/' . $nodeUrl;
            }
            $childLinks[$hash]['absolute_url'] = $absoluteUrl;

            // Is this an external URL?
            $childLinks[$hash]['external_link'] = $this->checkIfExternal($absoluteUrl);

            // Additional metadata
            $childLinks[$hash]['visited'] = false;
            $childLinks[$hash]['frequency'] = isset($childLinks[$hash]['frequency'])
                ? $childLinks[$hash]['frequency'] + 1
                : 1;
        });

        // Avoid cyclic loops with pages that link to themselves.
        // NOTE(review): result keys come from normalizeLink(href) while $url is used
        // as given - presumably callers pass an already normalized key; verify.
        if (isset($childLinks[$url]) === true) {
            $childLinks[$url]['visited'] = true;
        }

        return $childLinks;
    }