Goose\Crawler::crawl PHP Method

crawl() public method

public crawl ( string $url, string | null $rawHTML = null ) : Goose\Article
$url string
$rawHTML string | null
return Goose\Article
    /**
     * Build an Article for the given URL by running the cleaner, extractor
     * and formatter module groups over its HTML.
     *
     * libxml internal error handling is switched on for the duration of the
     * call and the previous setting is restored before returning, including
     * when any step throws.
     *
     * @param string      $url     Address of the page to crawl.
     * @param string|null $rawHTML Markup to use instead of fetching it. When
     *                             this is empty() (null, '' or '0') the HTML
     *                             is obtained through getHTML().
     *
     * @return Article The populated article.
     */
    public function crawl($url, $rawHTML = null)
    {
        $article = new Article();
        $parseCandidate = Helper::getCleanedUrl($url);

        // Remember the caller's libxml error mode so it can be put back.
        $xmlInternalErrors = libxml_use_internal_errors(true);

        try {
            if (empty($rawHTML)) {
                $rawHTML = $this->getHTML($parseCandidate->url);
            }

            // Generate document
            $doc = $this->getDocument($rawHTML);

            // Set core mutators
            $article->setFinalUrl($parseCandidate->url);
            $article->setDomain($parseCandidate->parts->host);
            $article->setLinkhash($parseCandidate->linkhash);
            $article->setRawHtml($rawHTML);
            $article->setDoc($doc);
            // Independent copy of the document, taken before the module
            // groups below are given the article.
            $article->setRawDoc(clone $doc);

            // Pre-extraction document cleaning
            $this->modules('cleaners', $article);

            // Extract content
            $this->modules('extractors', $article);

            // Post-extraction content formatting
            $this->modules('formatters', $article);
        } finally {
            // Restore the previous libxml mode even if fetching, parsing or a
            // module threw; otherwise the changed setting would leak into
            // the calling code.
            libxml_use_internal_errors($xmlInternalErrors);
        }

        return $article;
    }