Graby\Graby::doFetchContent PHP Method

doFetchContent() private method

Fetch the content of a URL.
private doFetchContent ( string $url ) : array
$url string
return array With key status, html, title, language, url, content_type & open_graph
    /**
     * Fetch a URL and extract its main content.
     *
     * Steps, in order: validate the URL, fetch it, return early when the MIME
     * action handler produces a result, convert the body to UTF-8, read the
     * Open Graph data, optionally switch to the single-page view, run the
     * extractor, optionally append the following pages of a multi-page
     * article, then clean up and serialise the extracted content block.
     *
     * @param string $url
     *
     * @return array With keys status, html, title, language, url, content_type & open_graph
     *
     * @throws \Exception When the effective URL reported by the HTTP client is not allowed to be parsed
     */
    private function doFetchContent($url)
    {
        $url = $this->validateUrl($url);
        $siteConfig = $this->configBuilder->buildFromUrl($url);

        $this->logger->log('debug', 'Fetching url: {url}', array('url' => $url));

        $response = $this->httpClient->fetch($url, false, $siteConfig->http_header);

        $effectiveUrl = $response['effective_url'];
        $effectiveUrl = str_replace(' ', '%20', $effectiveUrl);

        if (!$this->isUrlAllowed($effectiveUrl)) {
            throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $effectiveUrl));
        }

        // check if action defined for returned Content-Type, like image, pdf, audio or video
        $mimeInfo = $this->getMimeActionInfo($response['headers']);
        $infos = $this->handleMimeAction($mimeInfo, $effectiveUrl, $response['body']);
        if (is_array($infos)) {
            return $infos;
        }

        $html = $this->convert2Utf8($response['body'], $response['headers']);

        // Some non-UTF-8 encodings break when converted to UTF-8. When that
        // happens the raw body (usually) starts with the bytes 0xFF 0xFE, which
        // is what "ÿþ" looks like once those bytes are read as ISO-8859-1.
        // In that case, keep the raw response instead of the converted one.
        // Comparing the two leading bytes directly avoids re-encoding the whole
        // body with utf8_encode(), which is deprecated as of PHP 8.2.
        if (0 === strncmp($response['body'], "\xFF\xFE", 2)) {
            $html = $response['body'];
        }

        $ogData = $this->extractOpenGraph($html, $effectiveUrl);
        $this->logger->log('debug', 'Opengraph data: {ogData}', array('ogData' => $ogData));

        // @TODO: log raw html + headers

        // check site config for single page URL - fetch it if found
        $isSinglePage = false;
        if ($this->config['singlepage'] && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) {
            $isSinglePage = true;
            $effectiveUrl = $singlePageResponse['effective_url'];

            // check if action defined for returned Content-Type
            $mimeInfo = $this->getMimeActionInfo($singlePageResponse['headers']);
            $infos = $this->handleMimeAction($mimeInfo, $effectiveUrl, $singlePageResponse['body']);
            if (is_array($infos)) {
                return $infos;
            }

            $html = $this->convert2Utf8($singlePageResponse['body'], $singlePageResponse['headers']);
            $this->logger->log('debug', 'Retrieved single-page view from "{url}"', array('url' => $effectiveUrl));

            unset($singlePageResponse);
        }

        // Content type of the document that is actually extracted. It is
        // resolved once here so that every return path reports the same value
        // and a missing "mime" key never triggers an undefined index notice.
        $contentType = isset($mimeInfo['mime']) ? $mimeInfo['mime'] : '';

        $this->logger->log('debug', 'Attempting to extract content');

        $extractResult = $this->extractor->process($html, $effectiveUrl);
        $readability = $this->extractor->readability;
        $contentBlock = $this->extractor->getContent();
        $extractedTitle = $this->extractor->getTitle();
        $extractedLanguage = $this->extractor->getLanguage();

        // Deal with multi-page articles. Following pages are appended to
        // $contentBlock, so there is nothing to do without a content block.
        $isMultiPage = !$isSinglePage
            && $extractResult
            && null !== $contentBlock
            && null !== $this->extractor->getNextPageUrl();

        if ($this->config['multipage'] && $isMultiPage) {
            $this->logger->log('debug', 'Attempting to process multi-page article');

            // store first page to avoid parsing it again (previous url content is in `$contentBlock`)
            $multiPageUrls = array($effectiveUrl);
            $multiPageContent = array();

            while ($nextPageUrl = $this->extractor->getNextPageUrl()) {
                $this->logger->log('debug', 'Processing next page: {url}', array('url' => $nextPageUrl));

                // If we've got URL, resolve against $effectiveUrl
                $nextPageUrl = $this->makeAbsoluteStr($effectiveUrl, $nextPageUrl);
                if (!$nextPageUrl) {
                    $this->logger->log('debug', 'Failed to resolve against: {url}', array('url' => $effectiveUrl));
                    $multiPageContent = array();
                    break;
                }

                // check it's not what we have already!
                if (in_array($nextPageUrl, $multiPageUrls, true)) {
                    $this->logger->log('debug', 'URL already processed');
                    $multiPageContent = array();
                    break;
                }

                // it's not, store it for later check & so let's attempt to fetch it
                $multiPageUrls[] = $nextPageUrl;

                // Dedicated variables are used for the following pages:
                // $response and $mimeInfo must keep describing the main
                // document because they feed the returned status/content_type.
                $pageResponse = $this->httpClient->fetch($nextPageUrl, false, $siteConfig->http_header);

                // make sure mime type is not something with a different action associated
                $pageMimeInfo = $this->getMimeActionInfo($pageResponse['headers']);
                if (isset($pageMimeInfo['action'])) {
                    $this->logger->log('debug', 'MIME type requires different action');
                    $multiPageContent = array();
                    break;
                }

                $extractSuccess = $this->extractor->process(
                    $this->convert2Utf8($pageResponse['body'], $pageResponse['headers']),
                    $nextPageUrl
                );
                if (!$extractSuccess) {
                    $this->logger->log('debug', 'Failed to extract content');
                    $multiPageContent = array();
                    break;
                }

                $multiPageContent[] = $this->extractor->getContent();
            }

            // did we successfully deal with this multi-page article?
            if (empty($multiPageContent)) {
                $this->logger->log('debug', 'Failed to extract all parts of multi-page article, so not going to include them');

                $page = $readability->dom->createElement('p');
                $page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
                $multiPageContent[] = $page;
            }

            // nodes of other pages belong to other documents: import them before appending
            foreach ($multiPageContent as $page) {
                $page = $contentBlock->ownerDocument->importNode($page, true);
                $contentBlock->appendChild($page);
            }

            unset($multiPageUrls, $multiPageContent, $nextPageUrl, $page, $pageResponse, $pageMimeInfo);
        }

        // if we failed to extract content...
        if (!$extractResult || null === $contentBlock) {
            return array(
                'status' => $response['status'],
                'html' => $this->config['error_message'],
                'title' => $extractedTitle ?: $this->config['error_message_title'],
                'language' => $extractedLanguage,
                'url' => $effectiveUrl,
                'content_type' => $contentType,
                'open_graph' => $ogData,
            );
        }

        $readability->clean($contentBlock, 'select');

        if ($this->config['rewrite_relative_urls']) {
            $this->makeAbsolute($effectiveUrl, $contentBlock);
        }

        // footnotes
        if ($this->config['content_links'] == 'footnotes' && false === strpos($effectiveUrl, 'wikipedia.org')) {
            $readability->addFootnotes($contentBlock);
        }

        // normalise
        $contentBlock->normalize();

        // Remove empty text nodes. childNodes is a live list: removing a node
        // while iterating it forward disturbs the traversal and leaves some
        // empty nodes behind, so walk it backwards by index instead.
        for ($i = $contentBlock->childNodes->length - 1; $i >= 0; --$i) {
            $node = $contentBlock->childNodes->item($i);
            if (XML_TEXT_NODE === $node->nodeType && '' === trim($node->textContent)) {
                $contentBlock->removeChild($node);
            }
        }

        // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
        while (1 === $contentBlock->childNodes->length && XML_ELEMENT_NODE === $contentBlock->firstChild->nodeType) {
            // only follow these tag names
            if (!in_array(strtolower($contentBlock->tagName), array('div', 'article', 'section', 'header', 'footer'), true)) {
                break;
            }

            $contentBlock = $contentBlock->firstChild;
        }

        // convert content block to HTML string
        // Need to preserve things like body: //img[@id='feature']
        if (in_array(strtolower($contentBlock->tagName), array('div', 'article', 'section', 'header', 'footer', 'li', 'td'), true)) {
            $html = $contentBlock->innerHTML;
        } else {
            // essentially outerHTML
            $html = $contentBlock->ownerDocument->saveXML($contentBlock);
        }

        unset($contentBlock);

        // post-processing cleanup
        // preg_replace() returns null on failure (e.g. malformed UTF-8 with the
        // "u" modifier); keep the unmodified HTML rather than losing everything
        $withoutEmptyParagraphs = preg_replace('!<p>[\\s\\h\\v]*</p>!u', '', $html);
        if (null !== $withoutEmptyParagraphs) {
            $html = $withoutEmptyParagraphs;
        }

        if ($this->config['content_links'] == 'remove') {
            $html = preg_replace('!</?a[^>]*>!', '', $html);
        }

        $this->logger->log('debug', 'Returning data (most interesting ones): {data}', array('data' => array(
            'title' => $extractedTitle,
            'language' => $extractedLanguage,
            'url' => $effectiveUrl,
            'content_type' => $contentType,
        )));

        return array(
            'status' => $response['status'],
            'html' => trim($html),
            'title' => $extractedTitle ?: $this->config['error_message_title'],
            'language' => $extractedLanguage,
            'url' => $effectiveUrl,
            'content_type' => $contentType,
            'open_graph' => $ogData,
        );
    }