/**
 * Fetch the given url and extract its main content.
 *
 * Steps, in order: validate the url and build its site config, fetch it,
 * short-circuit when the Content-Type has a dedicated action, convert the
 * body to UTF-8, optionally switch to a "single page" view, run the
 * extractor, optionally append the following pages of a multi-page
 * article, then clean up and serialize the extracted content block.
 *
 * @param string $url Url to fetch
 *
 * @return array Either the array returned by handleMimeAction() when the
 *               Content-Type has a dedicated action, or an array with the keys
 *               status, html, title, language, url, content_type and open_graph.
 *               When extraction fails, html holds the configured error message.
 *
 * @throws \Exception When the effective url (after fetching) is not allowed
 */
private function doFetchContent($url)
{
    $url = $this->validateUrl($url);
    $siteConfig = $this->configBuilder->buildFromUrl($url);

    $this->logger->log('debug', 'Fetching url: {url}', array('url' => $url));
    $response = $this->httpClient->fetch($url, false, $siteConfig->http_header);

    $effectiveUrl = $response['effective_url'];
    $effectiveUrl = str_replace(' ', '%20', $effectiveUrl);
    if (!$this->isUrlAllowed($effectiveUrl)) {
        throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $effectiveUrl));
    }

    // check if action defined for returned Content-Type, like image, pdf, audio or video
    $mimeInfo = $this->getMimeActionInfo($response['headers']);
    $infos = $this->handleMimeAction($mimeInfo, $effectiveUrl, $response['body']);
    if (is_array($infos)) {
        return $infos;
    }

    $html = $this->convert2Utf8($response['body'], $response['headers']);

    // Some non-UTF-8 encodings might break after being converted to UTF-8.
    // When that happens the raw body (usually) starts with the bytes 0xFF 0xFE
    // (rendered as "ÿþ" in ISO-8859-1); in that case we keep the raw response
    // instead of the forced UTF-8 one.
    // The bytes are compared directly: it avoids transcoding the whole body with
    // utf8_encode() (deprecated since PHP 8.2) only to look at its first two bytes.
    if (0 === strncmp($response['body'], "\xFF\xFE", 2)) {
        $html = $response['body'];
    }

    $ogData = $this->extractOpenGraph($html, $effectiveUrl);
    $this->logger->log('debug', 'Opengraph data: {ogData}', array('ogData' => $ogData));

    // @TODO: log raw html + headers

    // check site config for single page URL - fetch it if found
    $isSinglePage = false;
    if ($this->config['singlepage'] && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) {
        $isSinglePage = true;
        $effectiveUrl = $singlePageResponse['effective_url'];

        // check if action defined for returned Content-Type
        $mimeInfo = $this->getMimeActionInfo($singlePageResponse['headers']);
        $infos = $this->handleMimeAction($mimeInfo, $effectiveUrl, $singlePageResponse['body']);
        if (is_array($infos)) {
            return $infos;
        }

        $html = $this->convert2Utf8($singlePageResponse['body'], $singlePageResponse['headers']);
        $this->logger->log('debug', 'Retrieved single-page view from "{url}"', array('url' => $effectiveUrl));
        unset($singlePageResponse);
    }

    $this->logger->log('debug', 'Attempting to extract content');
    $extractResult = $this->extractor->process($html, $effectiveUrl);

    // Snapshot everything we need from the extractor now: the multi-page loop
    // below calls process() again on the same extractor.
    $readability = $this->extractor->readability;
    $contentBlock = $this->extractor->getContent();
    $extractedTitle = $this->extractor->getTitle();
    $extractedLanguage = $this->extractor->getLanguage();

    // Deal with multi-page articles.
    // The content block must exist because the following pages are appended to it.
    $isMultiPage = !$isSinglePage
        && $extractResult
        && null !== $contentBlock
        && null !== $this->extractor->getNextPageUrl();

    if ($this->config['multipage'] && $isMultiPage) {
        $this->logger->log('debug', 'Attempting to process multi-page article');

        // store first page to avoid parsing it again (previous url content is in `$contentBlock`)
        $multiPageUrls = array($effectiveUrl);
        $multiPageContent = array();

        while ($nextPageUrl = $this->extractor->getNextPageUrl()) {
            $this->logger->log('debug', 'Processing next page: {url}', array('url' => $nextPageUrl));

            // If we've got URL, resolve against $effectiveUrl
            $nextPageUrl = $this->makeAbsoluteStr($effectiveUrl, $nextPageUrl);
            if (!$nextPageUrl) {
                $this->logger->log('debug', 'Failed to resolve against: {url}', array('url' => $effectiveUrl));
                $multiPageContent = array();
                break;
            }

            // check it's not what we have already!
            if (in_array($nextPageUrl, $multiPageUrls)) {
                $this->logger->log('debug', 'URL already processed');
                $multiPageContent = array();
                break;
            }

            // it's not, store it for later check & so let's attempt to fetch it
            $multiPageUrls[] = $nextPageUrl;

            // Dedicated variables on purpose: `$response` and `$mimeInfo` describe
            // the main page and are used to build the final result (status and
            // content_type), so they must not be overwritten by a follow-up page.
            $nextPageResponse = $this->httpClient->fetch($nextPageUrl, false, $siteConfig->http_header);

            // make sure mime type is not something with a different action associated
            $nextPageMimeInfo = $this->getMimeActionInfo($nextPageResponse['headers']);
            if (isset($nextPageMimeInfo['action'])) {
                $this->logger->log('debug', 'MIME type requires different action');
                $multiPageContent = array();
                break;
            }

            $nextPageHtml = $this->convert2Utf8($nextPageResponse['body'], $nextPageResponse['headers']);
            $extractSuccess = $this->extractor->process($nextPageHtml, $nextPageUrl);
            if (!$extractSuccess) {
                $this->logger->log('debug', 'Failed to extract content');
                $multiPageContent = array();
                break;
            }

            $multiPageContent[] = $this->extractor->getContent();
        }

        // did we successfully deal with this multi-page article?
        if (empty($multiPageContent)) {
            $this->logger->log('debug', 'Failed to extract all parts of multi-page article, so not going to include them');
            $page = $readability->dom->createElement('p');
            $page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
            $multiPageContent[] = $page;
        }

        // each page may belong to another document, so import it before appending
        foreach ($multiPageContent as $page) {
            $page = $contentBlock->ownerDocument->importNode($page, true);
            $contentBlock->appendChild($page);
        }

        unset($multiPageUrls, $multiPageContent, $nextPageUrl, $nextPageResponse, $nextPageMimeInfo, $nextPageHtml, $page);
    }

    // the detected mime might be missing, so never read the key blindly
    $contentType = isset($mimeInfo['mime']) ? $mimeInfo['mime'] : '';

    // if we failed to extract content...
    if (!$extractResult || null === $contentBlock) {
        return array(
            'status' => $response['status'],
            'html' => $this->config['error_message'],
            'title' => $extractedTitle ?: $this->config['error_message_title'],
            'language' => $extractedLanguage,
            'url' => $effectiveUrl,
            'content_type' => $contentType,
            'open_graph' => $ogData,
        );
    }

    $readability->clean($contentBlock, 'select');

    if ($this->config['rewrite_relative_urls']) {
        $this->makeAbsolute($effectiveUrl, $contentBlock);
    }

    // footnotes
    if ($this->config['content_links'] == 'footnotes' && strpos($effectiveUrl, 'wikipedia.org') === false) {
        $readability->addFootnotes($contentBlock);
    }

    // normalise
    $contentBlock->normalize();

    // Remove empty text nodes.
    // `childNodes` is a live list: removing a node while iterating over it makes
    // the iteration skip nodes or stop early, so iterate over a snapshot instead.
    foreach (iterator_to_array($contentBlock->childNodes) as $n) {
        if ($n->nodeType === XML_TEXT_NODE && trim($n->textContent) == '') {
            $contentBlock->removeChild($n);
        }
    }

    // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
    while ($contentBlock->childNodes->length == 1 && $contentBlock->firstChild->nodeType === XML_ELEMENT_NODE) {
        // only follow these tag names
        if (!in_array(strtolower($contentBlock->tagName), array('div', 'article', 'section', 'header', 'footer'))) {
            break;
        }

        $contentBlock = $contentBlock->firstChild;
    }

    // convert content block to HTML string
    // Need to preserve things like body: //img[@id='feature']
    if (in_array(strtolower($contentBlock->tagName), array('div', 'article', 'section', 'header', 'footer', 'li', 'td'))) {
        // wrapper-like elements: only keep what is inside
        $html = $contentBlock->innerHTML;
    } else {
        // essentially outerHTML
        $html = $contentBlock->ownerDocument->saveXML($contentBlock);
    }

    unset($contentBlock);

    // post-processing cleanup: drop paragraphs containing only whitespace
    $html = preg_replace('!<p>[\\s\\h\\v]*</p>!u', '', $html);

    if ($this->config['content_links'] == 'remove') {
        // Strip <a ...> and </a> only. The lookahead requires the tag name to end
        // right after the "a", so <abbr>, <article>, <aside>, <audio>... are kept.
        $html = preg_replace('!</?a(?=[\s/>])[^>]*>!', '', $html);
    }

    $this->logger->log('debug', 'Returning data (most interesting ones): {data}', array('data' => array(
        'title' => $extractedTitle,
        'language' => $extractedLanguage,
        'url' => $effectiveUrl,
        'content_type' => $contentType,
    )));

    return array(
        'status' => $response['status'],
        'html' => trim($html),
        'title' => $extractedTitle ?: $this->config['error_message_title'],
        'language' => $extractedLanguage,
        'url' => $effectiveUrl,
        'content_type' => $contentType,
        'open_graph' => $ogData,
    );
}