/**
 * Look for a "single page" version of the current document and fetch it.
 *
 * The site config built for $url may list XPath expressions under
 * "single_page_link". They are evaluated in order against the DOM built from
 * $html; the first expression producing a non-empty value wins. That value
 * is resolved against $url via makeAbsoluteStr() and, when it differs from
 * $url, fetched through the HTTP client.
 *
 * @param string $html Markup of the page that was loaded from $url (presumably a string; it is handed to Readability as-is)
 * @param string $url  Address the markup came from; used for the config lookup and as base for relative links
 *
 * @return array|false Whatever $this->httpClient->fetch() returned (read here through its 'status' key)
 *                     when that status is below 300; false in every other case
 */
private function getSinglePage($html, $url)
{
    $this->logger->log('debug', 'Looking for site config files to see if single page link exists');
    $siteConfig = $this->configBuilder->buildFromUrl($url);

    // Without any "single_page_link" expression there is nothing to look for.
    if (empty($siteConfig->single_page_link)) {
        $this->logger->log('debug', 'No "single_page_link" config found');

        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new \DOMXPath($readability->dom);

    // Walk the configured expressions until one yields a usable link.
    $singlePageUrl = null;

    foreach ($siteConfig->single_page_link as $pattern) {
        $result = $xpath->evaluate($pattern, $readability->dom);

        // Normalise the result into a flat list of raw string candidates.
        $candidates = [];

        if (is_string($result)) {
            // Expression evaluated to a typed (string) result, e.g. string(//a/@href)
            $candidates[] = $result;
        } elseif ($result instanceof \DOMNodeList) {
            foreach ($result as $node) {
                if ($node instanceof \DOMElement && $node->hasAttribute('href')) {
                    $candidates[] = $node->getAttribute('href');
                } elseif ($node instanceof \DOMAttr) {
                    $candidates[] = $node->value;
                }
            }
        }
        // Any other result type (boolean, number, failed evaluation) carries no link.

        foreach ($candidates as $candidate) {
            // Links in markup may be padded with whitespace; an empty value is
            // not a link at all, so keep searching (also in the following
            // expressions) instead of giving up on the first blank match.
            $candidate = trim($candidate);

            if ('' !== $candidate) {
                $singlePageUrl = $candidate;

                break 2;
            }
        }
    }

    if (null === $singlePageUrl) {
        $this->logger->log('debug', 'No url found');

        return false;
    }

    // try to resolve against $url
    $singlePageUrl = $this->makeAbsoluteStr($url, $singlePageUrl);

    // Only fetch when the link could be resolved and points somewhere else than
    // the page we already have. Compared strictly as strings so that PHP's
    // numeric-string juggling can never make two different values look equal.
    if (false !== $singlePageUrl && (string) $singlePageUrl !== (string) $url) {
        $response = $this->httpClient->fetch($singlePageUrl, false, $siteConfig->http_header);

        if ($response['status'] < 300) {
            $this->logger->log('debug', 'Single page content found with url', ['url' => $singlePageUrl]);

            return $response;
        }
    }

    $this->logger->log('debug', 'No content found with url', ['url' => $singlePageUrl]);

    return false;
}