Graby\Extractor\HttpClient::fetch PHP Method

fetch() public method

Grab information from a URL: the final URL (after any redirection), the raw content, and the Content-Type header.
public fetch ( string $url, boolean $skipTypeVerification = false, array $httpHeader = [] ) : array
$url string
$skipTypeVerification boolean Skip MIME type detection, i.e. force a GET request instead of a potential HEAD request
$httpHeader array Custom HTTP Headers from SiteConfig
return array With keys effective_url, body, headers & status
    /**
     * Request a URL and hand the outcome to sendResults().
     *
     * The payload passed to sendResults() always carries four keys:
     * effective_url, body, headers (the Content-Type header as a string)
     * and status.
     *
     * Special outcomes built here:
     *  - status 310 with an empty body when checkNumberRedirects() returns false;
     *  - the status code of the failed response when the client throws a
     *    RequestException that carries a response;
     *  - status 500 when the RequestException carries no response.
     *
     * The method calls itself again (with type verification skipped) when a
     * HEAD request turns out not to match a "header only" type, and when a
     * meta-refresh or "ugly" AJAX URL is found for the fetched page.
     *
     * @param string $url                  URL to request
     * @param bool   $skipTypeVerification When true, always issue a GET request
     * @param array  $httpHeader           Custom headers, forwarded to getUserAgent() and getReferer()
     *
     * @return mixed Whatever sendResults() returns for the payload described above
     */
    public function fetch($url, $skipTypeVerification = false, $httpHeader = array())
    {
        // Give up before doing any work when the redirect check fails.
        if (false === $this->checkNumberRedirects($url)) {
            return $this->sendResults([
                'effective_url' => self::$initialUrl,
                'body' => '',
                'headers' => '',
                'status' => 310,
            ]);
        }

        $url = $this->cleanupUrl($url);

        // HEAD is only attempted when type verification is allowed, some
        // "header only" types are configured and the URL looks like one of them.
        $useHead = !$skipTypeVerification
            && !empty($this->config['header_only_types'])
            && $this->possibleUnsupportedType($url);
        $method = $useHead ? 'head' : 'get';

        $this->logger->log('debug', 'Trying using method "{method}" on url "{url}"', ['method' => $method, 'url' => $url]);

        try {
            $response = $this->client->{$method}($url, [
                'headers' => [
                    'User-Agent' => $this->getUserAgent($url, $httpHeader),
                    'Referer' => $this->getReferer($url, $httpHeader),
                ],
                'timeout' => $this->config['timeout'],
                'connect_timeout' => $this->config['timeout'],
            ]);
        } catch (RequestException $e) {
            if ($e->hasResponse()) {
                // The server answered: report what it sent, without a body.
                $failedResponse = $e->getResponse();
                $data = [
                    'effective_url' => $failedResponse->getEffectiveUrl(),
                    'body' => '',
                    'headers' => (string) $failedResponse->getHeader('Content-Type'),
                    'status' => $failedResponse->getStatusCode(),
                ];
                $errorLog = 'Request throw exception (with a response): {error_message}';
            } else {
                // No response at all: fall back to the requested URL and a 500.
                $data = [
                    'effective_url' => $url,
                    'body' => '',
                    'headers' => '',
                    'status' => 500,
                ];
                $errorLog = 'Request throw exception (with no response): {error_message}';
            }

            $this->logger->log('debug', $errorLog, ['error_message' => $e->getMessage()]);
            $this->logger->log('debug', 'Data fetched: {data}', ['data' => $data]);

            return $this->sendResults($data);
        }

        $finalUrl = $response->getEffectiveUrl();
        $mimeHeader = (string) $response->getHeader('Content-Type');

        // A HEAD request was issued on the assumption that the content type
        // would be a "header only" one. It is not, so run the fetch again
        // with type verification skipped, which forces a GET.
        if ('head' === $method && !$this->headerOnlyType($mimeHeader)) {
            return $this->fetch($finalUrl, true, $httpHeader);
        }

        $html = (string) $response->getBody();

        // Strip IE conditional comments located before the <head> tag. They
        // usually wrap several <html> variants; the last <html> tag found in
        // the matched span replaces the whole span.
        preg_match('/^\\<!--\\[if(\\X+)\\<!\\[endif\\]--\\>(\\X+)\\<head\\>$/mi', $html, $conditional);
        if (count($conditional) > 1) {
            preg_match_all('/\\<html([\\sa-z0-9\\=\\"\\"\\-:\\/\\.\\#]+)\\>$/mi', $conditional[0], $htmlTags);
            $lastHtmlTag = count($htmlTags) > 1 ? end($htmlTags[0]) : false;
            if (!empty($lastHtmlTag)) {
                $html = str_replace($conditional[0], $lastHtmlTag . '<head>', $html);
            }
        }

        // Look for a meta refresh, or for <meta name='fragment' content='!'/>
        // as used by AJAX sites (e.g. Blogger dynamic views), following
        // Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
        // Skipped when the URL already contains the escaped fragment marker.
        if (false === strpos($finalUrl, '_escaped_fragment_')) {
            $redirectURL = $this->getMetaRefreshURL($finalUrl, $html);
            if (!$redirectURL) {
                $redirectURL = $this->getUglyURL($finalUrl, $html);
            }

            if (false !== $redirectURL) {
                return $this->fetch($redirectURL, true, $httpHeader);
            }
        }

        // Drop utm_* parameters and fragments from the decoded final URL.
        $finalUrl = preg_replace('/((\\?)?(&(amp;)?)?utm_(.*?)\\=[^&]+)|(#(.*?)\\=[^&]+)/', '', urldecode($finalUrl));

        // The body itself is not logged, only its length.
        $this->logger->log('debug', 'Data fetched: {data}', [
            'data' => [
                'effective_url' => $finalUrl,
                'body' => '(only length for debug): ' . strlen($html),
                'headers' => $mimeHeader,
                'status' => $response->getStatusCode(),
            ],
        ]);

        return $this->sendResults([
            'effective_url' => $finalUrl,
            'body' => $html,
            'headers' => $mimeHeader,
            'status' => $response->getStatusCode(),
        ]);
    }

Usage Example

Example #1
0
 /**
  * The mocked client throws a RequestException built with only a message and
  * a request. fetch() is then expected to report the requested URL, an empty
  * body, empty headers and status 500.
  */
 public function testWith404ResponseWithoutResponse()
 {
     $requestMock = $this->getMockBuilder('GuzzleHttp\\Message\\Request')
         ->disableOriginalConstructor()
         ->getMock();

     // Created as in the original test; nothing below reads it.
     $responseMock = $this->getMockBuilder('GuzzleHttp\\Message\\Response')
         ->disableOriginalConstructor()
         ->getMock();

     $clientMock = $this->getMockBuilder('GuzzleHttp\\Client')
         ->disableOriginalConstructor()
         ->getMock();

     // Exactly one GET is expected, and it fails.
     $clientMock->expects($this->once())
         ->method('get')
         ->willThrowException(new RequestException('oops', $requestMock));

     $fetcher = new HttpClient($clientMock);
     $result = $fetcher->fetch('http://0.0.0.0');

     $this->assertEquals('http://0.0.0.0', $result['effective_url']);
     $this->assertEquals('', $result['body']);
     $this->assertEquals('', $result['headers']);
     $this->assertEquals(500, $result['status']);
 }