/**
 * Process the current URL and follow "next page" links, concatenating the content.
 *
 * Resets the html, encoding and content properties, calls download() and
 * prepareHtml(), then appends the output of the parser returned by
 * getParser() to the content gathered so far. When the parser reports a
 * next-page link, the link is resolved against the current URL, set as the
 * new URL, and the method calls itself with the accumulated content. This
 * stops when there is no further link or when the recursion limit is
 * reached; the limit is read from the config (getMaxRecursions()) and
 * falls back to 25 when the config returns null.
 *
 * The final result is stored in $this->content; nothing is returned.
 *
 * @param string $pageContent    Content accumulated from previously processed pages
 * @param int    $recursionDepth Number of next-page links followed so far
 */
public function execute($pageContent = '', $recursionDepth = 0)
{
    $this->html = '';
    $this->encoding = '';
    $this->content = '';

    $this->download();
    $this->prepareHtml();

    $parser = $this->getParser();

    if ($parser === null) {
        // The content property was reset above. When this happens on a
        // follow-up page, keep what the earlier pages produced instead of
        // silently discarding it. On the first page there is nothing to keep.
        if ($recursionDepth > 0) {
            $this->content = $pageContent;
            Logger::setMessage(get_called_class() . ': Content length: ' . strlen($this->content) . ' bytes');
        }

        return;
    }

    $maxRecursions = $this->config->getMaxRecursions();
    if ($maxRecursions === null) {
        $maxRecursions = 25;
    }

    $pageContent .= $parser->execute();

    $nextLink = $parser->findNextLink();

    if ($nextLink !== null && $recursionDepth < $maxRecursions) {
        $this->setUrl(Url::resolve($nextLink, $this->url));

        // The innermost call stores and logs the final content, so this
        // level returns without logging again while the stack unwinds.
        $this->execute($pageContent, $recursionDepth + 1);

        return;
    }

    $this->content = $pageContent;
    Logger::setMessage(get_called_class() . ': Content length: ' . strlen($this->content) . ' bytes');
}
/**
 * Runs the scraper against two live penny-arcade.com pages and checks the
 * relevant content it reports for each of them.
 *
 * NOTE(review): the method name suggests these pages are handled by
 * regex-based rules; the rule definitions are not visible here - confirm.
 *
 * @group online
 */
public function testGrabContentRegex()
{
    $comicUrl = 'http://penny-arcade.com/comic/2015/04/13/101-part-one';
    $newsUrl = 'http://penny-arcade.com/news/post/2015/04/15/101-part-two';
    $expectedComicMarkup = '<img src="http://art.penny-arcade.com/photos/i-tBMHkzG/0/1050x10000/i-tBMHkzG-1050x10000.jpg" alt="101, Part One"/>';

    $scraper = new Scraper(new Config());

    // First page: the relevant content must equal the expected image tag
    $scraper->setUrl($comicUrl);
    $scraper->execute();

    $this->assertTrue($scraper->hasRelevantContent());
    $this->assertEquals($expectedComicMarkup, $scraper->getRelevantContent());

    // Second page, same scraper instance: only the title needs to show up
    $scraper->setUrl($newsUrl);
    $scraper->execute();

    $this->assertTrue($scraper->hasRelevantContent());
    $this->assertContains('101, Part Two', $scraper->getRelevantContent());
}