Graby\Extractor\ContentExtractor::process PHP Method

ContentExtractor Class Documentation Usage Examples Of Graby\Extractor\ContentExtractor::process Datei anzeigen Open project: j0k3r/graby

process() public method

Tidy helps us deal with PHP's patchy HTML parsing most of the time but it has problems of its own which we try to avoid with this option.

public process ( string $html, string $url, SiteConfig $siteConfig = null, boolean $smartTidy = true ) : boolean
$html	string
$url	string
$siteConfig	Graby\SiteConfig\SiteConfig	Will avoid to recalculate the site config
$smartTidy	boolean	Do we need to tidy the html ?
return	boolean	true on success, false on failure

    public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy = true)
    {
        $this->reset();
        $this->siteConfig = $siteConfig;
        if (null === $this->siteConfig) {
            $this->siteConfig = $this->buildSiteConfig($url, $html);
        }
        // do string replacements
        if (!empty($this->siteConfig->find_string)) {
            if (count($this->siteConfig->find_string) == count($this->siteConfig->replace_string)) {
                $html = str_replace($this->siteConfig->find_string, $this->siteConfig->replace_string, $html, $count);
                $this->logger->log('debug', 'Strings replaced: {count} (find_string and/or replace_string)', array('count' => $count));
            } else {
                $this->logger->log('debug', 'Skipped string replacement - incorrect number of find-replace strings in site config');
            }
            unset($count);
        }
        // load and parse html
        $parser = $this->siteConfig->parser();
        if (!in_array($parser, $this->config['allowed_parsers'])) {
            $this->logger->log('debug', 'HTML parser {parser} not listed, using {default_parser} instead', array('parser' => $parser, 'default_parser' => $this->config['default_parser']));
            $parser = $this->config['default_parser'];
        }
        $this->logger->log('debug', 'Attempting to parse HTML with {parser}', array('parser' => $parser));
        $this->readability = new Readability($html, $url, $parser, $this->siteConfig->tidy() && $smartTidy);
        $tidied = $this->readability->tidied;
        // we use xpath to find elements in the given HTML document
        $this->xpath = new \DOMXPath($this->readability->dom);
        // try to get next page link
        // @todo: should we test if the link is actually a link?
        foreach ($this->siteConfig->next_page_link as $pattern) {
            $elems = $this->xpath->evaluate($pattern, $this->readability->dom);
            if (is_string($elems)) {
                $this->nextPageUrl = trim($elems);
                break;
            } elseif ($elems instanceof \DOMNodeList && $elems->length > 0) {
                foreach ($elems as $item) {
                    if ($item instanceof \DOMElement && $item->hasAttribute('href')) {
                        $this->nextPageUrl = $item->getAttribute('href');
                        break 2;
                    } elseif ($item instanceof \DOMAttr && $item->value) {
                        $this->nextPageUrl = $item->value;
                        break 2;
                    }
                }
            }
        }
        // try to get title
        foreach ($this->siteConfig->title as $pattern) {
            $this->logger->log('debug', 'Trying {pattern} for title', array('pattern' => $pattern));
            $elems = $this->xpath->evaluate($pattern, $this->readability->dom);
            if (is_string($elems)) {
                $this->title = trim($elems);
                $this->logger->log('debug', 'Title expression evaluated as string: {title}', array('title' => $this->title));
                $this->logger->log('debug', '...XPath match: {pattern}', array('pattern', $pattern));
                break;
            } elseif ($elems instanceof \DOMNodeList && $elems->length > 0) {
                $this->title = $elems->item(0)->textContent;
                $this->logger->log('debug', 'Title matched: {title}', array('title' => $this->title));
                $this->logger->log('debug', '...XPath match: {pattern}', array('pattern', $pattern));
                // remove title from document
                try {
                    $elems->item(0)->parentNode->removeChild($elems->item(0));
                } catch (\DOMException $e) {
                    // do nothing
                }
                break;
            }
        }
        // try to get language
        $langXpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
        foreach ($langXpath as $pattern) {
            $this->logger->log('debug', 'Trying {pattern} for language', array('pattern' => $pattern));
            $elems = $this->xpath->evaluate($pattern, $this->readability->dom);
            if ($elems instanceof \DOMNodeList && $elems->length > 0) {
                foreach ($elems as $elem) {
                    $this->language = trim($elem->textContent);
                    $this->logger->log('debug', 'Language matched: {language}', array('language' => $this->language));
                }
                if (null !== $this->language) {
                    break;
                }
            }
        }
        // strip elements (using xpath expressions)
        foreach ($this->siteConfig->strip as $pattern) {
            $this->logger->log('debug', 'Trying {pattern} to strip element', array('pattern' => $pattern));
            $elems = $this->xpath->query($pattern, $this->readability->dom);
            $this->removeElements($elems, 'Stripping {length} elements (strip)');
        }
        // strip elements (using id and class attribute values)
        foreach ($this->siteConfig->strip_id_or_class as $string) {
            $this->logger->log('debug', 'Trying {string} to strip element', array('string' => $string));
            $string = strtr($string, array("'" => '', '"' => ''));
            $elems = $this->xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
            $this->removeElements($elems, 'Stripping {length} elements (strip_id_or_class)');
        }
        // strip images (using src attribute values)
        foreach ($this->siteConfig->strip_image_src as $string) {
            $string = strtr($string, array("'" => '', '"' => ''));
            foreach ($this->readability->dom->getElementsByTagName('img') as $e) {
                if (strpos($e->getAttribute('src'), $string)) {
                    $e->parentNode->removeChild($e);
                }
            }
        }
        // strip elements using Readability.com and Instapaper.com ignore class names
        // .entry-unrelated and .instapaper_ignore
        // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
        // and http://blog.instapaper.com/post/730281947
        $elems = $this->xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
        $this->removeElements($elems, 'Stripping {length} .entry-unrelated,.instapaper_ignore elements');
        // strip elements that contain style 'display: none' or 'visibility:hidden'
        // @todo: inline style are convert to <style> by tidy, so we can't remove hidden content ...
        $elems = $this->xpath->query("//*[contains(@style,'display:none') or contains(@style,'visibility:hidden')]", $this->readability->dom);
        $this->removeElements($elems, 'Stripping {length} elements with inline display:none or visibility:hidden style');
        // try to get body
        foreach ($this->siteConfig->body as $pattern) {
            $this->logger->log('debug', 'Trying {pattern} for body', array('pattern' => $pattern));
            $res = $this->extractBody(true, $pattern, $this->readability->dom, 'XPath');
            // this mean we have *found* a body, so we don't need to continue
            if (false === $res) {
                break;
            }
        }
        // auto detect?
        $detectTitle = $detectBody = false;
        // detect title?
        if (!isset($this->title) && (empty($this->siteConfig->title) || $this->siteConfig->autodetect_on_failure())) {
            $detectTitle = true;
        }
        // detect body?
        if (!isset($this->body) && (empty($this->siteConfig->body) || $this->siteConfig->autodetect_on_failure())) {
            $detectBody = true;
        }
        // check for hNews
        if ($detectTitle || $detectBody) {
            // check for hentry
            $elems = $this->xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
            if ($this->hasElements($elems)) {
                $this->logger->log('debug', 'hNews: found hentry');
                $hentry = $elems->item(0);
                // check for entry-title
                $detectTitle = $this->extractTitle($detectTitle, 'entry-title', $hentry, 'hNews: found entry-title: {title}');
                // check for entry-content.
                // according to hAtom spec, if there are multiple elements marked entry-content,
                // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
                $detectBody = $this->extractBody($detectBody, ".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry, 'hNews');
            }
        }
        // check for elements marked with instapaper_title
        $detectTitle = $this->extractTitle($detectTitle, 'instapaper_title', $this->readability->dom, 'Title found (.instapaper_title): {title}');
        // check for elements marked with instapaper_body
        $detectBody = $this->extractBody($detectBody, "//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom, 'instapaper');
        // check for elements marked with itemprop="articleBody" (from Schema.org)
        $detectBody = $this->extractBody($detectBody, "//*[@itemprop='articleBody']", $this->readability->dom, 'Schema.org');
        // still missing title or body, so we detect using Readability
        $success = false;
        if ($detectTitle || $detectBody) {
            $this->logger->log('debug', 'Using Readability');
            // clone body if we're only using Readability for title (otherwise it may interfere with body element)
            if (isset($this->body)) {
                $this->body = $this->body->cloneNode(true);
            }
            $success = $this->readability->init();
        }
        if ($detectTitle && $this->readability->getTitle()) {
            $this->title = $this->readability->getTitle()->textContent;
            $this->logger->log('debug', 'Detected title: {title}', array('title' => $this->title));
        }
        if ($detectBody && $success) {
            $this->logger->log('debug', 'Detecting body');
            $this->body = $this->readability->getContent();
            if ($this->body->childNodes->length === 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
                $this->body = $this->body->firstChild;
            }
            // prune (clean up elements that may not be content)
            if ($this->siteConfig->prune()) {
                $this->logger->log('debug', 'Pruning content');
                $this->readability->prepArticle($this->body);
            }
        }
        if (isset($this->body)) {
            // remove any h1-h6 elements that appear as first thing in the body
            // and which match our title
            if (isset($this->title) && $this->title != '' && null !== $this->body->firstChild) {
                $firstChild = $this->body->firstChild;
                while ($firstChild->nextSibling != null && $firstChild->nodeType && $firstChild->nodeType !== XML_ELEMENT_NODE) {
                    $firstChild = $firstChild->nextSibling;
                }
                if ($firstChild->nodeType === XML_ELEMENT_NODE && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))) {
                    $this->body->removeChild($firstChild);
                }
            }
            // prevent self-closing iframes
            foreach ($this->body->getElementsByTagName('iframe') as $e) {
                if (!$e->hasChildNodes()) {
                    $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
                }
            }
            // prevent self-closing iframe when content is ONLY an iframe
            if ('iframe' === $this->body->nodeName && !$this->body->hasChildNodes()) {
                $this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
            }
            // remove image lazy loading
            foreach ($this->body->getElementsByTagName('img') as $e) {
                if (!$e->hasAttribute('data-lazy-src') && !$e->hasAttribute('data-src') && !$e->hasAttribute('data-original') && !$e->hasAttribute('data-sources')) {
                    continue;
                }
                // Custom case for WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
                // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
                // inside the data-lazy-src attribute. It also places the original image inside a noscript element
                // next to the amended one.
                // @see https://plugins.trac.wordpress.org/browser/lazy-load/trunk/lazy-load.php
                if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
                    $newElem = $e->ownerDocument->createDocumentFragment();
                    $newElem->appendXML($e->nextSibling->innerHTML);
                    $e->nextSibling->parentNode->replaceChild($newElem, $e->nextSibling);
                    $e->parentNode->removeChild($e);
                    continue;
                }
                $src = $e->getAttribute('data-src');
                $e->removeAttribute('data-src');
                if ($e->hasAttribute('data-lazy-src')) {
                    $src = $e->getAttribute('data-lazy-src');
                    $e->removeAttribute('data-lazy-src');
                }
                if ($e->hasAttribute('data-original')) {
                    $src = $e->getAttribute('data-original');
                    $e->removeAttribute('data-original');
                }
                if ($e->hasAttribute('data-sources')) {
                    $src = $e->getAttribute('data-sources');
                    $e->removeAttribute('data-sources');
                }
                $e->setAttribute('src', $src);
            }
            $this->success = true;
        }
        // if we've had no success and we've used tidy, there's a chance
        // that tidy has messed up. So let's try again without tidy...
        if (!$this->success && $tidied && $smartTidy) {
            unset($this->body, $this->xpath);
            $this->logger->log('debug', 'Trying again without tidy');
            return $this->process($this->readability->original_html, $url, $this->siteConfig, false);
        }
        $this->logger->log('debug', 'Success ? {is_success}', array('is_success' => $this->success));
        return $this->success;
    }

Usage Example

Beispiel #1

Datei anzeigen

Datei: ContentExtractorTest.php Projekt: harikt/graby

 public function testIframeEmbeddedContent()
 {
     $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);
     $config = new SiteConfig();
     // '//header' is a bad pattern, and it will jump to the next one
     $config->body = array('//header', '//div');
     // obviously a bad parser which will be converted to use the default one
     $config->parser = 'toto';
     $res = $contentExtractor->process('<div>' . str_repeat('this is the best part of the show', 10) . '</div><div class="video_player"><iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320"></iframe></div>', 'https://lemonde.io/35941909', $config);
     $this->assertTrue($res, 'Extraction went well');
     $domElement = $contentExtractor->getContent();
     $content = $domElement->ownerDocument->saveXML($domElement);
     $this->assertContains('<iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320">[embedded content]</iframe>', $content);
 }

ContentExtractor

__construct

buildSiteConfig

extractBody

extractTitle

findHostUsingFingerprints

getContent

getLanguage

getNextPageUrl

getSiteConfig

getTitle

hasElements

process

removeElements

reset

setLogger