/**
 * Crawls every configured frontend seed URL and, if a tmp index directory
 * exists afterwards, replaces the live frontend index with it.
 *
 * The tmp index path is derived from the live index path by replacing
 * "/index" with "/tmpindex". NOTE(review): presumably the Parser writes into
 * that same tmp directory - this is not visible here, confirm against Parser.
 *
 * @return bool|null FALSE when the crawl was not started (crawler already
 *                   flagged as running, or no tmp index path could be derived
 *                   from the live index path); otherwise nothing is returned.
 *
 * @throws \Exception any exception raised outside the per-seed crawl loop is
 *                    logged and re-thrown.
 */
public static function runCrawler()
{
    if (Configuration::getCoreSetting('running') === TRUE) {
        return FALSE;
    }

    $indexDir = \LuceneSearch\Plugin::getFrontendSearchIndex();
    if (!$indexDir) {
        return;
    }

    // Derive the tmp index location exactly once so that the cleanup before
    // the crawl and the swap after it always refer to the same directory.
    $tmpIndex = str_replace('/index', '/tmpindex', $indexDir);

    if ($tmpIndex === $indexDir) {
        // No "/index" segment was found: the "tmp" path would be the live
        // index itself, and the cleanup below would wipe it. Refuse to run.
        \Pimcore\Logger::error('LuceneSearch: could not derive a tmp index path from "' . $indexDir . '". Crawl aborted.');

        return FALSE;
    }

    // rtrim() only removes trailing slashes that are really there, so a path
    // configured without a trailing slash is no longer truncated.
    $tmpIndexPath = rtrim($tmpIndex, '/');
    $indexPath = rtrim($indexDir, '/');

    // Start from a clean tmp index.
    exec('rm -Rf ' . escapeshellarg($tmpIndexPath));
    \Pimcore\Logger::debug('LuceneSearch: rm -Rf ' . $tmpIndexPath);
    \Pimcore\Logger::debug('LuceneSearch: Starting crawl');

    try {
        $urls = Configuration::get('frontend.urls');
        $invalidLinkRegexesSystem = Configuration::get('frontend.invalidLinkRegexes');
        $invalidLinkRegexesEditable = Configuration::get('frontend.invalidLinkRegexesEditable');

        // Editable regexes are used as they are; the system value is appended
        // as one additional entry when it is set.
        $invalidLinkRegexes = !empty($invalidLinkRegexesEditable) ? $invalidLinkRegexesEditable : array();
        if (!empty($invalidLinkRegexesSystem)) {
            $invalidLinkRegexes = array_merge($invalidLinkRegexes, array($invalidLinkRegexesSystem));
        }

        self::setCrawlerState('frontend', 'started', TRUE);

        try {
            foreach ($urls as $seed) {
                $parser = new Parser();
                $parser
                    ->setDepth(Configuration::get('frontend.crawler.maxLinkDepth'))
                    ->setValidLinkRegexes(Configuration::get('frontend.validLinkRegexes'))
                    ->setInvalidLinkRegexes($invalidLinkRegexes)
                    ->setSearchStartIndicator(Configuration::get('frontend.crawler.contentStartIndicator'))
                    ->setSearchEndIndicator(Configuration::get('frontend.crawler.contentEndIndicator'))
                    ->setSearchExcludeStartIndicator(Configuration::get('frontend.crawler.contentExcludeStartIndicator'))
                    ->setSearchExcludeEndIndicator(Configuration::get('frontend.crawler.contentExcludeEndIndicator'))
                    ->setAllowSubdomain(FALSE)
                    ->setAllowedSchemes(Configuration::get('frontend.allowedSchemes'))
                    ->setDownloadLimit(Configuration::get('frontend.crawler.maxDownloadLimit'))
                    ->setSeed($seed);

                if (Configuration::get('frontend.auth.useAuth') === TRUE) {
                    $parser->setAuth(
                        Configuration::get('frontend.auth.username'),
                        Configuration::get('frontend.auth.password')
                    );
                }

                $parser->startParser();
                $parser->optimizeIndex();
            }
        } catch (\Exception $e) {
            // A failing crawl stays best-effort (the crawler state is still
            // reset and whatever tmp index exists is still promoted), but the
            // failure must be visible in the log instead of vanishing.
            \Pimcore\Logger::error('LuceneSearch: crawl aborted by exception: ' . $e->getMessage());
            \Pimcore\Logger::error($e);
        }

        self::setCrawlerState('frontend', 'finished', FALSE);

        // Only remove the live index if a tmp index exists to replace it.
        if (is_dir($tmpIndex)) {
            exec('rm -Rf ' . escapeshellarg($indexDir));
            \Pimcore\Logger::debug('LuceneSearch: rm -Rf ' . $indexDir);

            $copyOutput = array();
            $copyExitCode = 0;
            exec('cp -R ' . escapeshellarg($tmpIndexPath) . ' ' . escapeshellarg($indexPath), $copyOutput, $copyExitCode);
            \Pimcore\Logger::debug('LuceneSearch: cp -R ' . $tmpIndexPath . ' ' . $indexPath);

            if ($copyExitCode === 0) {
                \Pimcore\Logger::debug('LuceneSearch: replaced old index');
                \Pimcore\Logger::info('LuceneSearch: Finished crawl');
            } else {
                // The live index has already been removed at this point, so a
                // failed copy must not be reported as a successful swap.
                \Pimcore\Logger::error('LuceneSearch: replacing index failed, cp exited with code ' . $copyExitCode);
            }
        } else {
            \Pimcore\Logger::error('LuceneSearch: skipped index replacing. no tmp index found.');
        }
    } catch (\Exception $e) {
        \Pimcore\Logger::error($e);
        throw $e;
    }
}