/**
 * Extracts the plain text of the PDF, either for a single page or for the whole document.
 *
 * poppler's pdftotext is tried first; if that call throws, Ghostscript's txtwrite
 * device is used as a fallback.
 *
 * @param int|string|null $page page to extract; any falsy value extracts the whole document
 * @param string|null $path optional document path, passed through preparePath(); defaults to $this->path
 *
 * @return string|false the extracted text, or false if extraction failed (the error is logged)
 */
public function getText($page = null, $path = null)
{
    try {
        $path = $path ? $this->preparePath($path) : $this->path;

        // The page number ends up on a shell command line, so force it to a plain
        // positive integer instead of concatenating the caller's value verbatim.
        $pageNumber = null;
        if ($page) {
            $pageNumber = (int) $page;
            if ($pageNumber < 1) {
                // deliberately outside the inner try: an invalid page must not trigger the fallback
                throw new \InvalidArgumentException("Page must be a positive integer");
            }
        }

        try {
            // first try to use poppler's pdftotext, because this produces more accurate results
            // than the txtwrite device from ghostscript
            $pageRange = "";
            if ($pageNumber !== null) {
                $pageRange = "-f " . $pageNumber . " -l " . $pageNumber . " ";
            }
            // NOTE(review): the third argument (120) is presumably a timeout in seconds - confirm against Console::exec
            $text = Console::exec(self::getPdftotextCli() . " " . $pageRange . escapeshellarg($path) . " -", null, 120);
        } catch (\Exception $e) {
            // pure ghostscript way
            $pageRange = "";
            if ($pageNumber !== null) {
                $pageRange = "-dFirstPage=" . $pageNumber . " -dLastPage=" . $pageNumber . " ";
            }

            $textFile = PIMCORE_SYSTEM_TEMP_DIRECTORY . "/pdf-text-extract-" . uniqid() . ".txt";
            Console::exec(self::getGhostscriptCli() . " -dBATCH -dNOPAUSE -sDEVICE=txtwrite " . $pageRange . "-dTextFormat=2 -sOutputFile=" . escapeshellarg($textFile) . " " . escapeshellarg($path), null, 120);

            if (!is_file($textFile)) {
                // Without this guard $text would be undefined at the return below; raising here
                // lets the outer catch log the problem and return false like every other failure.
                throw new \Exception("Ghostscript did not create the text output file: " . $textFile);
            }

            $rawText = file_get_contents($textFile);
            // remove the temp file before inspecting the read result so it never lingers
            unlink($textFile);

            if ($rawText === false) {
                throw new \Exception("Unable to read the Ghostscript text output file: " . $textFile);
            }

            // The default option -dTextFormat=3 from ghostscript should return UTF-8 but it doesn't,
            // so option 2 (UCS-2LE output) is used and converted back to UTF-8 here.
            $text = mb_convert_encoding($rawText, 'UTF-8', 'UCS-2LE');
        }

        return $text;
    } catch (\Exception $e) {
        Logger::error($e);

        return false;
    }
}