Pimcore\Document\Adapter\Ghostscript::getText PHP Method

getText() public method

public getText ( null $page = null, null $path = null ) : boolean | string
$page null
$path null
return boolean | string
    /**
     * Extracts the text of a document, either for a single page or for the whole file.
     *
     * poppler's pdftotext is tried first; if that call throws, the text is extracted
     * with ghostscript's txtwrite device through a temporary file.
     *
     * @param int|null $page page to extract; a falsy value extracts the whole document
     * @param string|null $path file to read; when empty, $this->path is used
     * @return bool|string the extracted text, or false when the extraction failed
     */
    public function getText($page = null, $path = null)
    {
        try {
            $path = $path ? $this->preparePath($path) : $this->path;
            // the page number is embedded into shell command lines below, so force it to an integer
            $page = (int) $page;

            try {
                // first try to use poppler's pdftotext, because this produces more accurate results than the txtwrite device from ghostscript
                $pageRange = $page ? "-f " . $page . " -l " . $page . " " : "";
                $text = Console::exec(self::getPdftotextCli() . " " . $pageRange . escapeshellarg($path) . " -", null, 120);
            } catch (\Exception $e) {
                // pure ghostscript way
                $pageRange = $page ? "-dFirstPage=" . $page . " -dLastPage=" . $page . " " : "";
                $textFile = PIMCORE_SYSTEM_TEMP_DIRECTORY . "/pdf-text-extract-" . uniqid() . ".txt";
                Console::exec(self::getGhostscriptCli() . " -dBATCH -dNOPAUSE -sDEVICE=txtwrite " . $pageRange . "-dTextFormat=2 -sOutputFile=" . escapeshellarg($textFile) . " " . escapeshellarg($path), null, 120);

                if (!is_file($textFile)) {
                    // without an output file there is nothing to return; the outer catch logs this and returns false
                    throw new \Exception("Ghostscript did not produce a text file for: " . $path);
                }

                $text = file_get_contents($textFile);
                unlink($textFile);
                if ($text === false) {
                    throw new \Exception("Unable to read the extracted text from: " . $textFile);
                }

                // this is a little bit strange the default option -dTextFormat=3 from ghostscript should return utf-8 but it doesn't
                // so we use option 2 which returns UCS-2LE and convert it here back to UTF-8 which works fine
                $text = mb_convert_encoding($text, 'UTF-8', 'UCS-2LE');
            }

            return $text;
        } catch (\Exception $e) {
            Logger::error($e);

            return false;
        }
    }

Usage Example

Example #1
0
 /**
  * Extracts the text of a document.
  *
  * When a page is requested, or the parent adapter supports the file type, the
  * document is converted to PDF and handed to the parent implementation.
  * Otherwise the whole document is exported with LibreOffice's text export;
  * if that export produces no file, the PDF route is used as a fallback.
  *
  * @param int|null $page page to extract; a falsy value means the whole document
  * @param string|null $path file to read; when empty, $this->path is used
  * @return bool|string the extracted text (an empty string when the file has no
  *                     extension), or whatever the parent implementation returns
  * @throws \Exception
  */
 public function getText($page = null, $path = null)
 {
     $path = $path ? $this->preparePath($path) : $this->path;

     if ($page || parent::isFileTypeSupported($path)) {
         // for per page extraction we have to convert the document to PDF and extract the text via ghostscript
         return parent::getText($page, $this->getPdf($path));
     }

     if (!File::getFileExtension($path)) {
         // default empty string
         return "";
     }

     // if we want to get the text of the whole document, we can use libreoffices text export feature
     $cmd = self::getLibreOfficeCli() . " --headless --nologo --nofirststartwizard --norestore --convert-to txt:Text --outdir " . escapeshellarg(PIMCORE_TEMPORARY_DIRECTORY) . " " . escapeshellarg($path);
     $out = Console::exec($cmd, null, 240);
     Logger::debug("LibreOffice Output was: " . $out);

     // the export is expected at <outdir>/<source name without extension>.txt; pathinfo() derives
     // that name without building a regex from the (unquoted) file extension
     $tmpName = PIMCORE_TEMPORARY_DIRECTORY . "/" . pathinfo($path, PATHINFO_FILENAME) . ".txt";

     if (!file_exists($tmpName)) {
         $message = "Couldn't convert document to text: " . $path . " with the command: '" . $cmd . "' - now trying to get the text out of the PDF ...";
         Logger::error($message);

         return parent::getText(null, $this->getPdf($path));
     }

     $text = file_get_contents($tmpName);
     $text = \Pimcore\Tool\Text::convertToUTF8($text);
     unlink($tmpName);

     return $text;
 }