diff --git a/composer.json b/composer.json index 48a4574..4177098 100644 --- a/composer.json +++ b/composer.json @@ -11,6 +11,6 @@ ], "require": { "thiagoalessio/tesseract_ocr": "2.4.0", - "spatie/pdf-to-image": "1.8.1" + "waarneembemiddeling/php-pdfimages": "dev-master" } } diff --git a/composer.lock b/composer.lock index 4359420..3c919f2 100644 --- a/composer.lock +++ b/composer.lock @@ -1,36 +1,172 @@ { "_readme": [ "This file locks the dependencies of your project to a known state", - "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "fca498052306e6cd5c7ca896c0da12d4", + "content-hash": "efb613a4550b62f45ce43ad5d5eb1f1d", "packages": [ { - "name": "spatie/pdf-to-image", - "version": "1.8.1", + "name": "alchemy/binary-driver", + "version": "1.6.0", "source": { "type": "git", - "url": "https://github.com/spatie/pdf-to-image.git", - "reference": "f33afc92ff7bff272fa6a9ba1cc335e96c57eb26" + "url": "https://github.com/alchemy-fr/BinaryDriver.git", + "reference": "80c6633890afb71d2417ae851d0ad167d8b00b95" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/spatie/pdf-to-image/zipball/f33afc92ff7bff272fa6a9ba1cc335e96c57eb26", - "reference": "f33afc92ff7bff272fa6a9ba1cc335e96c57eb26", + "url": "https://api.github.com/repos/alchemy-fr/BinaryDriver/zipball/80c6633890afb71d2417ae851d0ad167d8b00b95", + "reference": "80c6633890afb71d2417ae851d0ad167d8b00b95", "shasum": "" }, "require": { - "ext-imagick": "*", - "php": "^7.0" + "evenement/evenement": "^2.0|^1.0", + "monolog/monolog": "^1.3", + "php": ">=5.5", + "psr/log": "^1.0", + "symfony/process": "^2.0|^3.0" }, "require-dev": { - "phpunit/phpunit": "^6.2" + "phpunit/phpunit": "^4.0|^5.0" }, "type": "library", + "autoload": { + "psr-0": { + "Alchemy": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Romain Neutron", + "email": "imprec@gmail.com", + "homepage": "http://www.lickmychip.com/" + }, + { + "name": "Phraseanet Team", + "email": "info@alchemy.fr", + "homepage": "http://www.phraseanet.com/" + }, + { + "name": "Nicolas Le Goff", + "email": "legoff.n@gmail.com" + } + ], + "description": "A set of tools to build binary drivers", + "keywords": [ + "binary", + "driver" + ], + "time": "2016-03-02T13:49:15+00:00" + }, + { + "name": "evenement/evenement", + "version": "v2.1.0", + "source": { + "type": "git", + "url": "https://github.com/igorw/evenement.git", + "reference": "6ba9a777870ab49f417e703229d53931ed40fd7a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/igorw/evenement/zipball/6ba9a777870ab49f417e703229d53931ed40fd7a", + "reference": "6ba9a777870ab49f417e703229d53931ed40fd7a", + "shasum": "" + }, + "require": { + "php": ">=5.4.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0||^5.7||^4.8.35" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0-dev" + } + }, + "autoload": { + "psr-0": { + "Evenement": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Igor Wiedler", + "email": "igor@wiedler.ch" + } + ], + "description": "Événement is a very simple event dispatching library for PHP", + "keywords": [ + "event-dispatcher", + "event-emitter" + ], + "time": "2017-07-17T17:39:19+00:00" + }, + { + "name": "monolog/monolog", + "version": "1.25.1", + "source": { + "type": "git", + "url": "https://github.com/Seldaek/monolog.git", + "reference": "70e65a5470a42cfec1a7da00d30edb6e617e8dcf" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/70e65a5470a42cfec1a7da00d30edb6e617e8dcf", + "reference": "70e65a5470a42cfec1a7da00d30edb6e617e8dcf", + "shasum": "" + }, + "require": { + "php": ">=5.3.0", + "psr/log": "~1.0" + }, + "provide": { + "psr/log-implementation": "1.0.0" + }, + "require-dev": { + "aws/aws-sdk-php": "^2.4.9 || ^3.0", + "doctrine/couchdb": "~1.0@dev", + "graylog2/gelf-php": "~1.0", + "jakub-onderka/php-parallel-lint": "0.9", + "php-amqplib/php-amqplib": "~2.4", + "php-console/php-console": "^3.1.3", + "phpunit/phpunit": "~4.5", + "phpunit/phpunit-mock-objects": "2.3.0", + "ruflin/elastica": ">=0.90 <3.0", + "sentry/sentry": "^0.13", + "swiftmailer/swiftmailer": "^5.3|^6.0" + }, + "suggest": { + "aws/aws-sdk-php": "Allow sending log messages to AWS services like DynamoDB", + "doctrine/couchdb": "Allow sending log messages to a CouchDB server", + "ext-amqp": "Allow sending log messages to an AMQP server (1.0+ required)", + "ext-mongo": "Allow sending log messages to a MongoDB server", + "graylog2/gelf-php": "Allow sending log messages to a GrayLog2 server", + "mongodb/mongodb": "Allow sending log messages to a MongoDB server via PHP Driver", + "php-amqplib/php-amqplib": "Allow sending log messages to an AMQP server using php-amqplib", + "php-console/php-console": "Allow sending log messages to Google Chrome", + "rollbar/rollbar": "Allow sending log messages to Rollbar", + "ruflin/elastica": "Allow sending log messages to an Elastic Search server", + "sentry/sentry": "Allow sending log messages to a Sentry server" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0.x-dev" + } + }, "autoload": { "psr-4": { - "Spatie\\PdfToImage\\": "src" + "Monolog\\": "src/Monolog" } }, "notification-url": "https://packagist.org/downloads/", @@ -39,22 +175,115 @@ ], "authors": [ { - "name": "Freek Van der Herten", - "email": "freek@spatie.be", - "homepage": "https://spatie.be", - "role": "Developer" + "name": "Jordi Boggiano", + "email": "j.boggiano@seld.be", + "homepage": "http://seld.be" } ], - "description": "Convert a pdf to an image", - "homepage": "https://github.com/spatie/pdf-to-image", + "description": "Sends your logs to files, sockets, inboxes, databases and various web services", + "homepage": "http://github.com/Seldaek/monolog", "keywords": [ - "convert", - "image", - "pdf", - "pdf-to-image", - "spatie" + "log", + "logging", + "psr-3" ], - "time": "2018-07-02T09:30:32+00:00" + "time": "2019-09-06T13:49:17+00:00" + }, + { + "name": "psr/log", + "version": "1.1.0", + "source": { + "type": "git", + "url": "https://github.com/php-fig/log.git", + "reference": "6c001f1daafa3a3ac1d8ff69ee4db8e799a654dd" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/log/zipball/6c001f1daafa3a3ac1d8ff69ee4db8e799a654dd", + "reference": "6c001f1daafa3a3ac1d8ff69ee4db8e799a654dd", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Log\\": "Psr/Log/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interface for logging libraries", + "homepage": "https://github.com/php-fig/log", + "keywords": [ + "log", + "psr", + "psr-3" + ], + "time": "2018-11-20T15:27:04+00:00" + }, + { + "name": "symfony/process", + "version": "v3.4.32", + "source": { + "type": "git", + "url": "https://github.com/symfony/process.git", + "reference": "344dc588b163ff58274f1769b90b75237f32ed16" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/process/zipball/344dc588b163ff58274f1769b90b75237f32ed16", + "reference": "344dc588b163ff58274f1769b90b75237f32ed16", + "shasum": "" + }, + "require": { + "php": "^5.5.9|>=7.0.8" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.4-dev" + } + }, + "autoload": { + "psr-4": { + "Symfony\\Component\\Process\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony Process Component", + "homepage": "https://symfony.com", + "time": "2019-09-25T14:09:38+00:00" }, { "name": "thiagoalessio/tesseract_ocr", @@ -100,12 +329,56 @@ "text recognition" ], "time": "2018-05-11T14:22:47+00:00" + }, + { + "name": "waarneembemiddeling/php-pdfimages", + "version": "dev-master", + "source": { + "type": "git", + "url": "https://github.com/waarneembemiddeling/php-pdfimages.git", + "reference": "1594a11a7aff69418a07802d81f535cf1b434428" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/waarneembemiddeling/php-pdfimages/zipball/1594a11a7aff69418a07802d81f535cf1b434428", + "reference": "1594a11a7aff69418a07802d81f535cf1b434428", + "shasum": "" + }, + "require": { + "alchemy/binary-driver": "~1.5" + }, + "require-dev": { + "phpunit/phpunit": "~4.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Wb\\PdfImages\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Kristian Zondervan", + "email": "k.zondervan@waarneembemiddeling.nl" + }, + { + "name": "Waarneembemiddeling.nl developers", + "email": "development@waarneembemiddeling.nl" + } + ], + "time": "2015-07-09T09:44:00+00:00" } ], "packages-dev": [], "aliases": [], "minimum-stability": "stable", - "stability-flags": [], + "stability-flags": { + "waarneembemiddeling/php-pdfimages": 20 + }, "prefer-stable": false, "prefer-lowest": false, "platform": [], diff --git a/lib/Service/TesseractService.php b/lib/Service/TesseractService.php index 45caa6d..d865dd6 100644 --- a/lib/Service/TesseractService.php +++ b/lib/Service/TesseractService.php @@ -39,10 +39,9 @@ use OCP\Files_FullTextSearch\Model\AFilesDocument; use OCP\FullTextSearch\Model\IIndexDocument; use OCP\FullTextSearch\Model\ISearchRequest; -use Spatie\PdfToImage\Exceptions\PageDoesNotExist; -use Spatie\PdfToImage\Pdf; use Symfony\Component\EventDispatcher\GenericEvent; use thiagoalessio\TesseractOCR\TesseractOCR; +use Wb\PdfImages\PdfImages; /** @@ -214,29 +213,23 @@ private function ocrPdf(AFilesDocument $document, File $file): bool { return true; } + try { - $path = $this->getAbsolutePath($file); - $pdf = new Pdf($path); + $path = $this->getAbsolutePath($file); + $pdfImages = PdfImages::create()->extractImages($path); } catch (Exception $e) { $this->miscService->log('Exception while ocr pdf file: ' . $e->getMessage(), 1); throw new NotFoundException(); } $content = ''; - for ($i = 1; $i <= $pdf->getNumberOfPages(); $i++) { - // we create a temp image file - $tmpFile = tmpfile(); - $tmpPath = stream_get_meta_data($tmpFile)['uri']; - - try { - $pdf->setPage($i); - $pdf->saveImage($tmpPath); - - $content .= $this->ocrFileFromPath($tmpPath); - } catch (PageDoesNotExist $e) { - } + foreach ($pdfImages as $pdfImage) { + $imageText = $this->ocrFileFromPath($pdfImage->getPathname()); + unlink($pdfImage->getPathname()); + $content .= $imageText; } - + rmdir($pdfImages->getPath()); + $document->addPart('ocr', $content); return true;