Skip to content

Commit

Permalink
Merge pull request #56 from nextcloud/enh/noid/compat-27
Browse files: browse the repository at this point in the history
compat nc27
  • Loading branch information
ArtificialOwl authored Jun 13, 2023
2 parents 7f4bebf + f10d797 commit e1405e4
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 108 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ cert_dir=$(HOME)/.nextcloud/certificates
github_account=nextcloud
release_account=nextcloud-releases
branch=master
version=25.0.0-alpha1
since_tag=24.0.0
version=27.0.0
since_tag=

all: appstore

Expand Down
4 changes: 2 additions & 2 deletions appinfo/info.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ which is a wrapper for the command line program [Tesseract OCR](https://github.c
Tesseract must be installed locally, and configured.
]]>
</description>
<version>25.0.0-alpha1</version>
<version>27.0.0</version>
<licence>agpl</licence>
<author>Maxence Lange</author>
<namespace>Files_FullTextSearch_Tesseract</namespace>
Expand All @@ -27,7 +27,7 @@ Tesseract must be installed locally, and configured.
<repository>https://github.com/nextcloud/files_fulltextsearch_tesseract.git</repository>
<screenshot>https://raw.githubusercontent.com/nextcloud/fulltextsearch/master/screenshots/0.3.0.png</screenshot>
<dependencies>
<nextcloud min-version="25" max-version="25"/>
<nextcloud min-version="26" max-version="27"/>
</dependencies>

<settings>
Expand Down
5 changes: 2 additions & 3 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
}
},
"require": {
"artificial-owl/my-small-php-tools": "~22",
"thiagoalessio/tesseract_ocr": "2.4.0",
"spatie/pdf-to-image": "2.1.0"
"thiagoalessio/tesseract_ocr": "2.12.0",
"spatie/pdf-to-image": "2.2.0"
}
}
77 changes: 18 additions & 59 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

73 changes: 31 additions & 42 deletions lib/Service/TesseractService.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
namespace OCA\Files_FullTextSearch_Tesseract\Service;


use ArtificialOwl\MySmallPhpTools\Traits\Nextcloud\nc22\TNC22Logger;
use Exception;
use OC\Files\View;
use OCP\EventDispatcher\GenericEvent;
Expand All @@ -41,6 +40,7 @@
use OCP\Files_FullTextSearch\Model\AFilesDocument;
use OCP\FullTextSearch\Model\IIndexDocument;
use OCP\FullTextSearch\Model\ISearchRequest;
use Psr\Log\LoggerInterface;
use Spatie\PdfToImage\Exceptions\PageDoesNotExist;
use Spatie\PdfToImage\Pdf;
use thiagoalessio\TesseractOCR\TesseractOCR;
Expand All @@ -54,23 +54,10 @@
*/
class TesseractService {


use TNC22Logger;


/** @var ConfigService */
private $configService;


/**
* TesseractService constructor.
*
* @param ConfigService $configService
*/
public function __construct(ConfigService $configService) {
$this->configService = $configService;

$this->setup('app', 'files_fulltextsearch_tesseract');
public function __construct(
private ConfigService $configService,
private LoggerInterface $logger
) {
}


Expand Down Expand Up @@ -106,7 +93,7 @@ public function parsedMimeType(string $mimeType, string $extension): bool {
/**
* @param GenericEvent $e
*/
public function onFileIndexing(GenericEvent $e) {
public function onFileIndexing(GenericEvent $e): void {
/** @var Node $file */
$file = $e->getArgument('file');

Expand Down Expand Up @@ -135,8 +122,7 @@ public function onSearchRequest(GenericEvent $e) {
* @param AFilesDocument $document
* @param File $file
*/
private function extractContentUsingTesseractOCR(AFilesDocument &$document, File $file) {

private function extractContentUsingTesseractOCR(AFilesDocument &$document, File $file): void {
try {
if ($this->configService->getAppValue(ConfigService::TESSERACT_ENABLED) !== '1') {
return;
Expand All @@ -148,7 +134,7 @@ private function extractContentUsingTesseractOCR(AFilesDocument &$document, File
return;
}

$this->debug(
$this->logger->debug(
'extracting content using TesseractOCR',
[
'documentId' => $document->getId(),
Expand Down Expand Up @@ -184,7 +170,7 @@ private function ocrFile(File $file): string {
try {
$path = $this->getAbsolutePath($file);
} catch (Exception $e) {
$this->exception($e, self::$NOTICE);
$this->logger->notice('issue during ocrFile()', ['exception' => $e]);
throw new NotFoundException();
}

Expand All @@ -198,23 +184,28 @@ private function ocrFile(File $file): string {
* @return string
*/
private function ocrFileFromPath(string $path): string {
$this->debug('generating the TesseractOCR wrapper', ['path' => $path]);
$this->logger->debug('generating the TesseractOCR wrapper', ['path' => $path]);

$ocr = new TesseractOCR($path);
$ocr->psm($this->configService->getAppValue(ConfigService::TESSERACT_PSM));
$lang = explode(',', $this->configService->getAppValue(ConfigService::TESSERACT_LANG));
call_user_func_array([$ocr, 'lang'], array_map('trim', $lang));
$this->debug('running the OCR command', ['command' => $ocr->command]);
$this->logger->debug('running the OCR command', ['command' => $ocr->command]);

if ($this->configService->getLogLevel() > 0) {
$ocr->command .= ' 2> /dev/null';
}
// if ($this->configService->getLogLevel() > 0) {
// $ocr->command .= ' 2> /dev/null';
// }

try {
$result = $ocr->run();
$this->debug('OCR command ran smoothly');
$this->logger->debug('OCR command ran smoothly');
} catch (Exception $e) {
$this->exception($e, self::$NOTICE, ['path' => $path, 'cmd' => $ocr->command, 'lang' => $lang]);
$this->logger->notice('failed to OCR', [
'exception' => $e,
'path' => $path,
'cmd' => $ocr->command,
'lang' => $lang
]);
$result = '';
}

Expand All @@ -238,38 +229,38 @@ private function ocrPdf(AFilesDocument $document, File $file): bool {
return true;
}

$this->debug('looks like we\'re working on a PDF file');
$this->logger->debug('looks like we\'re working on a PDF file');

try {
$path = $this->getAbsolutePath($file);
$this->debug('Absolute path', ['path' => $path]);
$this->logger->debug('Absolute path', ['path' => $path]);
$pdf = new Pdf($path);
} catch (Exception $e) {
$this->exception($e, self::$NOTICE, ['document' => $document]);
$this->logger->notice('failed to ocrPdf', ['exception' => $e, 'document' => $document]);
throw new NotFoundException();
}

$content = '';
$pages = $pdf->getNumberOfPages();
$this->debug('PDF contains ' . $pages . ' page(s)');
$this->logger->debug('PDF contains ' . $pages . ' page(s)');

$limit = (int)$this->configService->getAppValue(ConfigService::TESSERACT_PDF_LIMIT);
$pages = ($limit > 0 && $pages > $limit) ? $limit : $pages;
$this->debug('App will now ocr ' . $pages . ' page(s)');
$this->logger->debug('App will now ocr ' . $pages . ' page(s)');


for ($i = 1; $i <= $pages; $i++) {
$this->debug('Creating a temp image file for page #' . $i);
$this->logger->debug('Creating a temp image file for page #' . $i);

$tmpFile = tmpfile();
$tmpPath = stream_get_meta_data($tmpFile)['uri'];
$this->debug('temp image file: ' . $tmpPath . ' for page #' . $i);
$this->logger->debug('temp image file: ' . $tmpPath . ' for page #' . $i);

try {
$this->debug('opening the PDF at the page #' . $i);
$this->logger->debug('opening the PDF at the page #' . $i);
$pdf->setPage($i);

$this->debug('saving the current page as image', ['tmpPath' => $tmpPath]);
$this->logger->debug('saving the current page as image', ['tmpPath' => $tmpPath]);
$pdf->saveImage($tmpPath);

$content .= $this->ocrFileFromPath($tmpPath);
Expand All @@ -279,7 +270,7 @@ private function ocrPdf(AFilesDocument $document, File $file): bool {
fclose($tmpFile);
}

$this->debug('Saving the data into the IndexDocument');
$this->logger->debug('Saving the data into the IndexDocument');
$document->addPart('ocr', $content);

return true;
Expand Down Expand Up @@ -315,6 +306,4 @@ private function getAbsolutePath(File $file): string {

return $view->getLocalFile($file->getPath());
}


}

0 comments on commit e1405e4

Please sign in to comment.