Skip to content

Commit

Permalink
Merge pull request #8 from daita/feature/noid/ocr-on-pdf
Browse files Browse the repository at this point in the history
convert pdf and ocr the image
  • Loading branch information
ArtificialOwl committed Feb 17, 2019
2 parents ba302d7 + 51fadf7 commit 676a166
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 8 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
}
],
"require": {
"thiagoalessio/tesseract_ocr": "2.4.0"
"thiagoalessio/tesseract_ocr": "2.4.0",
"spatie/pdf-to-image": "1.8.1"
}
}
55 changes: 52 additions & 3 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions js/admin.elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,19 @@ var fts_tesseract_elements = {
tesseract_ocr: null,
tesseract_psm: null,
tesseract_lang: null,
tesseract_pdf: null,

init: function () {
fts_tesseract_elements.tesseract_div = $('#files_ocr-tesseract');
fts_tesseract_elements.tesseract_psm = $('#tesseract_psm');
fts_tesseract_elements.tesseract_lang = $('#tesseract_lang');
fts_tesseract_elements.tesseract_ocr = $('#tesseract_ocr');
fts_tesseract_elements.tesseract_pdf = $('#tesseract_pdf');

fts_tesseract_elements.tesseract_ocr.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_psm.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_lang.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_pdf.on('change', fts_tesseract_elements.updateSettings);
},


Expand Down
4 changes: 3 additions & 1 deletion js/admin.settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ var fts_tesseract_settings = {
fts_tesseract_elements.tesseract_ocr.prop('checked', (result.tesseract_enabled === '1'));
fts_tesseract_elements.tesseract_psm.val(result.tesseract_psm);
fts_tesseract_elements.tesseract_lang.val(result.tesseract_lang);
fts_tesseract_elements.tesseract_pdf.prop('checked', (result.tesseract_pdf === '1'));

fts_admin_settings.tagSettingsAsSaved(fts_tesseract_elements.tesseract_div);

Expand All @@ -69,7 +70,8 @@ var fts_tesseract_settings = {
var data = {
tesseract_enabled: (fts_tesseract_elements.tesseract_ocr.is(':checked')) ? 1 : 0,
tesseract_psm: fts_tesseract_elements.tesseract_psm.val(),
tesseract_lang: fts_tesseract_elements.tesseract_lang.val()
tesseract_lang: fts_tesseract_elements.tesseract_lang.val(),
tesseract_pdf: (fts_tesseract_elements.tesseract_pdf.is(':checked')) ? 1 : 0
};

$.ajax({
Expand Down
1 change: 1 addition & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,6 @@ function(GenericEvent $e) {
}
);
}

}

4 changes: 3 additions & 1 deletion lib/Service/ConfigService.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@ class ConfigService {
const TESSERACT_ENABLED = 'tesseract_enabled';
const TESSERACT_PSM = 'tesseract_psm';
const TESSERACT_LANG = 'tesseract_lang';
const TESSERACT_PDF = 'tesseract_pdf';

public $defaults = [
self::TESSERACT_ENABLED => '0',
self::TESSERACT_PSM => '4',
self::TESSERACT_LANG => 'eng'
self::TESSERACT_LANG => 'eng',
self::TESSERACT_PDF => '0'
];


Expand Down
72 changes: 70 additions & 2 deletions lib/Service/TesseractService.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
use OCP\Files\NotFoundException;
use OCP\Files_FullTextSearch\Model\AFilesDocument;
use OCP\FullTextSearch\Model\IndexDocument;
use OCP\FullTextSearch\Model\ISearchRequest;
use Spatie\PdfToImage\Exceptions\PageDoesNotExist;
use Spatie\PdfToImage\Pdf;
use Symfony\Component\EventDispatcher\GenericEvent;
use thiagoalessio\TesseractOCR\TesseractOCR;

Expand Down Expand Up @@ -80,7 +83,8 @@ public function parsedMimeType(string $mimeType, string $extension): bool {
'image/png',
'image/jpeg',
'image/tiff',
'image/vnd.djvu'
'image/vnd.djvu',
'application/pdf'
];

foreach ($ocrMimes as $mime) {
Expand Down Expand Up @@ -119,6 +123,9 @@ public function onFileIndexing(GenericEvent $e) {
* @param GenericEvent $e
*/
public function onSearchRequest(GenericEvent $e) {
	// The event carries the search request under the 'request' argument.
	/** @var ISearchRequest $request */
	$request = $e->getArgument('request');

	// Register the 'ocr' part on the request. NOTE(review): presumably this makes
	// the search also cover the 'ocr' part that ocrPdf() stores on documents - confirm.
	$request->addPart('ocr');
}


Expand All @@ -134,13 +141,18 @@ private function extractContentUsingTesseractOCR(AFilesDocument &$document, File
}

$extension = pathinfo($document->getPath(), PATHINFO_EXTENSION);

if (!$this->parsedMimeType($document->getMimetype(), $extension)) {
return;
}

// TODO: How to set options so that the index can be reset if admin settings are changed
// $this->configService->setDocumentIndexOption($document, ConfigService::FILES_OCR);

if ($this->ocrPdf($document, $file)) {
return;
}

$content = $this->ocrFile($file);
} catch (Exception $e) {
return;
Expand All @@ -161,9 +173,20 @@ private function ocrFile(File $file): string {
try {
$path = $this->getAbsolutePath($file);
} catch (Exception $e) {
throw new NotFoundException('file not found');
$this->miscService->log('Exception while ocr file: ' . $e->getMessage(), 1);
throw new NotFoundException();
}

return $this->ocrFileFromPath($path);
}


/**
* @param string $path
*
* @return string
*/
private function ocrFileFromPath(string $path): string {
$ocr = new TesseractOCR($path);
$ocr->psm($this->configService->getAppValue(ConfigService::TESSERACT_PSM));
$lang = explode(',', $this->configService->getAppValue(ConfigService::TESSERACT_LANG));
Expand All @@ -175,6 +198,51 @@ private function ocrFile(File $file): string {
}


/**
 * OCR every page of a PDF and store the resulting text as the 'ocr' part of
 * the document.
 *
 * Each page is rendered to a temporary image file which is then handed to
 * ocrFileFromPath(). The text of the pages is joined with a newline.
 *
 * @param AFilesDocument $document document being indexed; receives the 'ocr' part
 * @param File $file file backing the document
 *
 * @return bool false when the document is not a PDF (nothing was done here);
 *              true when the PDF has been dealt with, which includes the case
 *              where the OCR of PDF is disabled in the settings
 * @throws NotFoundException when the path of the file cannot be resolved or
 *                           the PDF cannot be opened
 */
private function ocrPdf(AFilesDocument $document, File $file): bool {
	if ($document->getMimetype() !== 'application/pdf') {
		return false;
	}

	// PDF are only processed when explicitly enabled; returning true tells the
	// caller that the file has been taken care of.
	if ($this->configService->getAppValue(ConfigService::TESSERACT_PDF) !== '1') {
		return true;
	}

	try {
		$path = $this->getAbsolutePath($file);
		$pdf = new Pdf($path);
	} catch (Exception $e) {
		$this->miscService->log('Exception while ocr pdf file: ' . $e->getMessage(), 1);
		throw new NotFoundException();
	}

	$pages = [];
	$numberOfPages = $pdf->getNumberOfPages();
	for ($i = 1; $i <= $numberOfPages; $i++) {
		// we create a temp image file
		$tmpFile = tmpfile();
		if ($tmpFile === false) {
			// Without a temp file no further page can be rendered; keep what
			// has been extracted so far.
			$this->miscService->log('Cannot create temp file while ocr pdf page ' . $i, 1);
			break;
		}

		try {
			$tmpPath = stream_get_meta_data($tmpFile)['uri'];

			$pdf->setPage($i);
			$pdf->saveImage($tmpPath);

			$pages[] = $this->ocrFileFromPath($tmpPath);
		} catch (PageDoesNotExist $e) {
			// Best effort: a missing page is logged and skipped.
			$this->miscService->log(
				'Exception while ocr pdf page ' . $i . ': ' . $e->getMessage(), 1
			);
		} finally {
			// Closing the handle releases the temp file, even when the OCR
			// of the page throws.
			fclose($tmpFile);
		}
	}

	// Pages are separated by a newline so that the last word of a page cannot
	// fuse with the first word of the next one.
	$document->addPart('ocr', implode("\n", $pages));

	return true;
}


/**
* @param string $extension
*
Expand Down
12 changes: 12 additions & 0 deletions templates/settings.admin.php
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,18 @@
<input type="text" class="big" id="tesseract_lang" value=""/>
</div>
</div>

<div class="div-table-row tesseract_ocr_enabled">
<div class="div-table-col div-table-col-left">
<span class="leftcol">PDF</span>
<br/>
<em>enable the OCR of PDF (heavy on resource)</em>
</div>
<div class="div-table-col">
<input type="checkbox" id="tesseract_pdf" value="1"/>
</div>
</div>

</div>


Expand Down

0 comments on commit 676a166

Please sign in to comment.