default.conf

#!/usr/bin/env bash

###### pmOCR - batch & service wrapper for OCR tools
###### (C) 2014-2022 by Orsiris de Jong (www.netpower.fr)
###### pmOCR v1.5.4 - 1.8.2 config file 2022050801
CONFIG_FILE_REVISION=1

## ---------- GENERAL OPTIONS

## Name that identifies this instance
INSTANCE_ID="MyOCRServer"

## Extensions an input file must have in order to be processed
FILES_TO_PROCESS='\(pdf\|tif\|tiff\|png\|jpg\|jpeg\|bmp\|pcx\|dcx\)'

## How many OCR subprocesses are started simultaneously. For best performance, do not exceed the number of CPU cores.
NUMBER_OF_PROCESSES="4"

## Copy user and group ownership from the input file to the output file (works only if executed as root).
PRESERVE_OWNERSHIP="no"
## Permissions given to output files. Defaults to 644 (works only if executed as root).
FILE_PERMISSIONS=""

## OCR engine selection. Tune the matching *_OCR_ENGINE_ARGS settings to your needs, language settings in particular.

# Accepted values: abbyyocr11, tesseract (tesseract 3.x, 4.x or 5.x)
OCR_ENGINE="tesseract"

# How new files are detected:
# true: rely on inotifywait (works when the mountpoint is local)
# false: rely on the integrated inotifywait emulation, which also works on SMB/NFS shares but takes more resources because it is poller based (the poller interval is measured in seconds)
INOTIFYWAIT_SUPPORT="false"
INOTIFY_POLLER_INTERVAL="30"

## ---------- OCR Engine arguments

	# AbbyyOCR11 Engine Arguments
	#############################

## -lpp = load predefined profile / TextExtraction_Accuracy = name of the predefined profile / -adb = detect barcodes / -ido = detect and rotate image orientation / -adtop = detect text embedded in images
## -rl = list of languages for the document (French,English,Spanish) / -recc = enhanced character confidence
##### PDF related arguments: -pfs = PDF export preset (Balanced) / -pacm = PDF/A standard (Pdfa_3a) / -ptem = specifies the mode of export of recognized text into PDF (PDF/A) format.
##### DOCX related arguments: -dheb = highlights uncertainly recognized characters with a background color when exporting to DOCX format (color defined by the -deb parameter).
##### -deb 0xFFFF00 (yellow highlights)
##### XLSX related arguments: -xlto = only export text from tables / -xlrf = remove formatting from text / -xllrm = sets the mode of retaining the original document tables' layout in the output XLSX file (Default, ExactDocument, ExactLines)

## Full path to the AbbyyOCR11 engine executable

ABBYY_OCR_ENGINE_EXEC=/usr/local/bin/abbyyocr11

# Quality may be set to Balanced, MaxSpeed, MaxQuality, MinSize
ABBYY_PDF_QUALITY=Balanced
# One argument string per output format; each string ends with a -f option (pdf, docx, xlsx, TextUnicodeDefaults).
# $ABBYY_PDF_QUALITY sits inside single quotes, so it is stored literally and is NOT expanded when this file is loaded.
# NOTE(review): presumably whatever builds the engine command line expands it later - confirm before changing the quoting.
ABBYY_PDF_OCR_ENGINE_ARGS='-lpp TextExtraction_Accuracy -adb -ido -adtop -rl French,English,Spanish -recc -pfs $ABBYY_PDF_QUALITY -pacm Pdfa_3a -ptem ImageOnText -f pdf'
ABBYY_WORD_OCR_ENGINE_ARGS='-lpp TextExtraction_Accuracy -adb -ido -adtop -rl French,English,Spanish -recc -f docx'
# NOTE(review): the value below starts with a space and uses -rpihp, which is not described in this file - confirm both are intended.
ABBYY_EXCEL_OCR_ENGINE_ARGS=' -lpp TextExtraction_Accuracy -adb -ido -adtop -rl French,English,Spanish -recc -rpihp -xlrf -xllrm ExactLines -f xlsx'
ABBYY_TEXT_OCR_ENGINE_ARGS='-lpp TextExtraction_Accuracy -adb -ido -adtop -rl French,English,Spanish -recc -trl -f TextUnicodeDefaults'
# NOTE(review): identical to ABBYY_TEXT_OCR_ENGINE_ARGS (same -f TextUnicodeDefaults) - confirm this is intended for CSV output.
ABBYY_CSV_OCR_ENGINE_ARGS='-lpp TextExtraction_Accuracy -adb -ido -adtop -rl French,English,Spanish -recc -trl -f TextUnicodeDefaults'
# Engine options for the input and output file (presumably placed right before the respective file name - confirm against the caller)
ABBYY_OCR_ENGINE_INPUT_ARG='-if'
ABBYY_OCR_ENGINE_OUTPUT_ARG='-of'


	# tesseract Engine Arguments
	################################

## Resolution the tesseract preprocessor and the intermediary transformations work at
## Set it to the highest resolution of your scanned documents. 300-600 are good values, but 600 is quite CPU hungry
RESOLUTION="600"

## Full path to the tesseract OCR engine, followed by its per-format arguments
TESSERACT_OCR_ENGINE_EXEC="/usr/bin/tesseract"
TESSERACT_PDF_OCR_ENGINE_ARGS="pdf"
TESSERACT_TEXT_OCR_ENGINE_ARGS=""
TESSERACT_CSV_OCR_ENGINE_ARGS=""
TESSERACT_OCR_ENGINE_INPUT_ARG="-l eng" # Language setting
TESSERACT_OCR_ENGINE_OUTPUT_ARG=""
## Intermediary PDF to TIFF transformation for tesseract (RESOLUTION is expanded here, when this file is loaded)
TESSERACT_PDF_TO_TIFF_EXEC="/usr/bin/convert"
TESSERACT_PDF_TO_TIFF_OPTS="-density ${RESOLUTION} -compress lzw"
# Older ghostscript conversion
#TESSERACT_PDF_TO_TIFF_EXEC=/usr/bin/gs
#TESSERACT_PDF_TO_TIFF_OPTS=' -q -dNOPAUSE -r'${RESOLUTION}'x'${RESOLUTION}' -sDEVICE=tiff32nc -sCompression=lzw -dBATCH -sOUTPUTFILE='

## Optional arguments passed to tesseract
## The example below selects the OCR engine mode on Tesseract 4.x/5.x (see tesseract --help-extra)
# oem 0 is the legacy engine; as of the tesseract 5.0.0 release with github/tesseract/tessdata traineddata it gives better results
# oem 1 is the LSTM engine
TESSERACT_OPTIONAL_ARGS="--oem 0"

	# Preprocessor Arguments (only for tesseract)
	#############################################

## Optional preprocessor used to correct scanned images (don't use it with abbyy11, which already contains its own preprocessor)
## Uncomment the OCR_PREPROCESSOR_EXEC lines to use it
## Examples can be found at http://www.imagemagick.org/discourse-server/viewtopic.php?t=22226

OCR_PREPROCESSOR_EXEC="/usr/bin/convert"
# RESOLUTION is spliced in between the two single-quoted halves, so it is expanded when this file is loaded
OCR_PREPROCESSOR_ARGS='-units PixelsPerInch -respect-parenthesis \( -compress lzw -density '"${RESOLUTION}"' -bordercolor black -border 1 -trim +repage -fill white -draw "color 0,0 floodfill" -alpha off -shave 1x1 \) \( -bordercolor black -border 2 -fill white -draw "color 0,0 floodfill" -alpha off -shave 0x1 -deskew 40 +repage \) -antialias -sharpen 0x3'
OCR_PREPROCESSOR_INPUT_ARG=""
OCR_PREPROCESSOR_OUTPUT_ARG=""

#######################################################################
### THE FOLLOWING PARAMETERS ARE USED WHEN pmOCR IS RUN AS SERVICE ####
###     YOU MAY SET THEM IN COMMAND LINE WHEN USING BATCH MODE     ####
#######################################################################

## Space separated list of mail addresses that receive alerts
DESTINATION_MAILS='infrastructure@example.com'

## Optional mail body encoding change (done with iconv)
## By default, all mails are sent in UTF-8 format without header (for maximum compatibility across platforms)
## An optional encoding may be given here (like "ISO-8859-1" or whatever iconv can handle)
MAIL_BODY_CHARSET=''

## Monitored directories, one per output format (leave a variable empty to disable that specific monitoring).
## As of today, Tesseract only handles PDF, TXT and CSV
PDF_MONITOR_DIR='/storage/service_ocr/PDF'
WORD_MONITOR_DIR='/storage/service_ocr/WORD'
EXCEL_MONITOR_DIR='/storage/service_ocr/EXCEL'
TEXT_MONITOR_DIR='/storage/service_ocr/TEXT'
CSV_MONITOR_DIR='/storage/service_ocr/CSV'

## File extension for each output format
PDF_EXTENSION='.pdf'
WORD_EXTENSION='.docx'
EXCEL_EXTENSION='.xlsx'
TEXT_EXTENSION='.txt'
CSV_EXTENSION='.csv'

## After successful processing, move the original file into a path that the monitor ignores.
## Uncommenting this setting automatically disables the DELETE_ORIGINAL and FILENAME_SUFFIX values.
#MOVE_ORIGINAL_ON_SUCCESS="/storage/service_ocr/done"

## Move a file that failed to process into a path that the monitor ignores.
## Uncommenting this setting automatically disables the FAILED_FILENAME_SUFFIX value.
#MOVE_ORIGINAL_ON_FAILURE="/storage/service_ocr/failed"

## Optional suffix appended to OCRed files (ex: input.tiff becomes input_OCR.pdf). Any file containing this suffix is ignored. May be left empty.
FILENAME_SUFFIX='_OCR'

## Suffix appended to failed files so they do not get processed in a loop. May be left empty.
FAILED_FILENAME_SUFFIX='_OCR_ERR'

## Whether the original file is deleted upon successful processing (has no effect if MOVE_ORIGINAL_ON_SUCCESS is set) (true/false)
DELETE_ORIGINAL='false'

# Alternative check for already OCRed PDFs (checks if a pdf contains a font). This prevents images integrated in already indexed PDFs from getting OCRed. (true/false)
CHECK_PDF='true'

## Add some extra info to the filename. The example here adds a pseudo ISO 8601 timestamp after a dot (pseudo because the colon sign would render the filename quite weird).
## Keep variables between single quotes if you want them to expand at runtime. Leave this variable empty if you don't want to add anything (it is also added to moved files).
## The single quotes below store the $(date ...) text literally, so date is not run when this file is loaded.
FILENAME_ADDITION='.$(date --utc +"%Y-%m-%dT%H-%M-%SZ")'

## Max time before triggering a forced OCR run when no file actions are detected
## NOTE(review): the unit is not stated here; presumably seconds - confirm.
MAX_TIME=3600