@@ -74,8 +74,11 @@ def __str__(self):
7474 def __getattr__ (self , attr ):
7575 """Generate methods for fetching resources"""
7676 p_image = re .compile (
77- r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
77+ r"^get_"
78+ r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
79+ r"(?P<list>_list)?$"
7880 )
81+
7982 get = attr .startswith ("get_" )
8083 url = attr .endswith ("_url" )
8184 text = attr .endswith ("_text" )
@@ -230,9 +233,15 @@ def get_errors(self):
230233
231234 return all_results
232235
233- def process (self ):
234- """Reprocess the document"""
235- self ._client .post (f"{ self .api_path } /{ self .id } /process/" )
def process(self, **kwargs):
    """Post this document to its processing endpoint.

    Only the ``force_ocr`` and ``ocr_engine`` keyword arguments are
    forwarded, in that order; any other keyword argument is ignored.
    When neither is supplied the request is sent without a JSON body.
    """
    options = {
        key: kwargs[key]
        for key in ("force_ocr", "ocr_engine")
        if key in kwargs
    }
    endpoint = f"{self.api_path}/{self.id}/process/"

    # An empty dict is falsy, so "no options" is passed as json=None.
    self._client.post(endpoint, json=options or None)
236245
237246
238247class DocumentClient (BaseAPIClient ):
@@ -310,6 +319,7 @@ def _format_upload_parameters(self, name, **kwargs):
310319 "title" ,
311320 "data" ,
312321 "force_ocr" ,
322+ "ocr_engine" ,
313323 "projects" ,
314324 "delayed_index" ,
315325 "revision_control" ,
@@ -333,21 +343,52 @@ def _format_upload_parameters(self, name, **kwargs):
333343
334344 return params
335345
346+ def _extract_ocr_options (self , kwargs ):
347+ """
348+ Extract and validate OCR options from kwargs.
349+
350+ Returns:
351+ force_ocr (bool)
352+ ocr_engine (str or None)
353+ """
354+ force_ocr = kwargs .pop ("force_ocr" , False )
355+ ocr_engine = kwargs .pop ("ocr_engine" , "tess4" )
356+
357+ if ocr_engine and ocr_engine not in ("tess4" , "textract" ):
358+ raise ValueError (
359+ "ocr_engine must be either 'tess4' for tesseract or 'textract'"
360+ )
361+
362+ return force_ocr , ocr_engine
363+
336364 def _get_title (self , name ):
337365 """Get the default title for a document from its path"""
338366 return name .split (os .sep )[- 1 ].rsplit ("." , 1 )[0 ]
339367
def _upload_url(self, file_url, **kwargs):
    """Create a document from a publicly accessible URL.

    The OCR options are popped from ``kwargs`` before the creation
    parameters are built.  A processing request is issued afterwards
    only when ``force_ocr`` is truthy.  Returns the new ``Document``.
    """
    force_ocr, ocr_engine = self._extract_ocr_options(kwargs)

    # Build and send the creation request
    payload = self._format_upload_parameters(file_url, **kwargs)
    payload["file_url"] = file_url
    created = self.client.post("documents/", json=payload).json()
    document = Document(self.client, created)

    # Nothing more to do unless OCR was explicitly forced
    if not force_ocr:
        return document

    document.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
    return document
346386
347387 def _upload_file (self , file_ , ** kwargs ):
348388 """Upload a document directly"""
349389 # create the document
350- force_ocr = kwargs .pop ("force_ocr" , False )
390+ force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
391+
351392 params = self ._format_upload_parameters (file_ .name , ** kwargs )
352393 response = self .client .post ("documents/" , json = params )
353394
@@ -357,12 +398,12 @@ def _upload_file(self, file_, **kwargs):
357398 response = requests_retry_session ().put (presigned_url , data = file_ .read ())
358399
359400 # begin processing the document
360- doc_id = create_json ["id" ]
361- response = self .client .post (
362- f"documents/{ doc_id } /process/" , json = {"force_ocr" : force_ocr }
363- )
401+ doc = Document (self .client , create_json )
364402
365- return Document (self .client , create_json )
403+ # begin processing
404+ doc .process (force_ocr = force_ocr , ocr_engine = ocr_engine )
405+
406+ return doc
366407
367408 def _collect_files (self , path , extensions ):
368409 """Find the paths to files with specified extensions under a directory"""
@@ -410,7 +451,9 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
410451 # Upload all the files using the bulk API to reduce the number
411452 # of API calls and improve performance
412453 obj_list = []
454+ force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
413455 params = self ._format_upload_parameters ("" , ** kwargs )
456+
414457 for i , file_paths in enumerate (grouper (path_list , BULK_LIMIT )):
415458 # Grouper will put None's on the end of the last group
416459 file_paths = [p for p in file_paths if p is not None ]
@@ -471,9 +514,13 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
471514
472515 # Begin processing the documents
473516 logger .info ("Processing the documents..." )
474- doc_ids = [j ["id" ] for j in create_json ]
517+ process_payload = [
518+ {"id" : j ["id" ], "force_ocr" : force_ocr , "ocr_engine" : ocr_engine }
519+ for j in create_json
520+ ]
521+
475522 try :
476- response = self .client .post ("documents/process/" , json = { "ids" : doc_ids } )
523+ response = self .client .post ("documents/process/" , json = process_payload )
477524 except (APIError , RequestException ) as exc :
478525 if handle_errors :
479526 logger .info (
@@ -484,7 +531,7 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
484531 continue
485532 else :
486533 raise
487-
534+ logger . info ( "Process payload: %s" , process_payload )
488535 logger .info ("Upload directory complete" )
489536
490537 # Pass back the list of documents
@@ -496,6 +543,8 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
496543 # Do not set the same title for all documents
497544 kwargs .pop ("title" , None )
498545
546+ force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
547+
499548 obj_list = []
500549 params = self ._format_upload_parameters ("" , ** kwargs )
501550 for i , url_group in enumerate (grouper (url_list , BULK_LIMIT )):
@@ -534,6 +583,25 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
534583 create_json = response .json ()
535584 obj_list .extend (create_json )
536585
586+ # Begin bulk processing if needed
587+ if force_ocr :
588+ process_payload = [
589+ {"id" : j ["id" ], "force_ocr" : force_ocr , "ocr_engine" : ocr_engine }
590+ for j in create_json
591+ ]
592+ try :
593+ self .client .post ("documents/process/" , json = process_payload )
594+ except (APIError , RequestException ) as exc :
595+ if handle_errors :
596+ logger .info (
597+ "Error processing the following documents: %s\n %s" ,
598+ exc ,
599+ "\n " .join (url_group ),
600+ )
601+ continue
602+ else :
603+ raise
604+
537605 logger .info ("Upload URLs complete" )
538606
539607 # Pass back the list of documents
0 commit comments