@@ -241,7 +241,7 @@ def process(self, **kwargs):
241241 if "ocr_engine" in kwargs :
242242 payload ["ocr_engine" ] = kwargs ["ocr_engine" ]
243243
244- self ._client .post (f"{ self .api_path } /{ self .id } /process/" , json = payload or None )
244+ self ._client .post (f"{ self .api_path } /{ self .id } /process/" , json = payload )
245245
246246
247247class DocumentClient (BaseAPIClient ):
@@ -354,6 +354,9 @@ def _extract_ocr_options(self, kwargs):
354354 force_ocr = kwargs .pop ("force_ocr" , False )
355355 ocr_engine = kwargs .pop ("ocr_engine" , "tess4" )
356356
357+ if not isinstance (force_ocr , bool ):
358+ raise ValueError ("force_ocr must be a boolean" )
359+
357360 if ocr_engine and ocr_engine not in ("tess4" , "textract" ):
358361 raise ValueError (
359362 "ocr_engine must be either 'tess4' for tesseract or 'textract'"
@@ -373,15 +376,15 @@ def _upload_url(self, file_url, **kwargs):
373376 # create the document
374377 params = self ._format_upload_parameters (file_url , ** kwargs )
375378 params ["file_url" ] = file_url
379+ if force_ocr :
380+ params ["force_ocr" ] = force_ocr
381+ params ["ocr_engine" ] = ocr_engine
376382 response = self .client .post ("documents/" , json = params )
377383 create_json = response .json ()
378384
379385 # wrap in Document object
380386 doc = Document (self .client , create_json )
381387
382- # begin processing if needed
383- if force_ocr :
384- doc .process (force_ocr = force_ocr , ocr_engine = ocr_engine )
385388 return doc
386389
387390 def _upload_file (self , file_ , ** kwargs ):
@@ -544,9 +547,12 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
544547 kwargs .pop ("title" , None )
545548
546549 force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
547-
548550 obj_list = []
549551 params = self ._format_upload_parameters ("" , ** kwargs )
552+ # Add OCR options directly to params if needed
553+ if force_ocr :
554+ params ["force_ocr" ] = force_ocr
555+ params ["ocr_engine" ] = ocr_engine
550556 for i , url_group in enumerate (grouper (url_list , BULK_LIMIT )):
551557 # Grouper will put None's on the end of the last group
552558 url_group = [url for url in url_group if url is not None ]
@@ -583,25 +589,6 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
583589 create_json = response .json ()
584590 obj_list .extend (create_json )
585591
586- # Begin bulk processing if needed
587- if force_ocr :
588- process_payload = [
589- {"id" : j ["id" ], "force_ocr" : force_ocr , "ocr_engine" : ocr_engine }
590- for j in create_json
591- ]
592- try :
593- self .client .post ("documents/process/" , json = process_payload )
594- except (APIError , RequestException ) as exc :
595- if handle_errors :
596- logger .info (
597- "Error processing the following documents: %s\n %s" ,
598- exc ,
599- "\n " .join (url_group ),
600- )
601- continue
602- else :
603- raise
604-
605592 logger .info ("Upload URLs complete" )
606593
607594 # Pass back the list of documents
0 commit comments