Skip to content

Commit 215c12a

Browse files
Improvements
1 parent: 9f9e43e — commit: 215c12a

File tree

1 file changed

+11
-24
lines changed

1 file changed

+11
-24
lines changed

documentcloud/documents.py

Lines changed: 11 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ def process(self, **kwargs):
241241
if "ocr_engine" in kwargs:
242242
payload["ocr_engine"] = kwargs["ocr_engine"]
243243

244-
self._client.post(f"{self.api_path}/{self.id}/process/", json=payload or None)
244+
self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
245245

246246

247247
class DocumentClient(BaseAPIClient):
@@ -354,6 +354,9 @@ def _extract_ocr_options(self, kwargs):
354354
force_ocr = kwargs.pop("force_ocr", False)
355355
ocr_engine = kwargs.pop("ocr_engine", "tess4")
356356

357+
if not isinstance(force_ocr, bool):
358+
raise ValueError("force_ocr must be a boolean")
359+
357360
if ocr_engine and ocr_engine not in ("tess4", "textract"):
358361
raise ValueError(
359362
"ocr_engine must be either 'tess4' for tesseract or 'textract'"
@@ -373,15 +376,15 @@ def _upload_url(self, file_url, **kwargs):
373376
# create the document
374377
params = self._format_upload_parameters(file_url, **kwargs)
375378
params["file_url"] = file_url
379+
if force_ocr:
380+
params["force_ocr"] = force_ocr
381+
params["ocr_engine"] = ocr_engine
376382
response = self.client.post("documents/", json=params)
377383
create_json = response.json()
378384

379385
# wrap in Document object
380386
doc = Document(self.client, create_json)
381387

382-
# begin processing if needed
383-
if force_ocr:
384-
doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
385388
return doc
386389

387390
def _upload_file(self, file_, **kwargs):
@@ -544,9 +547,12 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
544547
kwargs.pop("title", None)
545548

546549
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
547-
548550
obj_list = []
549551
params = self._format_upload_parameters("", **kwargs)
552+
# Add OCR options directly to params if needed
553+
if force_ocr:
554+
params["force_ocr"] = force_ocr
555+
params["ocr_engine"] = ocr_engine
550556
for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
551557
# Grouper will put None's on the end of the last group
552558
url_group = [url for url in url_group if url is not None]
@@ -583,25 +589,6 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
583589
create_json = response.json()
584590
obj_list.extend(create_json)
585591

586-
# Begin bulk processing if needed
587-
if force_ocr:
588-
process_payload = [
589-
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
590-
for j in create_json
591-
]
592-
try:
593-
self.client.post("documents/process/", json=process_payload)
594-
except (APIError, RequestException) as exc:
595-
if handle_errors:
596-
logger.info(
597-
"Error processing the following documents: %s\n%s",
598-
exc,
599-
"\n".join(url_group),
600-
)
601-
continue
602-
else:
603-
raise
604-
605592
logger.info("Upload URLs complete")
606593

607594
# Pass back the list of documents

0 commit comments

Comments (0)