Skip to content

Commit ce7320e

Browse files
First pass at handling force_ocr in upload options
1 parent 02a0b60 commit ce7320e

File tree

3 files changed

+86
-18
lines changed

3 files changed

+86
-18
lines changed

docs/conf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,16 @@
4848

4949
# General information about the project.
5050
project = "documentcloud"
51-
copyright = "2023, MuckRock Foundation"
51+
copyright = "2025, MuckRock Foundation"
5252

5353
# The version info for the project you're documenting, acts as replacement for
5454
# |version| and |release|, also used in various other places throughout the
5555
# built documents.
5656
#
5757
# The short X.Y version.
58-
version = "4.3"
58+
version = "4.5"
5959
# The full version, including alpha/beta/rc tags.
60-
release = "4.3.0"
60+
release = "4.5.0"
6161

6262
# The language for content autogenerated by Sphinx. Refer to documentation
6363
# for a list of supported languages.

documentcloud/documents.py

Lines changed: 82 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,11 @@ def __str__(self):
7474
def __getattr__(self, attr):
7575
"""Generate methods for fetching resources"""
7676
p_image = re.compile(
77-
r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
77+
r"^get_"
78+
r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
79+
r"(?P<list>_list)?$"
7880
)
81+
7982
get = attr.startswith("get_")
8083
url = attr.endswith("_url")
8184
text = attr.endswith("_text")
@@ -230,9 +233,15 @@ def get_errors(self):
230233

231234
return all_results
232235

233-
def process(self):
234-
"""Reprocess the document"""
235-
self._client.post(f"{self.api_path}/{self.id}/process/")
236+
def process(self, **kwargs):
    """Ask the API to process this document.

    Used right after an upload and for reprocessing an existing document.

    Only the ``force_ocr`` and ``ocr_engine`` keyword arguments are
    forwarded, and only when the caller actually supplied them; any other
    keyword argument is ignored.  When neither is supplied the request is
    posted without a JSON body.
    """
    forwarded_keys = ("force_ocr", "ocr_engine")
    options = {key: kwargs[key] for key in forwarded_keys if key in kwargs}

    # An empty options dict is sent as ``json=None`` rather than ``{}``.
    body = options if options else None
    self._client.post(f"{self.api_path}/{self.id}/process/", json=body)
236245

237246

238247
class DocumentClient(BaseAPIClient):
@@ -310,6 +319,7 @@ def _format_upload_parameters(self, name, **kwargs):
310319
"title",
311320
"data",
312321
"force_ocr",
322+
"ocr_engine",
313323
"projects",
314324
"delayed_index",
315325
"revision_control",
@@ -333,21 +343,52 @@ def _format_upload_parameters(self, name, **kwargs):
333343

334344
return params
335345

346+
def _extract_ocr_options(self, kwargs):
347+
"""
348+
Extract and validate OCR options from kwargs.
349+
350+
Returns:
351+
force_ocr (bool)
352+
ocr_engine (str or None)
353+
"""
354+
force_ocr = kwargs.pop("force_ocr", False)
355+
ocr_engine = kwargs.pop("ocr_engine", "tess4")
356+
357+
if ocr_engine and ocr_engine not in ("tess4", "textract"):
358+
raise ValueError(
359+
"ocr_engine must be either 'tess4' for tesseract or 'textract'"
360+
)
361+
362+
return force_ocr, ocr_engine
363+
336364
def _get_title(self, name):
337365
"""Get the default title for a document from its path"""
338366
return name.split(os.sep)[-1].rsplit(".", 1)[0]
339367

340368
def _upload_url(self, file_url, **kwargs):
    """Upload a document from a publicly accessible URL.

    Args:
        file_url (str): URL the document should be fetched from.
        **kwargs: upload options.  ``force_ocr`` and ``ocr_engine`` are
            validated by ``_extract_ocr_options``; the remaining options
            are handed to ``_format_upload_parameters``.

    Returns:
        Document: wrapper around the JSON returned by the create request.

    Raises:
        ValueError: if ``ocr_engine`` is not an accepted engine name.
    """
    # Validate the OCR options first so a bad engine fails before any
    # request is made.  This also removes them from kwargs.
    force_ocr, ocr_engine = self._extract_ocr_options(kwargs)

    params = self._format_upload_parameters(file_url, **kwargs)
    params["file_url"] = file_url

    # NOTE(review): per the DocumentCloud API docs, a document created with
    # ``file_url`` is fetched and processed by the server without a separate
    # process call, so the OCR options must travel with the create request.
    # A follow-up process request would hit a document that is already
    # processing, and an ocr_engine given without force_ocr would be lost.
    params["force_ocr"] = force_ocr
    if ocr_engine:
        params["ocr_engine"] = ocr_engine

    response = self.client.post("documents/", json=params)
    return Document(self.client, response.json())
346386

347387
def _upload_file(self, file_, **kwargs):
348388
"""Upload a document directly"""
349389
# create the document
350-
force_ocr = kwargs.pop("force_ocr", False)
390+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
391+
351392
params = self._format_upload_parameters(file_.name, **kwargs)
352393
response = self.client.post("documents/", json=params)
353394

@@ -357,12 +398,12 @@ def _upload_file(self, file_, **kwargs):
357398
response = requests_retry_session().put(presigned_url, data=file_.read())
358399

359400
# begin processing the document
360-
doc_id = create_json["id"]
361-
response = self.client.post(
362-
f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
363-
)
401+
doc = Document(self.client, create_json)
364402

365-
return Document(self.client, create_json)
403+
# begin processing
404+
doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
405+
406+
return doc
366407

367408
def _collect_files(self, path, extensions):
368409
"""Find the paths to files with specified extensions under a directory"""
@@ -410,7 +451,9 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
410451
# Upload all the files using the bulk API to reduce the number
411452
# of API calls and improve performance
412453
obj_list = []
454+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
413455
params = self._format_upload_parameters("", **kwargs)
456+
414457
for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
415458
# Grouper will put None's on the end of the last group
416459
file_paths = [p for p in file_paths if p is not None]
@@ -471,9 +514,13 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
471514

472515
# Begin processing the documents
473516
logger.info("Processing the documents...")
474-
doc_ids = [j["id"] for j in create_json]
517+
process_payload = [
518+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
519+
for j in create_json
520+
]
521+
475522
try:
476-
response = self.client.post("documents/process/", json={"ids": doc_ids})
523+
response = self.client.post("documents/process/", json=process_payload)
477524
except (APIError, RequestException) as exc:
478525
if handle_errors:
479526
logger.info(
@@ -484,7 +531,7 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
484531
continue
485532
else:
486533
raise
487-
534+
logger.info("Process payload: %s", process_payload)
488535
logger.info("Upload directory complete")
489536

490537
# Pass back the list of documents
@@ -496,6 +543,8 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
496543
# Do not set the same title for all documents
497544
kwargs.pop("title", None)
498545

546+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
547+
499548
obj_list = []
500549
params = self._format_upload_parameters("", **kwargs)
501550
for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
@@ -534,6 +583,25 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
534583
create_json = response.json()
535584
obj_list.extend(create_json)
536585

586+
# Begin bulk processing if needed
587+
if force_ocr:
588+
process_payload = [
589+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
590+
for j in create_json
591+
]
592+
try:
593+
self.client.post("documents/process/", json=process_payload)
594+
except (APIError, RequestException) as exc:
595+
if handle_errors:
596+
logger.info(
597+
"Error processing the following documents: %s\n%s",
598+
exc,
599+
"\n".join(url_group),
600+
)
601+
continue
602+
else:
603+
raise
604+
537605
logger.info("Upload URLs complete")
538606

539607
# Pass back the list of documents

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
setup(
99
name="python-documentcloud",
10-
version="4.4.1",
10+
version="4.5.0",
1111
description="A simple Python wrapper for the DocumentCloud API",
1212
author="Mitchell Kotler",
1313
author_email="mitch@muckrock.com",

0 commit comments

Comments
 (0)