First pass at handling force_ocr and ocr_engine in upload options

duckduckgrayduck · duckduckgrayduck · commit ce7320ec059e · 2025-09-22T14:38:20.000-05:00
diff --git a/docs/conf.py b/docs/conf.py
@@ -48,16 +48,16 @@
 
 # General information about the project.
 project = "documentcloud"
-copyright = "2023, MuckRock Foundation"
+copyright = "2025, MuckRock Foundation"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = "4.3"
+version = "4.5"
 # The full version, including alpha/beta/rc tags.
-release = "4.3.0"
+release = "4.5.0"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/documentcloud/documents.py b/documentcloud/documents.py
@@ -74,8 +74,11 @@ def __str__(self):
     def __getattr__(self, attr):
         """Generate methods for fetching resources"""
         p_image = re.compile(
-            r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
+            r"^get_"
+            r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
+            r"(?P<list>_list)?$"
         )
+
         get = attr.startswith("get_")
         url = attr.endswith("_url")
         text = attr.endswith("_text")
@@ -230,9 +233,15 @@ def get_errors(self):
 
         return all_results
 
-    def process(self):
-        """Reprocess the document"""
-        self._client.post(f"{self.api_path}/{self.id}/process/")
+    def process(self, **kwargs):
+        """Process the document, used on upload and for reprocessing"""
+        payload = {}
+        if "force_ocr" in kwargs:
+            payload["force_ocr"] = kwargs["force_ocr"]
+        if "ocr_engine" in kwargs:
+            payload["ocr_engine"] = kwargs["ocr_engine"]
+
+        self._client.post(f"{self.api_path}/{self.id}/process/", json=payload or None)
 
 
 class DocumentClient(BaseAPIClient):
@@ -310,6 +319,7 @@ def _format_upload_parameters(self, name, **kwargs):
             "title",
             "data",
             "force_ocr",
+            "ocr_engine",
             "projects",
             "delayed_index",
             "revision_control",
@@ -333,21 +343,52 @@ def _format_upload_parameters(self, name, **kwargs):
 
         return params
 
+    def _extract_ocr_options(self, kwargs):
+        """
+        Extract and validate OCR options from kwargs.
+
+        Returns:
+            force_ocr (bool)
+            ocr_engine (str or None)
+        """
+        force_ocr = kwargs.pop("force_ocr", False)
+        ocr_engine = kwargs.pop("ocr_engine", "tess4")
+
+        if ocr_engine and ocr_engine not in ("tess4", "textract"):
+            raise ValueError(
+                "ocr_engine must be either 'tess4' for tesseract or 'textract'"
+            )
+
+        return force_ocr, ocr_engine
+
     def _get_title(self, name):
         """Get the default title for a document from its path"""
         return name.split(os.sep)[-1].rsplit(".", 1)[0]
 
     def _upload_url(self, file_url, **kwargs):
         """Upload a document from a publicly accessible URL"""
+        # extract process-related args
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
+
+        # create the document
         params = self._format_upload_parameters(file_url, **kwargs)
         params["file_url"] = file_url
         response = self.client.post("documents/", json=params)
-        return Document(self.client, response.json())
+        create_json = response.json()
+
+        # wrap in Document object
+        doc = Document(self.client, create_json)
+
+        # begin processing if needed
+        if force_ocr:
+            doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
+        return doc
 
     def _upload_file(self, file_, **kwargs):
         """Upload a document directly"""
         # create the document
-        force_ocr = kwargs.pop("force_ocr", False)
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
+
         params = self._format_upload_parameters(file_.name, **kwargs)
         response = self.client.post("documents/", json=params)
 
@@ -357,12 +398,12 @@ def _upload_file(self, file_, **kwargs):
         response = requests_retry_session().put(presigned_url, data=file_.read())
 
         # begin processing the document
-        doc_id = create_json["id"]
-        response = self.client.post(
-            f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
-        )
+        doc = Document(self.client, create_json)
 
-        return Document(self.client, create_json)
+        # begin processing
+        doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
+
+        return doc
 
     def _collect_files(self, path, extensions):
         """Find the paths to files with specified extensions under a directory"""
@@ -410,7 +451,9 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
         # Upload all the files using the bulk API to reduce the number
         # of API calls and improve performance
         obj_list = []
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         params = self._format_upload_parameters("", **kwargs)
+
         for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
             # Grouper will put None's on the end of the last group
             file_paths = [p for p in file_paths if p is not None]
@@ -471,9 +514,13 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
 
             # Begin processing the documents
             logger.info("Processing the documents...")
-            doc_ids = [j["id"] for j in create_json]
+            process_payload = [
+                {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
+                for j in create_json
+            ]
+
             try:
-                response = self.client.post("documents/process/", json={"ids": doc_ids})
+                response = self.client.post("documents/process/", json=process_payload)
             except (APIError, RequestException) as exc:
                 if handle_errors:
                     logger.info(
@@ -484,7 +531,7 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
                     continue
                 else:
                     raise
-
+            logger.info("Process payload: %s", process_payload)
         logger.info("Upload directory complete")
 
         # Pass back the list of documents
@@ -496,6 +543,8 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
         # Do not set the same title for all documents
         kwargs.pop("title", None)
 
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
+
         obj_list = []
         params = self._format_upload_parameters("", **kwargs)
         for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
@@ -534,6 +583,25 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
             create_json = response.json()
             obj_list.extend(create_json)
 
+            # Begin bulk processing if needed
+            if force_ocr:
+                process_payload = [
+                    {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
+                    for j in create_json
+                ]
+                try:
+                    self.client.post("documents/process/", json=process_payload)
+                except (APIError, RequestException) as exc:
+                    if handle_errors:
+                        logger.info(
+                            "Error processing the following documents: %s\n%s",
+                            exc,
+                            "\n".join(url_group),
+                        )
+                        continue
+                    else:
+                        raise
+
         logger.info("Upload URLs complete")
 
         # Pass back the list of documents
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="python-documentcloud",
-    version="4.4.1",
+    version="4.5.0",
     description="A simple Python wrapper for the DocumentCloud API",
     author="Mitchell Kotler",
     author_email="mitch@muckrock.com",