First pass at scheduler

duckduckgrayduck · Mar 5, 2024 · 727d22d · 727d22d
1 parent 8029bb9
commit 727d22d
Showing 1 changed file with 39 additions and 28 deletions.
diff --git a/main.py b/main.py
@@ -1,38 +1,49 @@
 """
-This is a hello world add-on for DocumentCloud.
-
-It demonstrates how to write a add-on which can be activated from the
-DocumentCloud add-on system and run using Github Actions.  It receives data
-from DocumentCloud via the request dispatch and writes data back to
-DocumentCloud using the standard API
+Run an OCR Add-On on batches of a project's documents on a schedule.
 """
-
+from itertools import islice
 from documentcloud.addon import AddOn
 
 
-class HelloWorld(AddOn):
+class Scheduler(AddOn):
     """An example Add-On for DocumentCloud."""
 
     def main(self):
-        """The main add-on functionality goes here."""
-        # fetch your add-on specific data
-        name = self.data.get("name", "world")
-
-        self.set_message("Hello World start!")
-
-        # add a hello note to the first page of each selected document
-        for document in self.get_documents():
-            # get_documents will iterate through all documents efficiently,
-            # either selected or by query, dependeing on which is passed in
-            document.annotations.create(f"Hello {name}!", 0)
-
-        with open("hello.txt", "w+") as file_:
-            file_.write("Hello world!")
-            self.upload_file(file_)
-
-        self.set_message("Hello World end!")
-        self.send_mail("Hello World!", "We finished!")
-
+        """Dispatch an OCR Add-On run for one batch of un-OCR'd documents."""
+        # Add-On IDs posted as "addon" for each supported ocr_engine value.
+        ocr_addon_ids = {"azure": 544, "google": 542, "doctr": 549}
+        batch_size = self.data.get("batch_size")
+        project_id = self.data.get("project_id")
+        to_tag = self.data.get("to_tag", False)
+        ocr_engine = self.data.get("ocr_engine")
+        batch_num = 1
+
+        # Fail with a clear error instead of an unbound run_id further down.
+        if ocr_engine not in ocr_addon_ids:
+            raise ValueError(f"Unsupported ocr_engine: {ocr_engine!r}")
+        # islice needs a positive integer count; a missing value is None.
+        if not isinstance(batch_size, int) or batch_size < 1:
+            raise ValueError(f"Invalid batch_size: {batch_size!r}")
+        run_id = ocr_addon_ids[ocr_engine]
+
+        # One shared iterator, so each batch resumes where the last stopped.
+        query = f"+project:{project_id} -data_ocr_engine:*"
+        documents = iter(self.client.documents.search(query))
+
+        for _ in range(batch_num):
+            # Pull out the IDs for the next batch of the documents
+            doc_ids = [d.id for d in islice(documents, batch_size)]
+            if not doc_ids:
+                break  # nothing left to OCR; do not post an empty run
+            self.client.post(
+                "addon_runs/",
+                json={
+                    "addon": run_id,
+                    "parameters": {"to_tag": to_tag},
+                    "documents": doc_ids,
+                    "dismissed": True,
+                },
+            )
 
 if __name__ == "__main__":
-    HelloWorld().main()
+    Scheduler().main()