Skip to content

Commit

Permalink
First pass at scheduler
Browse files Browse the repository at this point in the history
  • Loading branch information
duckduckgrayduck committed Mar 5, 2024
1 parent 8029bb9 commit 727d22d
Showing 1 changed file with 39 additions and 28 deletions.
67 changes: 39 additions & 28 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,49 @@
"""
This is a hello world add-on for DocumentCloud.
It demonstrates how to write an add-on which can be activated from the
DocumentCloud add-on system and run using GitHub Actions. It receives data
from DocumentCloud via the request dispatch and writes data back to
DocumentCloud using the standard API.
Run an OCR Add-On on a schedule over the documents in a project.
"""

from itertools import islice
from documentcloud.addon import AddOn


class Scheduler(AddOn):
    """Add-On that queues an OCR Add-On run for a batch of project documents."""

    # Value sent as "addon" in the add-on run request for each supported
    # ``ocr_engine`` choice (presumably the IDs of the corresponding OCR
    # Add-Ons on DocumentCloud -- verify before changing).
    OCR_ADDON_IDS = {"azure": 544, "google": 542, "doctr": 549}

    # Number of batches submitted per invocation of ``main``.
    BATCHES_PER_RUN = 1

    def _fail(self, message):
        """Show *message* on the run, then abort by raising ``ValueError``."""
        self.set_message(message)
        raise ValueError(message)

    def main(self):
        """Run the selected OCR engine on a batch of documents.

        Reads ``batch_size``, ``project_id``, ``to_tag`` (default ``False``)
        and ``ocr_engine`` from ``self.data``, searches for documents matching
        ``+project:<project_id> -data_ocr_engine:*`` and posts an add-on run
        for the first ``batch_size`` results.

        Raises:
            ValueError: if ``ocr_engine`` is not one of the supported engines,
                if ``project_id`` is missing, or if ``batch_size`` is missing
                or not a positive integer.
        """
        project_id = self.data.get("project_id")
        to_tag = self.data.get("to_tag", False)
        ocr_engine = self.data.get("ocr_engine")

        # Resolve the engine up front so an unknown value fails with a clear
        # message instead of an unbound variable when the request is built.
        addon_id = self.OCR_ADDON_IDS.get(ocr_engine)
        if addon_id is None:
            supported = ", ".join(sorted(self.OCR_ADDON_IDS))
            self._fail(
                f"Unsupported ocr_engine {ocr_engine!r}; expected one of: "
                f"{supported}"
            )

        # Without a project the search below would query "+project:None".
        if project_id is None:
            self._fail("project_id is required")

        # batch_size has no default; reject missing or non-numeric values here
        # rather than letting them surface as a TypeError while slicing.
        try:
            batch_size = int(self.data.get("batch_size"))
        except (TypeError, ValueError):
            batch_size = 0
        if batch_size < 1:
            self._fail("batch_size must be a positive integer")

        # NOTE(review): "-data_ocr_engine:*" presumably excludes documents
        # that already carry an ``ocr_engine`` data key -- confirm against the
        # DocumentCloud search syntax.
        documents = self.client.documents.search(
            f"+project:{project_id} -data_ocr_engine:*"
        )

        # Share one iterator across batches so each batch continues where the
        # previous one stopped, whether or not the result set is re-iterable.
        remaining = iter(documents)

        for _ in range(self.BATCHES_PER_RUN):
            # Pull out the IDs for a batch of the documents
            doc_ids = [doc.id for doc in islice(remaining, batch_size)]

            if not doc_ids:
                # Do not queue an add-on run that has nothing to process.
                self.set_message(
                    "No more documents matched the search; nothing queued."
                )
                break

            self.client.post(
                "addon_runs/",
                json={
                    "addon": addon_id,
                    "parameters": {"to_tag": to_tag},
                    "documents": doc_ids,
                    "dismissed": True,
                },
            )


if __name__ == "__main__":
    Scheduler().main()

0 comments on commit 727d22d

Please sign in to comment.