generated from MuckRock/documentcloud-hello-world-addon
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8029bb9
commit 727d22d
Showing
1 changed file
with
39 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,49 @@ | ||
""" | ||
This is a hello world add-on for DocumentCloud. | ||
It demonstrates how to write an add-on which can be activated from the | ||
DocumentCloud add-on system and run using Github Actions. It receives data | ||
from DocumentCloud via the request dispatch and writes data back to | ||
DocumentCloud using the standard API | ||
Run the OCR Add-On on a project of documents on a schedule. | ||
""" | ||
|
||
from itertools import islice | ||
from documentcloud.addon import AddOn | ||
|
||
|
||
class HelloWorld(AddOn): | ||
class Scheduler(AddOn): | ||
"""An example Add-On for DocumentCloud.""" | ||
|
||
def main(self): | ||
"""The main add-on functionality goes here.""" | ||
# fetch your add-on specific data | ||
name = self.data.get("name", "world") | ||
|
||
self.set_message("Hello World start!") | ||
|
||
# add a hello note to the first page of each selected document | ||
for document in self.get_documents(): | ||
# get_documents will iterate through all documents efficiently, | ||
# either selected or by query, depending on which is passed in | ||
document.annotations.create(f"Hello {name}!", 0) | ||
|
||
with open("hello.txt", "w+") as file_: | ||
file_.write("Hello world!") | ||
self.upload_file(file_) | ||
|
||
self.set_message("Hello World end!") | ||
self.send_mail("Hello World!", "We finished!") | ||
|
||
""" Runs the selected OCR engine on a batch of documents """ | ||
batch_size = self.data.get("batch_size") | ||
project_id = self.data.get("project_id") | ||
to_tag = self.data.get("to_tag", False) | ||
ocr_engine = self.data.get("ocr_engine") | ||
batch_num = 1 | ||
|
||
if ocr_engine == "azure": | ||
run_id = 544 | ||
if ocr_engine == "google": | ||
run_id = 542 | ||
if ocr_engine == "doctr": | ||
run_id = 549 | ||
|
||
documents = self.client.documents.search( | ||
f"+project:{project_id} -data_ocr_engine:*" | ||
) | ||
|
||
for i in range(batch_num): | ||
# Pull out the IDs for a batch of the documents | ||
doc_ids = [ | ||
d.id for d in islice(documents, i * batch_size, (i + 1) * batch_size) | ||
] | ||
|
||
self.client.post( | ||
"addon_runs/", | ||
json={ | ||
"addon": run_id, | ||
"parameters": { | ||
"to_tag": to_tag | ||
}, | ||
"documents": doc_ids, | ||
"dismissed": True, | ||
}, | ||
) | ||
|
||
if __name__ == "__main__": | ||
HelloWorld().main() | ||
Scheduler().main() |