From 0022cbfcb2882b7e912308bfc5496e39b3cc1c46 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 24 Sep 2024 17:38:34 +0200 Subject: [PATCH 1/6] fix code samples --- examples/auto_invoice_splitter_extraction.py | 61 +++++++++++--------- examples/multi_receipts_tutorial.py | 3 +- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction.py b/examples/auto_invoice_splitter_extraction.py index 4a456a3a..655e665b 100644 --- a/examples/auto_invoice_splitter_extraction.py +++ b/examples/auto_invoice_splitter_extraction.py @@ -1,38 +1,47 @@ import os - from mindee import Client from mindee.extraction.pdf_extractor import PdfExtractor from mindee.input import PathInput from mindee.product import InvoiceSplitterV1, InvoiceV4 -api_key = os.getenv("MINDEE_API_KEY") -mindee_client = Client(api_key=api_key) - -input_path = "path/to/your/file.ext" -input_source = PathInput(input_path) +mindee_client = Client(api_key="my-api-key") +# mindee_client = Client() # Optionally, set from env. -if input_source.is_pdf(): - pdf_extractor = PdfExtractor(input_source) - if pdf_extractor.get_page_count() > 1: - invoice_splitter_response = mindee_client.enqueue_and_parse( - InvoiceSplitterV1, input_source, close_file=False - ) - page_groups = ( - invoice_splitter_response.document.inference.prediction.invoice_page_groups - ) - extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict=False) - for extracted_pdf in extracted_pdfs: - # Optional: Save the files locally - # extracted_pdf.write_to_file("output/path") +def parse_invoice(file_path): + input_source = PathInput(file_path) - invoice_result = mindee_client.parse( - InvoiceV4, extracted_pdf.as_input_source() - ) - print(invoice_result.document) + if input_source.is_pdf() and input_source.count_doc_pages() > 1: + parse_multi_page(input_source) else: - invoice_result = mindee_client.parse(InvoiceV4, input_source) - print(invoice_result.document) -else: + parse_single(input_source) + + +def parse_single(input_source): invoice_result = mindee_client.parse(InvoiceV4, input_source) print(invoice_result.document) + + +def parse_multi_page(input_source): + pdf_extractor = PdfExtractor(input_source) + invoice_splitter_response = mindee_client.enqueue_and_parse( + InvoiceSplitterV1, input_source, close_file=False + ) + page_groups = ( + invoice_splitter_response.document.inference.prediction.invoice_page_groups + ) + extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict=False) + + for extracted_pdf in extracted_pdfs: + # Optional: Save the files locally + # extracted_pdf.write_to_file("output/path") + + invoice_result = mindee_client.parse( + InvoiceV4, extracted_pdf.as_input_source() + ) + print(invoice_result.document) + + +if __name__ == "__main__": + input_path = "path/to/your/file.ext" + parse_invoice(input_path) diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py index 1bea52fd..9dcceaeb 100644 --- a/examples/multi_receipts_tutorial.py +++ b/examples/multi_receipts_tutorial.py @@ -3,8 +3,9 @@ extract_receipts, ) +api_key = "my-api-key" # Init a new client -mindee_client = Client() +mindee_client = Client(api_key) # Load a file from disk input_doc = mindee_client.source_from_path("path/to/your/file.ext") From 36b7e3c14704b81a539c94b8f5c2d5e2569b3ee5 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 25 Sep 2024 14:03:26 +0200 Subject: [PATCH 2/6] quickfixes to example scripts --- ...to_invoice_splitter_extraction_example.py} | 0 .../auto_multi_receipts_extraction_example.py | 26 +++++++++++++++++++ examples/multi_receipts_tutorial.py | 21 --------------- 3 files changed, 26 insertions(+), 21 deletions(-) rename examples/{auto_invoice_splitter_extraction.py => auto_invoice_splitter_extraction_example.py} (100%) create mode 100644 examples/auto_multi_receipts_extraction_example.py delete mode 100644 examples/multi_receipts_tutorial.py diff --git a/examples/auto_invoice_splitter_extraction.py b/examples/auto_invoice_splitter_extraction_example.py similarity index 100% rename from examples/auto_invoice_splitter_extraction.py rename to examples/auto_invoice_splitter_extraction_example.py diff --git a/examples/auto_multi_receipts_extraction_example.py b/examples/auto_multi_receipts_extraction_example.py new file mode 100644 index 00000000..044f2ad7 --- /dev/null +++ b/examples/auto_multi_receipts_extraction_example.py @@ -0,0 +1,26 @@ +from mindee import Client, PredictResponse, product +from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import ( + extract_receipts, +) + +mindee_client = Client(api_key="my-api-key") +# mindee_client = Client() # Optionally, set from env. + + +def parse_receipts(input_path): + input_doc = mindee_client.source_from_path(input_path) + result_split: PredictResponse = mindee_client.parse( + product.MultiReceiptsDetectorV1, input_doc, close_file=False + ) + + extracted_receipts = extract_receipts(input_doc, result_split.document.inference) + for receipt in extracted_receipts: + receipt_as_source = receipt.as_source() + # receipt.save_to_file(f"./{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt + result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) + print(result_receipt.document) + + +if __name__ == "__main__": + input_path = "path/to/your/file.ext" + parse_receipts(input_path) diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py deleted file mode 100644 index 9dcceaeb..00000000 --- a/examples/multi_receipts_tutorial.py +++ /dev/null @@ -1,21 +0,0 @@ -from mindee import Client, PredictResponse, product -from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import ( - extract_receipts, -) - -api_key = "my-api-key" -# Init a new client -mindee_client = Client(api_key) - -# Load a file from disk -input_doc = mindee_client.source_from_path("path/to/your/file.ext") -result_split: PredictResponse = mindee_client.parse( - product.MultiReceiptsDetectorV1, input_doc, close_file=False -) - -extracted_receipts = extract_receipts(input_doc, result_split.document.inference) -for receipt in extracted_receipts: - receipt_as_source = receipt.as_source() - # receipt.save_to_file(f"./{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt - result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) - print(result_receipt.document) From fc5502a0f2974281b81902f72e61376b361edb3f Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 25 Sep 2024 14:04:48 +0200 Subject: [PATCH 3/6] fix paths in examples --- examples/auto_invoice_splitter_extraction_example.py | 1 - examples/auto_multi_receipts_extraction_example.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py index 655e665b..b90bd379 100644 --- a/examples/auto_invoice_splitter_extraction_example.py +++ b/examples/auto_invoice_splitter_extraction_example.py @@ -43,5 +43,4 @@ def parse_multi_page(input_source): if __name__ == "__main__": - input_path = "path/to/your/file.ext" parse_invoice(input_path) diff --git a/examples/auto_multi_receipts_extraction_example.py b/examples/auto_multi_receipts_extraction_example.py index 044f2ad7..ef1dbcee 100644 --- a/examples/auto_multi_receipts_extraction_example.py +++ b/examples/auto_multi_receipts_extraction_example.py @@ -22,5 +22,4 @@ def parse_receipts(input_path): if __name__ == "__main__": - input_path = "path/to/your/file.ext" - parse_receipts(input_path) + parse_receipts("path/to/your/file.ext") From 21323f8b6baa0b8630af93b79aa363df231c8b78 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 25 Sep 2024 14:09:36 +0200 Subject: [PATCH 4/6] fix lint --- examples/auto_invoice_splitter_extraction_example.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py index b90bd379..45732236 100644 --- a/examples/auto_invoice_splitter_extraction_example.py +++ b/examples/auto_invoice_splitter_extraction_example.py @@ -1,4 +1,5 @@ import os + from mindee import Client from mindee.extraction.pdf_extractor import PdfExtractor from mindee.input import PathInput @@ -36,9 +37,7 @@ def parse_multi_page(input_source): # Optional: Save the files locally # extracted_pdf.write_to_file("output/path") - invoice_result = mindee_client.parse( - InvoiceV4, extracted_pdf.as_input_source() - ) + invoice_result = mindee_client.parse(InvoiceV4, extracted_pdf.as_input_source()) print(invoice_result.document) From 613e4be023634ea49bac9e5b89ffca11766a19fe Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 25 Sep 2024 14:38:02 +0200 Subject: [PATCH 5/6] fix lingering issues --- ...uto_invoice_splitter_extraction_example.py | 6 ++--- .../auto_multi_receipts_extraction_example.py | 26 ++++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py index 45732236..a4fc3c22 100644 --- a/examples/auto_invoice_splitter_extraction_example.py +++ b/examples/auto_invoice_splitter_extraction_example.py @@ -1,12 +1,10 @@ -import os - from mindee import Client from mindee.extraction.pdf_extractor import PdfExtractor from mindee.input import PathInput from mindee.product import InvoiceSplitterV1, InvoiceV4 mindee_client = Client(api_key="my-api-key") -# mindee_client = Client() # Optionally, set from env. +# mindee_client = Client() # Optionally, set from env. def parse_invoice(file_path): @@ -42,4 +40,4 @@ def parse_multi_page(input_source): if __name__ == "__main__": - parse_invoice(input_path) + parse_invoice("path/to/my/file.ext") diff --git a/examples/auto_multi_receipts_extraction_example.py b/examples/auto_multi_receipts_extraction_example.py index ef1dbcee..2d24ea44 100644 --- a/examples/auto_multi_receipts_extraction_example.py +++ b/examples/auto_multi_receipts_extraction_example.py @@ -1,25 +1,33 @@ -from mindee import Client, PredictResponse, product +import os + +from mindee import Client, product from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import ( extract_receipts, ) -mindee_client = Client(api_key="my-api-key") -# mindee_client = Client() # Optionally, set from env. - def parse_receipts(input_path): + mindee_client = Client(api_key="my-api-key-here") + # mindee_client = Client() # Optionally, set from env. input_doc = mindee_client.source_from_path(input_path) - result_split: PredictResponse = mindee_client.parse( + + result_split = mindee_client.parse( product.MultiReceiptsDetectorV1, input_doc, close_file=False ) extracted_receipts = extract_receipts(input_doc, result_split.document.inference) - for receipt in extracted_receipts: - receipt_as_source = receipt.as_source() - # receipt.save_to_file(f"./{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt + + for idx, receipt in enumerate(extracted_receipts, 1): result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) + print(f"Receipt {idx}:") print(result_receipt.document) + print("-" * 40) + + # Uncomment to save each extracted receipt + # save_path = f"./receipt_{idx}.pdf" + # receipt.save_to_file(save_path) if __name__ == "__main__": - parse_receipts("path/to/your/file.ext") + input_file = "path/to/my/file.ext" + parse_receipts(input_file) From c4525c335a5a1dca511c68ded5297b4499125ddf Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 26 Sep 2024 11:47:57 +0200 Subject: [PATCH 6/6] change syntax to fit other sdks syntax --- examples/auto_invoice_splitter_extraction_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py index a4fc3c22..69267c4b 100644 --- a/examples/auto_invoice_splitter_extraction_example.py +++ b/examples/auto_invoice_splitter_extraction_example.py @@ -13,10 +13,10 @@ def parse_invoice(file_path): if input_source.is_pdf() and input_source.count_doc_pages() > 1: parse_multi_page(input_source) else: - parse_single(input_source) + parse_single_page(input_source) -def parse_single(input_source): +def parse_single_page(input_source): invoice_result = mindee_client.parse(InvoiceV4, input_source) print(invoice_result.document)