From 8063543a504c9606e9d0435d2a2ec215b6ac1d4e Mon Sep 17 00:00:00 2001 From: THUAUD Simon Date: Sat, 31 Jan 2026 18:28:51 +0100 Subject: [PATCH 1/2] feat: more cli stuff --- src/harvestor/cli/__init__.py | 0 src/harvestor/cli/main.py | 99 +++++++++++++++++++++++++++++++++++ uv.lock | 56 -------------------- 3 files changed, 99 insertions(+), 56 deletions(-) create mode 100644 src/harvestor/cli/__init__.py create mode 100644 src/harvestor/cli/main.py diff --git a/src/harvestor/cli/__init__.py b/src/harvestor/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/harvestor/cli/main.py b/src/harvestor/cli/main.py new file mode 100644 index 0000000..139f351 --- /dev/null +++ b/src/harvestor/cli/main.py @@ -0,0 +1,99 @@ +""" +CLI to use the tool in the command line. +""" + +import argparse +import json +import sys +from pathlib import Path + + +def build_parser(): + parser = argparse.ArgumentParser( + prog="harvestor", + description="Extract structured data from documents using AI", + ) + + parser.add_argument( + "file_path", + type=Path, + help="Path to the document to process", + ) + parser.add_argument( + "schema", + help="Schema to use (e.g., InvoiceData, ReceiptData)", + ) + parser.add_argument( + "-m", + "--model", + default="Claude Haiku 3", + help="Model to use (default: Claude Haiku 3)", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + help="Output file path (default: stdout)", + ) + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty print JSON output", + ) + + return parser + + +def get_schema(schema_name: str): + """Resolve schema name to actual schema class.""" + from harvestor.schemas.defaults import InvoiceData, ReceiptData + + schemas = { + "InvoiceData": InvoiceData, + "ReceiptData": ReceiptData, + } + + if schema_name not in schemas: + available = ", ".join(schemas.keys()) + raise ValueError(f"Unknown schema: {schema_name}. Available: {available}") + + return schemas[schema_name] + + +def main(): + parser = build_parser() + args = parser.parse_args() + + if not args.file_path.exists(): + print(f"Error: File not found: {args.file_path}", file=sys.stderr) + sys.exit(1) + + try: + schema = get_schema(args.schema) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + from harvestor import harvest + + result = harvest( + source=args.file_path, + schema=schema, + model=args.model, + ) + + if not result.success: + print(f"Error: {result.error}", file=sys.stderr) + sys.exit(1) + + indent = 2 if args.pretty else None + output = json.dumps(result.data, indent=indent, default=str) + + if args.output: + args.output.write_text(output) + else: + print(output) + + +if __name__ == "__main__": + main() diff --git a/uv.lock b/uv.lock index 68cddb5..52baabd 100644 --- a/uv.lock +++ b/uv.lock @@ -41,28 +41,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592 }, ] -[[package]] -name = "black" -version = "26.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "mypy-extensions" }, - { name = "packaging" }, - { name = "pathspec" }, - { name = "platformdirs" }, - { name = "pytokens" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/13/88/560b11e521c522440af991d46848a2bde64b5f7202ec14e1f46f9509d328/black-26.1.0.tar.gz", hash = "sha256:d294ac3340eef9c9eb5d29288e96dc719ff269a88e27b396340459dd85da4c58", size = 658785 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/04/fa2f4784f7237279332aa735cdfd5ae2e7730db0072fb2041dadda9ae551/black-26.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ba1d768fbfb6930fc93b0ecc32a43d8861ded16f47a40f14afa9bb04ab93d304", size = 1877781 }, - { url = "https://files.pythonhosted.org/packages/cf/ad/5a131b01acc0e5336740a039628c0ab69d60cf09a2c87a4ec49f5826acda/black-26.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2b807c240b64609cb0e80d2200a35b23c7df82259f80bef1b2c96eb422b4aac9", size = 1699670 }, - { url = "https://files.pythonhosted.org/packages/da/7c/b05f22964316a52ab6b4265bcd52c0ad2c30d7ca6bd3d0637e438fc32d6e/black-26.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1de0f7d01cc894066a1153b738145b194414cc6eeaad8ef4397ac9abacf40f6b", size = 1775212 }, - { url = "https://files.pythonhosted.org/packages/a6/a3/e8d1526bea0446e040193185353920a9506eab60a7d8beb062029129c7d2/black-26.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:91a68ae46bf07868963671e4d05611b179c2313301bd756a89ad4e3b3db2325b", size = 1409953 }, - { url = "https://files.pythonhosted.org/packages/c7/5a/d62ebf4d8f5e3a1daa54adaab94c107b57be1b1a2f115a0249b41931e188/black-26.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:be5e2fe860b9bd9edbf676d5b60a9282994c03fbbd40fe8f5e75d194f96064ca", size = 1217707 }, - { url = "https://files.pythonhosted.org/packages/e4/3d/51bdb3ecbfadfaf825ec0c75e1de6077422b4afa2091c6c9ba34fbfc0c2d/black-26.1.0-py3-none-any.whl", hash = "sha256:1054e8e47ebd686e078c0bb0eaf31e6ce69c966058d122f2c0c950311f9f3ede", size = 204010 }, -] - [[package]] name = "certifi" version = "2026.1.4" @@ -312,7 +290,6 @@ dependencies = [ [package.optional-dependencies] dev = [ - { name = "black" }, { name = "pytest" }, { name = "pytest-cov" }, { name = "pytest-mock" }, @@ -327,7 +304,6 @@ dev = [ [package.metadata] requires-dist = [ { name = "anthropic", specifier = ">=0.18.0" }, - { name = "black", marker = "extra == 'dev'", specifier = ">=23.12.0" }, { name = "click", specifier = ">=8.1.0" }, { name = "langchain", specifier = ">=0.1.0" }, { name = "langchain-anthropic", specifier = ">=0.1.0" }, @@ -620,15 +596,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, ] -[[package]] -name = "mypy-extensions" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963 }, -] - [[package]] name = "nodeenv" version = "1.10.0" @@ -752,15 +719,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 }, ] -[[package]] -name = "pathspec" -version = "1.0.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206 }, -] - [[package]] name = "pdfminer-six" version = "20251230" @@ -1029,20 +987,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230 }, ] -[[package]] -name = "pytokens" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e5/16/4b9cfd90d55e66ffdb277d7ebe3bc25250c2311336ec3fc73b2673c794d5/pytokens-0.4.0.tar.gz", hash = "sha256:6b0b03e6ea7c9f9d47c5c61164b69ad30f4f0d70a5d9fe7eac4d19f24f77af2d", size = 15039 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/63/627b7e71d557383da5a97f473ad50f8d9c2c1f55c7d3c2531a120c796f6e/pytokens-0.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:73eff3bdd8ad08da679867992782568db0529b887bed4c85694f84cdf35eafc6", size = 159744 }, - { url = "https://files.pythonhosted.org/packages/28/d7/16f434c37ec3824eba6bcb6e798e5381a8dc83af7a1eda0f95c16fe3ade5/pytokens-0.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d97cc1f91b1a8e8ebccf31c367f28225699bea26592df27141deade771ed0afb", size = 253207 }, - { url = "https://files.pythonhosted.org/packages/ab/96/04102856b9527701ae57d74a6393d1aca5bad18a1b1ca48ccffb3c93b392/pytokens-0.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a2c8952c537cb73a1a74369501a83b7f9d208c3cf92c41dd88a17814e68d48ce", size = 267452 }, - { url = "https://files.pythonhosted.org/packages/0e/ef/0936eb472b89ab2d2c2c24bb81c50417e803fa89c731930d9fb01176fe9f/pytokens-0.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5dbf56f3c748aed9310b310d5b8b14e2c96d3ad682ad5a943f381bdbbdddf753", size = 265965 }, - { url = "https://files.pythonhosted.org/packages/ae/f5/64f3d6f7df4a9e92ebda35ee85061f6260e16eac82df9396020eebbca775/pytokens-0.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:e131804513597f2dff2b18f9911d9b6276e21ef3699abeffc1c087c65a3d975e", size = 102813 }, - { url = "https://files.pythonhosted.org/packages/7c/3c/6941a82f4f130af6e1c68c076b6789069ef10c04559bd4733650f902fd3b/pytokens-0.4.0-py3-none-any.whl", hash = "sha256:0508d11b4de157ee12063901603be87fb0253e8f4cb9305eb168b1202ab92068", size = 13224 }, -] - [[package]] name = "pyyaml" version = "6.0.3" From 596b89f7d7b8d675def0f136a5cbd8695272ea2d Mon Sep 17 00:00:00 2001 From: THUAUD Simon Date: Sun, 1 Feb 2026 12:08:55 +0100 Subject: [PATCH 2/2] format and lint --- src/harvestor/cli/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/harvestor/cli/main.py b/src/harvestor/cli/main.py index 139f351..68a8f13 100644 --- a/src/harvestor/cli/main.py +++ b/src/harvestor/cli/main.py @@ -7,6 +7,9 @@ import sys from pathlib import Path +from harvestor import harvest +from harvestor.schemas.defaults import InvoiceData, ReceiptData + def build_parser(): parser = argparse.ArgumentParser( @@ -46,7 +49,6 @@ def build_parser(): def get_schema(schema_name: str): """Resolve schema name to actual schema class.""" - from harvestor.schemas.defaults import InvoiceData, ReceiptData schemas = { "InvoiceData": InvoiceData, @@ -74,8 +76,6 @@ def main(): print(f"Error: {e}", file=sys.stderr) sys.exit(1) - from harvestor import harvest - result = harvest( source=args.file_path, schema=schema,