Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,25 +1,3 @@
# LLM API Keys
ANTHROPIC_API_KEY=sk-ant-your-key-here
OPENAI_API_KEY=sk-your-key-here

# Database
DATABASE_URL=sqlite:///./data/harvestor.db

# Cost Limits
MAX_COST_PER_DOCUMENT=0.50
DAILY_COST_LIMIT=100.00

# Models
DEFAULT_EXTRACTION_MODEL=claude-haiku-4-5-20251001
DEFAULT_VALIDATION_MODEL=claude-sonnet-4-5-20250929

# OCR Settings
ENABLE_TESSERACT_PREPROCESSING=true
OCR_DPI=300
OCR_LANGUAGES=eng+fra+deu+spa

# Features
USE_LAYOUT_ANALYSIS=true
USE_TABLE_EXTRACTION=true
USE_KEYWORD_PROXIMITY=true
ENABLE_CACHING=true
24 changes: 15 additions & 9 deletions example.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from typing import Optional

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from harvestor import Harvestor # , harvest
import os
from harvestor import Harvestor, list_models

load_dotenv()


class SimpleInoviceModelSchema(BaseModel):
class SimpleInvoiceSchema(BaseModel):
"""
Implement the schema you want as output. Customise for each document types.
Implement the schema you want as output. Customize for each document type.
"""

vendor: Optional[str] = Field(None, description="The vendor name")
Expand All @@ -20,16 +20,22 @@ class SimpleInoviceModelSchema(BaseModel):
customer_lastname: Optional[str] = Field(None, description="The customer lastname")


ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# List available models
print("Available models:", list(list_models().keys()))

h = Harvestor(api_key=ANTHROPIC_API_KEY, model="Claude Haiku 3")
# Use default model (claude-haiku)
h = Harvestor(model="claude-haiku")

output = h.harvest_file(
source="data/uploads/keep_for_test.jpg", schema=SimpleInoviceModelSchema
source="data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
)

print(output.to_summary())

# output_2 = harvest("data/uploads/keep_for_test.jpg", schema=SimpleInoviceModelSchema)
# Alternative: use OpenAI
# h_openai = Harvestor(model="gpt-4o-mini")
# output = h_openai.harvest_file("invoice.jpg", schema=SimpleInvoiceSchema)

# print(output_2.to_summary())
# Alternative: use local Ollama (free)
# h_ollama = Harvestor(model="llava")
# output = h_ollama.harvest_file("invoice.jpg", schema=SimpleInvoiceSchema)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"langchain-openai>=0.0.5",
"anthropic>=0.18.0",
"openai>=1.10.0",
"httpx>=0.27.0", # For Ollama provider

# Document Processing
"PyMuPDF>=1.23.0",
Expand Down
4 changes: 0 additions & 4 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,5 @@ markers =
slow: Slow running tests
vision: Tests that use vision API

# Coverage options (if using pytest-cov)
# Uncomment when pytest-cov is installed
# addopts = --cov=src/harvestor --cov-report=term-missing --cov-report=html

# Minimum Python version
minversion = 3.10
28 changes: 28 additions & 0 deletions src/harvestor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Harvestor - Harvest intelligence from any document

Extract structured data from any document with AI-powered extraction.
Supports multiple LLM providers: Anthropic, OpenAI, and Ollama.
"""

__version__ = "0.1.0"
Expand All @@ -14,6 +15,20 @@
from .config import SUPPORTED_MODELS
from .core.cost_tracker import cost_tracker
from .core.harvestor import Harvestor, harvest
from .providers import (
DEFAULT_MODEL,
MODELS,
PROVIDERS,
AnthropicProvider,
BaseLLMProvider,
CompletionResult,
ModelInfo,
OllamaProvider,
OpenAIProvider,
get_provider,
list_models,
list_providers,
)
from .schemas.base import (
ExtractionResult,
ExtractionStrategy,
Expand All @@ -39,4 +54,17 @@
"LineItem",
# Config
"SUPPORTED_MODELS",
"MODELS",
"DEFAULT_MODEL",
# Providers
"PROVIDERS",
"BaseLLMProvider",
"CompletionResult",
"ModelInfo",
"AnthropicProvider",
"OpenAIProvider",
"OllamaProvider",
"get_provider",
"list_models",
"list_providers",
]
79 changes: 75 additions & 4 deletions src/harvestor/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from pathlib import Path

from harvestor import harvest
from harvestor import DEFAULT_MODEL, harvest, list_models
from harvestor.schemas.defaults import InvoiceData, ReceiptData


Expand All @@ -20,17 +20,19 @@ def build_parser():
parser.add_argument(
"file_path",
type=Path,
nargs="?",
help="Path to the document to process",
)
parser.add_argument(
"schema",
nargs="?",
help="Schema to use (e.g., InvoiceData, ReceiptData)",
)
parser.add_argument(
"-m",
"--model",
default="Claude Haiku 3",
help="Model to use (default: Claude Haiku 3)",
default=DEFAULT_MODEL,
help=f"Model to use (default: {DEFAULT_MODEL})",
)
parser.add_argument(
"-o",
Expand All @@ -43,13 +45,22 @@ def build_parser():
action="store_true",
help="Pretty print JSON output",
)
parser.add_argument(
"--list-models",
action="store_true",
help="List available models and exit",
)
parser.add_argument(
"--list-schemas",
action="store_true",
help="List available schemas and exit",
)

return parser


def get_schema(schema_name: str):
"""Resolve schema name to actual schema class."""

schemas = {
"InvoiceData": InvoiceData,
"ReceiptData": ReceiptData,
Expand All @@ -62,10 +73,70 @@ def get_schema(schema_name: str):
return schemas[schema_name]


def print_models():
    """Write the model catalogue to stdout, one section per provider.

    Models are read from ``list_models()``. Each entry is printed with its
    input cost ("free" when that cost is 0) and a " (vision)" tag when the
    entry reports ``supports_vision``. The default model name is printed last.
    """
    catalogue = list_models()

    # Bucket (name, info) pairs under their provider; entries without a
    # "provider" key are filed under "unknown".
    by_provider = {}
    for model_name, details in catalogue.items():
        owner = details.get("provider", "unknown")
        by_provider.setdefault(owner, []).append((model_name, details))

    print("\nAvailable models:")
    print("=" * 50)

    for owner, entries in sorted(by_provider.items()):
        print(f"\n{owner.upper()}:")
        for model_name, details in sorted(entries):
            if details.get("supports_vision"):
                vision = " (vision)"
            else:
                vision = ""
            cost = details.get("input_cost", 0)
            cost_str = "free" if cost == 0 else f"${cost}/M tokens"
            print(f" {model_name:<20} {cost_str}{vision}")

    print(f"\nDefault: {DEFAULT_MODEL}")
    print()


def print_schemas():
    """Write the built-in schema names to stdout with a one-line summary.

    The summary is the first line of the schema class docstring, or
    "No description" when the class has no docstring.
    """
    available = (
        ("InvoiceData", InvoiceData),
        ("ReceiptData", ReceiptData),
    )

    print("\nAvailable schemas:")
    print("=" * 50)

    for label, model in available:
        description = model.__doc__ or "No description"
        # Keep only the text before the first newline of the stripped docstring.
        first_line = description.strip().partition("\n")[0]
        print(f" {label}: {first_line}")

    print()


def main():
parser = build_parser()
args = parser.parse_args()

if args.list_models:
print_models()
sys.exit(0)

if args.list_schemas:
print_schemas()
sys.exit(0)

if not args.file_path:
parser.error("file_path is required")

if not args.schema:
parser.error("schema is required")

if not args.file_path.exists():
print(f"Error: File not found: {args.file_path}", file=sys.stderr)
sys.exit(1)
Expand Down
47 changes: 19 additions & 28 deletions src/harvestor/config.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,19 @@
SUPPORTED_MODELS = {
# Anthropic Claude
"Claude Haiku 3": {"id": "claude-3-haiku-20240307", "input": 0.25, "output": 1.25},
"Claude Haiku 4.5": {
"id": "claude-haiku-4-5-20251001",
"input": 1.0,
"output": 5.0,
},
"Claude Sonnet 3.7": {
"id": "claude-3-7-sonnet-20250219",
"input": 3.0,
"output": 15.0,
},
"Claude Sonnet 4.5": {
"id": "claude-sonnet-4-5-20250929",
"input": 3.0,
"output": 15.0,
},
"Claude Opus 4,5": {
"id": "claude-opus-4-5-20251101",
"input": 5.0,
"output": 25.0,
}, # very good stuff
# OpenAI TODO: check OpenAI models
# "gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
"gpt-4": {"input": 30.0, "output": 60.0},
# "gpt-4-turbo": {"input": 10.0, "output": 30.0},
}
"""
Configuration for Harvestor.

Model definitions are now managed in the providers module.
This file re-exports them for backwards compatibility.
"""

from .providers import DEFAULT_MODEL, MODELS, list_models, list_providers

# Backwards compatibility alias: SUPPORTED_MODELS is bound to the very same
# object as MODELS, so code that still imports the old name keeps working.
# NOTE(review): the alias preserves the name only. Pricing entries are read
# elsewhere via "input_cost"/"output_cost"; confirm no remaining caller
# indexes SUPPORTED_MODELS with a different key layout.
SUPPORTED_MODELS = MODELS

# Public names re-exported by this module.
__all__ = [
"MODELS",
"SUPPORTED_MODELS",
"DEFAULT_MODEL",
"list_models",
"list_providers",
]
33 changes: 13 additions & 20 deletions src/harvestor/core/cost_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from typing import Dict, List, Optional

from ..schemas.base import CostReport, ExtractionStrategy
from ..config import SUPPORTED_MODELS


@dataclass
Expand Down Expand Up @@ -107,14 +106,15 @@ def calculate_cost(
self, model: str, input_tokens: int, output_tokens: int
) -> float:
"""Calculate cost for a given API call."""
if model not in SUPPORTED_MODELS:
# Unknown model, use conservative estimate (GPT-4 pricing)
raise ModelNotSupported(f"Model {model} is not supported.")
else:
pricing = SUPPORTED_MODELS[model]
from ..providers import MODELS

input_cost = (input_tokens / 1_000_000) * pricing["input"]
output_cost = (output_tokens / 1_000_000) * pricing["output"]
if model not in MODELS:
# Unknown model (possibly Ollama custom), assume free
return 0.0

pricing = MODELS[model]
input_cost = (input_tokens / 1_000_000) * pricing["input_cost"]
output_cost = (output_tokens / 1_000_000) * pricing["output_cost"]

return input_cost + output_cost

Expand Down Expand Up @@ -263,19 +263,12 @@ def generate_report(self, days: int = 7) -> CostReport:

# Calculate costs
total_cost = sum(c.cost for c in recent_calls)
free_successes = 0
llm_calls = len(
[
c
for c in recent_calls
if c.strategy
in {
ExtractionStrategy.LLM_HAIKU,
ExtractionStrategy.LLM_SONNET,
ExtractionStrategy.LLM_GPT35,
}
]
free_successes = sum(
1
for c in recent_calls
if c.strategy == ExtractionStrategy.LLM_OLLAMA and c.success
)
llm_calls = len(recent_calls)

# Cost by strategy
cost_by_strategy: Dict[str, float] = {}
Expand Down
Loading