Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,21 @@ class SimpleInvoiceSchema(BaseModel):
print("Available models:", list(list_models().keys()))

# Use default model (claude-haiku)
# h = Harvestor(model="claude-haiku")
h = Harvestor(model="claude-haiku", validate=True)

# output = h.harvest_file(
# source="data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
# )

# print(output.to_summary())
output = h.harvest_file(
source="data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
)

print(output.to_summary())
print(output.validation)
# Alternative: use OpenAI
# h_openai = Harvestor(model="gpt-4o-mini")
# output = h_openai.harvest_file("data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema)

# Alternative: use local Ollama (free) or cloud Ollama
h_ollama = Harvestor(model="gemma3:4b-cloud")
output = h_ollama.harvest_file(
"data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
)
print(output.to_summary())
# h_ollama = Harvestor(model="gemma3:4b-cloud")
# output = h_ollama.harvest_file(
# "data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
# )
# print(output.to_summary())
6 changes: 6 additions & 0 deletions src/harvestor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
ValidationResult,
)
from .schemas.defaults import InvoiceData, LineItem, ReceiptData
from .validators import BaseValidationRule, RuleFinding, RuleSeverity, ValidationEngine

__all__ = [
"__version__",
Expand All @@ -48,6 +49,11 @@
"ExtractionStrategy",
"HarvestResult",
"ValidationResult",
# Validation
"ValidationEngine",
"BaseValidationRule",
"RuleFinding",
"RuleSeverity",
# Output schemas
"InvoiceData",
"ReceiptData",
Expand Down
24 changes: 23 additions & 1 deletion src/harvestor/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def build_parser():
action="store_true",
help="List available schemas and exit",
)
parser.add_argument(
"--validate",
action="store_true",
help="Run validation rules on extracted data",
)

return parser

Expand Down Expand Up @@ -151,14 +156,31 @@ def main():
source=args.file_path,
schema=schema,
model=args.model,
validate=args.validate,
)

if not result.success:
print(f"Error: {result.error}", file=sys.stderr)
sys.exit(1)

indent = 2 if args.pretty else None
output = json.dumps(result.data, indent=indent, default=str)

if result.validation:
full_output = {
"data": result.data,
"validation": {
"is_valid": result.validation.is_valid,
"confidence": result.validation.confidence,
"fraud_risk": result.validation.fraud_risk,
"errors": result.validation.errors,
"warnings": result.validation.warnings,
"fraud_reasons": result.validation.fraud_reasons,
"rules_checked": result.validation.rules_checked,
},
}
output = json.dumps(full_output, indent=indent, default=str)
else:
output = json.dumps(result.data, indent=indent, default=str)

if args.output:
args.output.write_text(output)
Expand Down
45 changes: 40 additions & 5 deletions src/harvestor/core/harvestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(
cost_limit_per_doc: float = 0.10,
daily_cost_limit: Optional[float] = None,
base_url: Optional[str] = None,
validate: bool = False,
validation_rules: Optional[List] = None,
):
"""
Initialize Harvestor.
Expand All @@ -48,6 +50,8 @@ def __init__(
cost_limit_per_doc: Maximum cost per document (default: $0.10)
daily_cost_limit: Optional daily cost limit
base_url: Optional base URL override for the provider
validate: Run validation rules on extracted data (default: False)
validation_rules: Custom validation rules (used with validate=True)
"""
self.model_name = model
self.api_key = api_key
Expand All @@ -61,6 +65,24 @@ def __init__(
# Initialize LLM parser (handles provider selection)
self.llm_parser = LLMParser(model=model, api_key=api_key, base_url=base_url)

# Initialize validation engine if enabled
self._validate = validate
self._validation_engine = None
if validate:
from ..validators import ValidationEngine

self._validation_engine = ValidationEngine(rules=validation_rules)

def _maybe_validate(
    self, result: HarvestResult, schema: Type[BaseModel]
) -> HarvestResult:
    """Run validation if enabled and extraction succeeded.

    Args:
        result: The extraction result to (optionally) validate. It is
            updated in place: ``result.validation`` is set to whatever the
            validation engine returns.
        schema: The Pydantic schema class that was used for extraction;
            forwarded to the engine unchanged.

    Returns:
        The same ``result`` object, with ``validation`` populated when
        validation ran and left untouched otherwise.
    """
    engine = self._validation_engine

    # Check the engine against None explicitly instead of relying on its
    # truthiness: an engine object that happens to be falsy (for example
    # one defining __len__ or __bool__) must still be used.
    if not self._validate or engine is None:
        return result

    # Failed extractions are returned as-is; they are never validated.
    if not result.success:
        return result

    result.validation = engine.validate(data=result.data, schema=schema)
    return result

@staticmethod
def get_doc_type_from_schema(schema: Type[BaseModel]) -> str:
"""
Expand Down Expand Up @@ -120,7 +142,7 @@ def harvest_text(

total_time = time.time() - start_time

return HarvestResult(
result = HarvestResult(
success=extraction_result.success,
document_id=document_id,
document_type=doc_type,
Expand All @@ -134,6 +156,7 @@ def harvest_text(
error=extraction_result.error,
language=language,
)
return self._maybe_validate(result, schema)

def harvest_file(
self,
Expand Down Expand Up @@ -349,7 +372,7 @@ def _harvest_image(

processing_time = time.time() - start_time

return HarvestResult(
result = HarvestResult(
success=extraction_result.success,
document_id=document_id,
document_type=doc_type,
Expand All @@ -363,6 +386,7 @@ def _harvest_image(
error=extraction_result.error,
language=language,
)
return self._maybe_validate(result, schema)

def harvest_batch(
self,
Expand Down Expand Up @@ -417,6 +441,8 @@ def harvest(
api_key: Optional[str] = None,
filename: Optional[str] = None,
base_url: Optional[str] = None,
validate: bool = False,
validation_rules: Optional[List] = None,
) -> HarvestResult:
"""
One-liner function for quick extraction.
Expand All @@ -435,8 +461,9 @@ def harvest(
# With OpenAI
result = harvest("invoice.jpg", schema=InvoiceData, model="gpt-4o-mini")

# With local Ollama
result = harvest("invoice.txt", schema=InvoiceData, model="llama3")
# With validation
result = harvest("invoice.pdf", schema=InvoiceData, validate=True)
print(result.validation.fraud_risk)
```

Args:
Expand All @@ -448,11 +475,19 @@ def harvest(
api_key: API key (uses env var if not provided)
filename: Original filename (required when source is bytes/file-like)
base_url: Optional base URL override
validate: Run validation rules on extracted data (default: False)
validation_rules: Custom validation rules (used with validate=True)

Returns:
HarvestResult with extracted data
"""
harvestor = Harvestor(api_key=api_key, model=model, base_url=base_url)
harvestor = Harvestor(
api_key=api_key,
model=model,
base_url=base_url,
validate=validate,
validation_rules=validation_rules,
)
return harvestor.harvest_file(
source=source,
schema=schema,
Expand Down
36 changes: 36 additions & 0 deletions src/harvestor/validators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Validation engine for extracted document data.

Provides rule-based validation, fraud detection, and anomaly flagging
for data extracted by Harvestor.
"""

from ..schemas.base import ValidationResult
from .base import BaseValidationRule, RuleFinding, RuleSeverity
from .engine import ValidationEngine


def validate(data, schema, rules=None, include_defaults=True) -> ValidationResult:
    """
    Validate extracted data in a single call.

    Builds a one-off ``ValidationEngine`` from the given options and runs it
    once against ``data``.

    Args:
        data: Extracted data dict
        schema: Pydantic schema class
        rules: Optional custom rules, passed to the engine as ``rules``
        include_defaults: Include built-in rules (default: True); passed to
            the engine as ``include_defaults``

    Returns:
        ValidationResult returned by the engine
    """
    return ValidationEngine(
        rules=rules,
        include_defaults=include_defaults,
    ).validate(data, schema)


# Names exported by `from <this package> import *`: the rule base
# abstractions, the engine, and the one-liner `validate` helper defined above.
__all__ = [
"BaseValidationRule",
"RuleFinding",
"RuleSeverity",
"ValidationEngine",
"validate",
]
73 changes: 73 additions & 0 deletions src/harvestor/validators/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Base abstractions for validation rules."""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Type

from pydantic import BaseModel


class RuleSeverity(str, Enum):
    """Severity level for a rule finding.

    Members are also ``str`` instances, so they compare equal to their
    plain string values (``RuleSeverity.ERROR == "error"``).
    """

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"


@dataclass
class RuleFinding:
    """A single finding from a validation rule.

    Only ``rule_name``, ``severity`` and ``message`` are required; the
    remaining fields default to "no field, no impact, not a fraud signal".
    """

    # Name of the rule that produced this finding.
    rule_name: str
    # Severity level of the finding.
    severity: RuleSeverity
    # Human-readable description of the finding.
    message: str
    # Name of the data field the finding refers to, if any.
    field_name: Optional[str] = None
    # NOTE(review): presumably an adjustment applied to the overall
    # confidence score by the engine -- sign and scale are not shown here.
    confidence_impact: float = 0.0
    # Whether this finding is flagged as a fraud signal.
    is_fraud_signal: bool = False
    # NOTE(review): presumably the weight of this finding in a fraud-risk
    # score -- confirm against the engine.
    fraud_weight: float = 0.0


class BaseValidationRule(ABC):
    """Abstract base class for all validation rules.

    Subclasses must provide ``name``, ``description`` and ``validate``.
    They may override ``supported_schemas`` to restrict which schemas the
    rule applies to; by default a rule applies to every schema.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique name for this rule."""
        ...

    @property
    @abstractmethod
    def description(self) -> str:
        """Human-readable description of what this rule checks."""
        ...

    @property
    def supported_schemas(self) -> Optional[Set[Type[BaseModel]]]:
        """Set of schema types this rule applies to. None means all schemas."""
        return None

    def applies_to(self, schema: Type[BaseModel]) -> bool:
        """Check if this rule applies to the given schema.

        Args:
            schema: The schema class to test.

        Returns:
            True when ``supported_schemas`` is None, or when ``schema`` is a
            subclass of (or identical to) at least one supported schema.
            An empty set therefore matches nothing.
        """
        supported = self.supported_schemas
        # None is the "no restriction" marker; otherwise subclasses of a
        # supported schema are accepted as well as the schema itself.
        return supported is None or any(
            issubclass(schema, candidate) for candidate in supported
        )

    @abstractmethod
    def validate(
        self, data: Dict[str, Any], schema: Type[BaseModel]
    ) -> List[RuleFinding]:
        """
        Run this rule against extracted data.

        Args:
            data: The extracted data dict (from HarvestResult.data)
            schema: The Pydantic schema class used for extraction

        Returns:
            List of findings (empty list means rule passed)
        """
        ...
Loading