Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# LLM API Keys
ANTHROPIC_API_KEY=sk-ant-your-key-here
OPENAI_API_KEY=sk-your-key-here
OLLAMA_API_KEY=your-key-here
28 changes: 19 additions & 9 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from harvestor import Harvestor, list_models


load_dotenv()


Expand All @@ -18,24 +19,33 @@ class SimpleInvoiceSchema(BaseModel):
None, description="The customer firstname"
)
customer_lastname: Optional[str] = Field(None, description="The customer lastname")
invoice_total_price_with_taxes: Optional[float] = Field(
None, description="The total price with taxes"
)
invoice_total_price_without_taxes: Optional[float] = Field(
None, description="The total price without taxes"
)


# List available models
print("Available models:", list(list_models().keys()))

# Use default model (claude-haiku)
h = Harvestor(model="claude-haiku")
# h = Harvestor(model="claude-haiku")

output = h.harvest_file(
source="data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
)
# output = h.harvest_file(
# source="data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
# )

print(output.to_summary())
# print(output.to_summary())

# Alternative: use OpenAI
# h_openai = Harvestor(model="gpt-4o-mini")
# output = h_openai.harvest_file("invoice.jpg", schema=SimpleInvoiceSchema)
# output = h_openai.harvest_file("data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema)

# Alternative: use local Ollama (free)
# h_ollama = Harvestor(model="llava")
# output = h_ollama.harvest_file("invoice.jpg", schema=SimpleInvoiceSchema)
# Alternative: use local Ollama (free) or cloud Ollama
h_ollama = Harvestor(model="gemma3:4b-cloud")
output = h_ollama.harvest_file(
"data/uploads/keep_for_test.jpg", schema=SimpleInvoiceSchema
)
print(output.to_summary())
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies = [
"langchain-openai>=0.0.5",
"anthropic>=0.18.0",
"openai>=1.10.0",
"httpx>=0.27.0", # For Ollama provider
"ollama>=0.6.1",

# Document Processing
"PyMuPDF>=1.23.0",
Expand Down
91 changes: 40 additions & 51 deletions src/harvestor/providers/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os
from typing import Optional

import httpx
from ollama import generate, Client, list as list_models

from .base import BaseLLMProvider, CompletionResult, ModelInfo

Expand Down Expand Up @@ -66,17 +66,25 @@ def __init__(
if model not in OLLAMA_MODELS:
# Allow custom models not in the predefined list
self.model_config = {
"id": f"{model}:latest" if ":" not in model else model,
"id": f"{model}:latest"
if ":" not in model
else model, # allow models like ministral-3:3b
"input_cost": 0.0,
"output_cost": 0.0,
"supports_vision": False,
"supports_vision": True,
"context_window": 8192,
}
else:
self.model_config = OLLAMA_MODELS[model]

self.client = None # if self.client -> using ollama cloud
if model.endswith("cloud"):
self.client = Client(
host="https://ollama.com",
headers={"Authorization": "Bearer " + os.environ.get("OLLAMA_API_KEY")},
)

self.model_id = self.model_config["id"]
self.client = httpx.Client(base_url=base_url, timeout=120.0)

def complete(
self,
Expand All @@ -85,20 +93,12 @@ def complete(
temperature: float = 0.0,
) -> CompletionResult:
try:
response = self.client.post(
"/api/generate",
json={
"model": self.model_id,
"prompt": prompt,
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens,
},
},
data = generate(
model=self.model_id,
prompt=prompt,
stream=False,
options={"temperature": temperature, "num_predict": max_tokens},
)
response.raise_for_status()
data = response.json()

return CompletionResult(
success=True,
Expand All @@ -112,13 +112,6 @@ def complete(
},
)

except httpx.ConnectError:
return CompletionResult(
success=False,
content="",
model=self.model_id,
error=f"Cannot connect to Ollama at {self.base_url}. Is Ollama running?",
)
except Exception as e:
return CompletionResult(
success=False,
Expand Down Expand Up @@ -146,41 +139,39 @@ def complete_vision(
try:
image_b64 = base64.standard_b64encode(image_data).decode("utf-8")

response = self.client.post(
"/api/generate",
json={
"model": self.model_id,
"prompt": prompt,
"images": [image_b64],
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens,
},
},
)
response.raise_for_status()
data = response.json()
if self.client:
data = self.client.generate(
model=self.model,
prompt=prompt,
images=[image_b64],
stream=False,
options={"temperature": temperature, "num_predict": max_tokens},
)
else:
data = generate(
model=self.model,
prompt=prompt,
images=[image_b64],
stream=False,
options={"temperature": temperature, "num_predict": max_tokens},
)

return CompletionResult(
success=True,
content=data.get("response", ""),
input_tokens=data.get("prompt_eval_count", 0),
output_tokens=data.get("eval_count", 0),
input_tokens=0
if data.get("prompt_eval_count") is None
else data["prompt_eval_count"],
output_tokens=0
if data.get("eval_count") is None
else data["eval_count"],
model=self.model_id,
metadata={
"total_duration": data.get("total_duration"),
"vision": True,
},
)

except httpx.ConnectError:
return CompletionResult(
success=False,
content="",
model=self.model_id,
error=f"Cannot connect to Ollama at {self.base_url}. Is Ollama running?",
)
except Exception as e:
return CompletionResult(
success=False,
Expand Down Expand Up @@ -210,9 +201,7 @@ def get_provider_name(cls) -> str:
def list_local_models(self) -> list[str]:
"""List models available in the local Ollama installation."""
try:
response = self.client.get("/api/tags")
response.raise_for_status()
data = response.json()
data = list_models()
return [m["name"] for m in data.get("models", [])]
except Exception:
return []
9 changes: 8 additions & 1 deletion src/harvestor/schemas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def to_summary(self) -> str:
status = "SUCCESS" if self.success else "FAILED"
strategy = self.final_strategy.value if self.final_strategy else "N/A"

return f"""
summary = f"""
Harvest Result: {status}
Document: {self.document_id} ({self.document_type})
Strategy: {strategy}
Expand All @@ -199,6 +199,13 @@ def to_summary(self) -> str:
Data: {self.data}
""".strip()

if self.error:
summary += f"""
Error: {self.error}
"""

return summary


@dataclass
class CostReport:
Expand Down
17 changes: 15 additions & 2 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.