From 0a0ed4d3ebde6e6fbfcfe07dbce73b16f7be0fe2 Mon Sep 17 00:00:00 2001 From: Quin Hoxie Date: Thu, 2 Oct 2025 14:27:05 -0700 Subject: [PATCH 1/2] Rework optimize in preparation for new optimizers (e.g. GEPA). Remove unnecessary flag in favor of just calling optimize() on extractor, update examples. --- README.md | 3 +- .../content/docs/examples/legal-contracts.mdx | 2 +- .../docs/examples/scientific-papers.mdx | 2 +- docs/src/content/docs/optimization.mdx | 23 ++- docs/src/content/docs/persistence.mdx | 8 +- docs/src/content/docs/query-parsing.mdx | 49 +++-- docs/src/content/docs/quickstart.mdx | 3 +- docs/src/content/docs/why-dspy.mdx | 1 - docs/src/content/docs/why-langstruct.mdx | 4 +- examples/06_rag_integration.py | 2 +- examples/07_optimization.py | 9 +- langstruct/api.py | 32 ++-- langstruct/core/persistence.py | 9 +- langstruct/core/validation.py | 4 +- langstruct/optimizers/__init__.py | 3 +- langstruct/optimizers/bootstrap.py | 55 ------ tests/conftest.py | 2 +- tests/test_api.py | 41 ++--- tests/test_integration_workflows.py | 171 ++++++++++++++++++ tests/test_persistence.py | 3 +- 20 files changed, 257 insertions(+), 169 deletions(-) delete mode 100644 langstruct/optimizers/bootstrap.py create mode 100644 tests/test_integration_workflows.py diff --git a/README.md b/README.md index 0d70d35..4294624 100644 --- a/README.md +++ b/README.md @@ -205,8 +205,7 @@ Once you've got the basics working, there's more: ```python extractor.optimize( texts=your_examples, - expected_results=expected_outputs, - num_trials=50 + expected_results=expected_outputs ) ``` diff --git a/docs/src/content/docs/examples/legal-contracts.mdx b/docs/src/content/docs/examples/legal-contracts.mdx index d9492e2..2133b16 100644 --- a/docs/src/content/docs/examples/legal-contracts.mdx +++ b/docs/src/content/docs/examples/legal-contracts.mdx @@ -66,11 +66,11 @@ Create an extractor for legal document analysis: extractor = LangStruct( schema=LegalContractSchema, model="gemini/gemini-2.5-flash-lite", # Fast and reliable for legal analysis - optimize=True, use_sources=True, # Critical for legal document traceability temperature=0.1, # Lower temperature for consistency max_retries=3 # Ensure reliability ) +# Later: extractor.optimize(training_texts, expected_results) # Example contract text contract_text = """ diff --git a/docs/src/content/docs/examples/scientific-papers.mdx b/docs/src/content/docs/examples/scientific-papers.mdx index 908de48..904be99 100644 --- a/docs/src/content/docs/examples/scientific-papers.mdx +++ b/docs/src/content/docs/examples/scientific-papers.mdx @@ -76,11 +76,11 @@ Create an extractor for research paper analysis: extractor = LangStruct( schema=ScientificPaperSchema, model="gemini/gemini-2.5-flash-lite", # Fast and reliable for academic content - optimize=True, use_sources=True, # Track where information was found temperature=0.2, # Slightly higher for nuanced interpretation max_retries=3 ) +# Later: extractor.optimize(training_texts, expected_results) # Example research paper text (excerpt) paper_text = """ diff --git a/docs/src/content/docs/optimization.mdx b/docs/src/content/docs/optimization.mdx index caa76ee..5ce19bf 100644 --- a/docs/src/content/docs/optimization.mdx +++ b/docs/src/content/docs/optimization.mdx @@ -9,28 +9,28 @@ Make your extraction more accurate with automatic optimization. 
LangStruct learns the best prompts and examples for your data automatically.

## The Easy Way

-**Enable optimization (configure optimizer) and then optimize with your data:**
+**Create an extractor (optionally choose the optimizer) and call `optimize()` when you're ready:**

```python
from langstruct import LangStruct

-# Create extractor with optimization enabled
extractor = LangStruct(
    example={
        "name": "Dr. Sarah Johnson",
        "age": 34,
        "occupation": "data scientist"
    },
-    optimize=True  # sets up optimizer; run .optimize(...) to train
+    optimizer="miprov2",  # default optimizer
)
+
+# Later, once you have training data:
+# extractor.optimize(texts=training_texts, expected_results=good_results)
```

-**Default behavior (faster startup, good baseline accuracy):**
+**Quick experiments (skip optimization entirely):**

```python
-# No optimization - good for quick experiments
extractor = LangStruct(example={"name": "John", "age": 30})
-# optimize=False by default - enables faster startup
```

## When You Have Training Data
@@ -86,7 +86,6 @@ Most users don't need this, but if you want more control:
extractor.optimize(
    texts=training_texts,
    expected_results=good_results,
-    num_trials=50,  # More trials = better results (takes longer)
    validation_split=0.3  # Use 30% for testing improvements
)
```
@@ -110,19 +109,19 @@ extractor.optimize(

## Common Questions

-**Q: Do I always need training data?**
+**Q: Do I always need training data?**
A: No! Optimization can work without training data, but providing examples improves results significantly.

-**Q: How long does optimization take?**
+**Q: How long does optimization take?**
A: Usually 1-5 minutes for typical datasets (10-100 examples).

-**Q: Can I optimize an already optimized extractor?**
+**Q: Can I optimize an already optimized extractor?**
A: Yes! You can keep optimizing with new data as you get it.

-**Q: Will this make my extractions slower?**
+**Q: Will this make my extractions slower?**
A: No - optimization happens once during training. Production extraction speed is the same.

-**Q: What happens when I switch models?**
+**Q: What happens when I switch models?**
A: Just change the model and re-optimize! Same training data, same accuracy - zero prompt rewriting needed.
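To make the model-switch answer concrete, here's a minimal sketch of the intended flow (the training data and model name are illustrative; the `extractor.model` swap mirrors the why-dspy.mdx example further down in this patch):

```python
from langstruct import LangStruct

# Illustrative training data: raw texts plus the outputs we expect
training_texts = ["John Smith is a 30-year-old engineer."]
good_results = [{"name": "John Smith", "age": 30}]

# Optimize once against the current model
extractor = LangStruct(example={"name": "John", "age": 30})
extractor.optimize(texts=training_texts, expected_results=good_results)

# Months later: point at a different provider and re-run the same optimization;
# the training data is reused and no prompts are rewritten by hand
extractor.model = "claude-3-7-sonnet-latest"
extractor.optimize(texts=training_texts, expected_results=good_results)
```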
## Next Steps diff --git a/docs/src/content/docs/persistence.mdx b/docs/src/content/docs/persistence.mdx index 0405427..48e92da 100644 --- a/docs/src/content/docs/persistence.mdx +++ b/docs/src/content/docs/persistence.mdx @@ -46,10 +46,9 @@ print(result.entities) ```python from langstruct import LangStruct -# Create extractor with optimization +# Create extractor extractor = LangStruct( example={"name": "John", "age": 30, "role": "engineer"}, - optimize=True ) # Train the extractor @@ -58,8 +57,7 @@ expected_results = [{"name": "Expected outputs..."}] extractor.optimize( texts=training_texts, - expected_results=expected_results, - num_trials=50 + expected_results=expected_results ) # Save optimized state @@ -215,7 +213,7 @@ Common error scenarios: ```python # Development: Train and save -extractor = LangStruct(schema=MySchema, optimize=True) +extractor = LangStruct(schema=MySchema) extractor.optimize(training_data, expected_results) extractor.save("./production_extractor") diff --git a/docs/src/content/docs/query-parsing.mdx b/docs/src/content/docs/query-parsing.mdx index c52d8f7..cddfa60 100644 --- a/docs/src/content/docs/query-parsing.mdx +++ b/docs/src/content/docs/query-parsing.mdx @@ -44,14 +44,14 @@ This single query contains **three distinct types of information**: - Quarter: Q3 2024 (exact match) - Revenue: > $100B (numeric comparison) - Sector: Technology (category match) - + These need **database-style filtering**, not semantic search **Conceptual topics for similarity search:** - "financial reports" (could be 10-K, earnings, statements) - "AI investments" (could be ML, artificial intelligence, neural networks) - + These need **embedding-based semantic search** @@ -59,7 +59,7 @@ This single query contains **three distinct types of information**: - "Show me" implies retrieval intent - "companies" implies corporate entities - Plural suggests multiple results expected - + These provide **query understanding context** @@ -86,14 +86,14 @@ results = vector_db.similarity_search(query_embedding) **What they are:** Conceptual topics that benefit from semantic understanding - + **Examples:** - "artificial intelligence" ≈ "AI" ≈ "machine learning" - "financial performance" ≈ "earnings" ≈ "fiscal results" - "customer satisfaction" ≈ "user happiness" ≈ "client feedback" - + **How they work:** Converted to embeddings for similarity matching - + **Best for:** - Finding conceptually related content - Handling synonyms and variations @@ -101,15 +101,15 @@ results = vector_db.similarity_search(query_embedding) **What they are:** Exact constraints that must be precisely matched - + **Examples:** - Date/Time: "Q3 2024", "after 2023", "last 30 days" - Numbers: "revenue > $100M", "5-10 employees", "top 3" - Categories: "tech sector", "approved status", "high priority" - Entities: "Apple Inc.", "California", "John Smith" - + **How they work:** Converted to database-style filter operations - + **Best for:** - Enforcing hard constraints - Filtering by exact values @@ -129,7 +129,7 @@ Let's see how different queries naturally decompose: - **Structured filters:** `{"quarter": "Q3 2024", "sector": "Technology", "profitable": true}` - **Why it matters:** You want companies that ARE profitable (filter), not just ones that DISCUSS profitability -#### Healthcare Query +#### Healthcare Query > "Patient records over 65 years old with diabetes showing improvement" - **Semantic terms:** `["showing improvement", "better outcomes"]` @@ -216,7 +216,7 @@ print("📖 Explanation:", result.explanation) 'revenue': {'$gte': 
100.0} } 💯 Confidence: 91.5% -📖 Explanation: +📖 Explanation: Searching for: tech companies With filters: • quarter = Q3 2024 @@ -270,30 +270,30 @@ class EnhancedRAGSystem: # Same schema for both extraction and parsing! self.langstruct = LangStruct(example=schema_example) self.vectorstore = Chroma(embedding_function=OpenAIEmbeddings()) - + def index_document(self, text: str): """Extract metadata and index document""" # Extract structured metadata extraction = self.langstruct.extract(text) - + # Index with both text and metadata self.vectorstore.add_texts( texts=[text], metadatas=[extraction.entities] ) - + def natural_query(self, query: str, k: int = 5): """Query using natural language""" # Parse query into components parsed = self.langstruct.query(query) - + # Perform hybrid search results = self.vectorstore.similarity_search( query=' '.join(parsed.semantic_terms), k=k, filter=parsed.structured_filters ) - + return results, parsed.explanation # Usage @@ -407,13 +407,13 @@ ls = LangStruct(example=your_schema) # Query with natural language def smart_search(query: str): parsed = ls.query(query) - + results = collection.query( query_texts=parsed.semantic_terms, where=parsed.structured_filters, n_results=10 ) - + return results ``` @@ -431,19 +431,19 @@ ls = LangStruct(example=your_schema) # Natural language query def pinecone_search(query: str): parsed = ls.query(query) - + # Convert to Pinecone filter format pinecone_filter = { - f"metadata.{k}": v + f"metadata.{k}": v for k, v in parsed.structured_filters.items() } - + results = index.query( vector=embed(parsed.semantic_terms), filter=pinecone_filter, top_k=10 ) - + return results ``` @@ -497,9 +497,8 @@ domain_ls = LangStruct( # Include synonyms in descriptions "earnings": 10.5, # Also covers "profits", "income" }, - # Can optimize for better accuracy - optimize=True ) +# Call domain_ls.optimize(...) with training examples when ready ``` ## Performance Considerations @@ -512,7 +511,7 @@ from functools import lru_cache class CachedLangStruct: def __init__(self, schema): self.ls = LangStruct(example=schema) - + @lru_cache(maxsize=1000) def query_cached(self, query: str): """Cache frequently used queries""" diff --git a/docs/src/content/docs/quickstart.mdx b/docs/src/content/docs/quickstart.mdx index e6418b8..62930af 100644 --- a/docs/src/content/docs/quickstart.mdx +++ b/docs/src/content/docs/quickstart.mdx @@ -87,8 +87,7 @@ extractor = LangStruct(example=schema) # See optimization in action extractor.optimize( texts=["training texts..."], - expected=[{"expected outputs..."}], - num_trials=50 # More trials = better accuracy + expected=[{"expected outputs..."}] ) print(f"Optimized accuracy: {extractor.score:.1%}") ``` diff --git a/docs/src/content/docs/why-dspy.mdx b/docs/src/content/docs/why-dspy.mdx index 338e96d..6936b02 100644 --- a/docs/src/content/docs/why-dspy.mdx +++ b/docs/src/content/docs/why-dspy.mdx @@ -147,7 +147,6 @@ result = extractor.extract("Microsoft announced $65B revenue for Q4") extractor = LangStruct( example={"company": "Apple", "revenue": 100.0}, model="gpt-5-mini", - optimize=True ) extractor.optimize(training_texts, expected_results) diff --git a/docs/src/content/docs/why-langstruct.mdx b/docs/src/content/docs/why-langstruct.mdx index d0c2b96..6a7c687 100644 --- a/docs/src/content/docs/why-langstruct.mdx +++ b/docs/src/content/docs/why-langstruct.mdx @@ -155,7 +155,7 @@ extractor = LangExtract(...) # Month 6: Switch to Claude - everything breaks! 
# ❌ Prompts don't work the same way -# ❌ Few-shot examples need rewriting +# ❌ Few-shot examples need rewriting # ❌ Back to manual tuning for weeks # Month 12: Move to local Llama - start over again! @@ -166,7 +166,7 @@ extractor = LangExtract(...) ### With LangStruct ```python # Month 1: Set up once -extractor = LangStruct(example=schema, optimize=True) +extractor = LangStruct(example=schema) extractor.optimize(training_data) # Month 6: Switch to Claude diff --git a/examples/06_rag_integration.py b/examples/06_rag_integration.py index b98a270..99db1c8 100644 --- a/examples/06_rag_integration.py +++ b/examples/06_rag_integration.py @@ -69,8 +69,8 @@ def __init__(self, extraction_schema: Dict[str, Any]): self.metadata_extractor = LangStruct( example=extraction_schema, # Model will use LangStruct's default unless specified - optimize=True, # Enable auto-optimization ) + # Call self.metadata_extractor.optimize(...) later with labeled data if needed except Exception as e: raise ValueError( f"Failed to initialize LangStruct: {e}. " diff --git a/examples/07_optimization.py b/examples/07_optimization.py index 980c3a1..c3528bf 100644 --- a/examples/07_optimization.py +++ b/examples/07_optimization.py @@ -27,8 +27,9 @@ def main(): print("=" * 40) try: - # Step 1: Create extractor with optimization enabled - print("\n1️⃣ Creating extractor with auto-optimization...") + # Step 1: Create extractor + print() + print("1️⃣ Creating extractor...") extractor = LangStruct( example={ "person_name": "Dr. Sarah Johnson", @@ -36,9 +37,8 @@ def main(): "years_experience": 8, "specialization": "interventional cardiology", }, - optimize=True, # Enable optimization for better accuracy ) - print("✅ Extractor created with optimization enabled!") + print("✅ Extractor ready! Call optimize() once you have training data.") # Step 2: Initial extraction (baseline) print("\n2️⃣ Baseline extraction...") @@ -143,7 +143,6 @@ def main(): extractor.optimize( texts=training_texts, expected_results=expected_results, - num_trials=10, # More trials → better results (higher cost) ) did_optimize = True print(" ✅ Optimization complete!") diff --git a/langstruct/api.py b/langstruct/api.py index 2d504de..17d6984 100644 --- a/langstruct/api.py +++ b/langstruct/api.py @@ -22,7 +22,6 @@ PersistenceError, ValidationError, ) -from .optimizers.bootstrap import BootstrapOptimizer from .optimizers.metrics import ExtractionMetrics from .optimizers.mipro import MIPROv2Optimizer from .parallel import ParallelProcessor, ProcessingResult @@ -64,7 +63,6 @@ def __init__( self, schema: Optional[Type[Schema]] = None, model: Optional[Union[str, dspy.LM]] = None, - optimize: bool = False, optimizer: str = "miprov2", chunking_config: Optional[ChunkingConfig] = None, use_sources: bool = True, @@ -84,8 +82,7 @@ def __init__( schema: Pydantic schema defining the extraction structure (optional) model: Model name or DSPy LM instance (defaults to "gpt-5-mini"; pass "gpt-5-mini"/"gpt-5-pro" for the latest OpenAI models) - optimize: Whether to use automatic prompt optimization (default: False) - optimizer: Optimizer to use ("miprov2", "bootstrap") + optimizer: Optimizer to use when optimize() runs (default: "miprov2") chunking_config: Configuration for text chunking use_sources: Whether to include source grounding (default: True) example: Single example dict for auto schema generation (optional) @@ -128,7 +125,6 @@ def __init__( schema = ensure_schema_class(schema) self.schema = schema - self.optimize = optimize self.optimizer_name = optimizer self.chunking_config 
= chunking_config or ChunkingConfig() self.use_sources = use_sources @@ -166,7 +162,8 @@ def __init__( # Initialize the extraction pipeline (robust to monkeypatched constructors) pipeline_cls = core_modules.ExtractionPipeline try: - sig = inspect.signature(pipeline_cls) + # Inspect __init__ directly to get actual parameters (not dspy.Module's *args, **kwargs) + sig = inspect.signature(pipeline_cls.__init__) except (TypeError, ValueError): # Fallback if signature can't be inspected (e.g., C-extensions or mocks) sig = None @@ -198,10 +195,8 @@ def __init__( except TypeError: self.pipeline = pipeline_cls(schema) - # Initialize optimizer if requested + # Optimizer is created lazily when optimize() is called self.optimizer = None - if optimize: - self._initialize_optimizer() # Initialize refinement engine if requested self.refinement_engine = None @@ -509,7 +504,13 @@ def _extract_single( else self.refine_config ) - if effective_refine and self.refinement_engine: + if effective_refine: + # Lazily initialize refinement engine if not already created + if self.refinement_engine is None: + self.refinement_engine = RefinementEngine( + self.schema, self.pipeline.extractor + ) + # Run refinement process refined_result, trace = self.refinement_engine(text, effective_refine) result = refined_result @@ -580,7 +581,6 @@ def optimize( self, texts: List[str], expected_results: Optional[List[Dict]] = None, - num_trials: int = 20, validation_split: float = 0.2, ) -> "LangStruct": """Optimize extraction performance on provided data. @@ -588,7 +588,6 @@ def optimize( Args: texts: Training texts for optimization expected_results: Optional ground truth results for supervised optimization - num_trials: Number of optimization trials to run validation_split: Fraction of data to use for validation Returns: @@ -619,7 +618,6 @@ def optimize( val_texts=val_texts or train_texts, # Use train if no val data train_expected=train_expected, val_expected=val_expected, - num_trials=num_trials, ) self.pipeline = optimized_pipeline @@ -802,7 +800,7 @@ def save(self, path: str) -> None: path: Directory path to save the extractor to (will be created if needed) Example: - >>> extractor = LangStruct(schema=PersonSchema, optimize=True) + >>> extractor = LangStruct(schema=PersonSchema) >>> extractor.optimize(train_texts, expected_results) >>> extractor.save("./my_extractor") >>> # Creates directory with all extractor components @@ -847,11 +845,9 @@ def _initialize_optimizer(self) -> None: """Initialize the appropriate optimizer.""" if self.optimizer_name.lower() == "miprov2": self.optimizer = MIPROv2Optimizer() - elif self.optimizer_name.lower() == "bootstrap": - self.optimizer = BootstrapOptimizer() else: raise ValueError( - f"Unknown optimizer: {self.optimizer_name}. Supported optimizers: miprov2, bootstrap" + f"Unknown optimizer: {self.optimizer_name}. Only 'miprov2' is supported." 
) def _parse_refine_config( @@ -1114,5 +1110,5 @@ def __repr__(self) -> str: return ( f"LangStruct(schema={self.schema.__name__}, " f"model={self.lm.__class__.__name__}, " - f"optimize={self.optimize})" + f"optimizer_initialized={self.optimizer is not None})" ) diff --git a/langstruct/core/persistence.py b/langstruct/core/persistence.py index 2606829..3d5a623 100644 --- a/langstruct/core/persistence.py +++ b/langstruct/core/persistence.py @@ -145,11 +145,10 @@ def load_extractor(cls, path: Union[str, Path]) -> "LangStruct": extractor = LangStruct( schema=schema_class, model=metadata.model_name, - optimize=False, # We'll handle optimization separately chunking_config=chunking_config, use_sources=metadata.use_sources, **metadata.lm_config, - ) + ) # Optimizer state restored separately except Exception as e: raise PersistenceError( f"Failed to recreate LangStruct instance. This may be due to missing API keys, " @@ -445,12 +444,6 @@ def _restore_optimizer_state( num_threads=optimizer_state.get("num_threads", 4), **optimizer_state.get("kwargs", {}), ) - elif optimizer_name == "bootstrap": - from ..optimizers.bootstrap import BootstrapOptimizer - - extractor.optimizer = BootstrapOptimizer( - **optimizer_state.get("kwargs", {}) - ) @classmethod def _restore_refinement_state( diff --git a/langstruct/core/validation.py b/langstruct/core/validation.py index b2934d8..42b0a03 100644 --- a/langstruct/core/validation.py +++ b/langstruct/core/validation.py @@ -452,7 +452,9 @@ def _generate_suggestions(self, issues: List[ValidationIssue]) -> List[str]: if IssueType.LOW_CONFIDENCE in issue_types: suggestions.append("🎯 Try a more powerful model (e.g. gpt-5-mini)") suggestions.append("📝 Add more detailed field descriptions") - suggestions.append("🔄 Enable auto-optimization with optimize=True") + suggestions.append( + "🔄 Run extractor.optimize(...) with representative data" + ) if IssueType.MISSING_FIELDS in issue_types: suggestions.append("❓ Make optional fields Optional[type] in schema") diff --git a/langstruct/optimizers/__init__.py b/langstruct/optimizers/__init__.py index a79255f..e4209de 100644 --- a/langstruct/optimizers/__init__.py +++ b/langstruct/optimizers/__init__.py @@ -1,7 +1,6 @@ """Optimization functionality using DSPy optimizers.""" -from .bootstrap import BootstrapOptimizer from .metrics import ExtractionMetrics from .mipro import MIPROv2Optimizer -__all__ = ["MIPROv2Optimizer", "BootstrapOptimizer", "ExtractionMetrics"] +__all__ = ["MIPROv2Optimizer", "ExtractionMetrics"] diff --git a/langstruct/optimizers/bootstrap.py b/langstruct/optimizers/bootstrap.py deleted file mode 100644 index 7599ffc..0000000 --- a/langstruct/optimizers/bootstrap.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Bootstrap optimizer for few-shot example generation.""" - -import logging -from typing import Any, Dict, List, Optional - -from ..core.modules import ExtractionPipeline - -logger = logging.getLogger(__name__) - - -class BootstrapOptimizer: - """DSPy Bootstrap optimizer for generating few-shot examples.""" - - def __init__(self, max_bootstrapped_demos: int = 8, max_labeled_demos: int = 16): - """Initialize Bootstrap optimizer. 
- - Args: - max_bootstrapped_demos: Maximum number of bootstrapped examples - max_labeled_demos: Maximum number of labeled examples to use - """ - self.max_bootstrapped_demos = max_bootstrapped_demos - self.max_labeled_demos = max_labeled_demos - - def optimize( - self, - pipeline: ExtractionPipeline, - train_texts: List[str], - val_texts: List[str], - train_expected: Optional[List[Dict]] = None, - val_expected: Optional[List[Dict]] = None, - num_trials: int = 20, - ) -> ExtractionPipeline: - """Optimize extraction pipeline using Bootstrap few-shot learning. - - Args: - pipeline: Extraction pipeline to optimize - train_texts: Training texts - val_texts: Validation texts - train_expected: Expected results for training (optional) - val_expected: Expected results for validation (optional) - num_trials: Number of optimization trials - - Returns: - Optimized extraction pipeline - """ - # TODO: Implement Bootstrap optimization - # This will use DSPy's BootstrapFewShot to automatically - # generate good few-shot examples - - logger.info("Bootstrap optimization not yet implemented") - logger.info("Would bootstrap %d examples", self.max_bootstrapped_demos) - logger.info("From %d training examples", len(train_texts)) - - # For now, return the original pipeline - return pipeline diff --git a/tests/conftest.py b/tests/conftest.py index 258ff9d..5f2ac28 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -149,7 +149,7 @@ def setup_test_environment(): import dspy if GOOGLE_API_KEY: - dspy.configure(lm=dspy.LM("gemini/gemini-2.5-flash")) + dspy.configure(lm=dspy.LM("gemini/gemini-2.5-flash-lite")) print(f"\n✅ Running tests with Gemini 2.5 Flash") elif OPENAI_API_KEY: dspy.configure(lm=dspy.LM("openai/gpt-4o-mini")) diff --git a/tests/test_api.py b/tests/test_api.py index 9d1e8f9..0b30ff0 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -21,16 +21,14 @@ def test_basic_initialization(self, person_schema, mock_extraction_pipeline): assert issubclass(extractor.schema, person_schema) assert extractor.schema is not person_schema assert extractor.use_sources is True # Default - assert extractor.optimize is False # Default + assert extractor.optimizer is None def test_initialization_with_options(self, person_schema, mock_extraction_pipeline): """Test LangStruct initialization with custom options.""" - extractor = LangStruct( - schema=person_schema, model="gpt-4o", optimize=True, use_sources=False - ) + extractor = LangStruct(schema=person_schema, model="gpt-4o", use_sources=False) assert issubclass(extractor.schema, person_schema) - assert extractor.optimize is True + assert extractor.optimizer is None assert extractor.use_sources is False @integration_test @@ -155,7 +153,7 @@ def test_constructor_with_schema(self, person_schema, mock_extraction_pipeline): extractor = LangStruct(schema=person_schema) assert issubclass(extractor.schema, person_schema) - assert extractor.optimize is False # Default behavior + assert extractor.optimizer is None assert extractor.use_sources is True # Should be enabled by auto def test_schema_wrapping_enforces_extra_forbid(self, mock_extraction_pipeline): @@ -204,7 +202,7 @@ def test_constructor_with_example( extractor = LangStruct(example=person_example_data) assert extractor.schema is not None - assert extractor.optimize is False # Default behavior + assert extractor.optimizer is None assert extractor.use_sources is True def test_constructor_no_input(self, mock_extraction_pipeline): @@ -308,12 +306,12 @@ def test_source_grounding_override( def test_repr(self, 
person_schema, mock_extraction_pipeline): """Test __repr__ method.""" - extractor = LangStruct(schema=person_schema, optimize=True) + extractor = LangStruct(schema=person_schema) repr_str = repr(extractor) assert "LangStruct" in repr_str assert "PersonSchema" in repr_str - assert "optimize=True" in repr_str + assert "optimizer_initialized=False" in repr_str def test_save_load_basic_functionality( self, person_schema, mock_extraction_pipeline @@ -342,23 +340,14 @@ def test_save_load_basic_functionality( assert loaded is not None assert issubclass(loaded.schema, person_schema) - def test_optimization_setup(self, person_schema, mock_extraction_pipeline): - """Test optimizer initialization.""" - # Test with MIPROv2 - extractor1 = LangStruct( - schema=person_schema, optimize=True, optimizer="miprov2" - ) - assert extractor1.optimizer is not None - - # Test with Bootstrap - extractor2 = LangStruct( - schema=person_schema, optimize=True, optimizer="bootstrap" - ) - assert extractor2.optimizer is not None + def test_optimize_raises_for_invalid_optimizer( + self, person_schema, mock_extraction_pipeline + ): + """Ensure invalid optimizer names raise when optimization runs.""" + extractor = LangStruct(schema=person_schema, optimizer="invalid") - # Test with invalid optimizer with pytest.raises(ValueError, match="Unknown optimizer"): - LangStruct(schema=person_schema, optimize=True, optimizer="invalid") + extractor.optimize(["text"]) def test_optimization_default_disabled( self, person_schema, mock_extraction_pipeline @@ -367,7 +356,7 @@ def test_optimization_default_disabled( extractor = LangStruct(schema=person_schema) # Optimization should be disabled by default now - assert extractor.optimize is False + assert extractor.optimizer is None def test_evaluate_placeholder(self, person_schema, mock_extraction_pipeline): """Test evaluate method (currently placeholder).""" @@ -523,7 +512,7 @@ def test_auto_configuration_workflow(self, mock_extraction_pipeline): extractor = LangStruct(example=example) # Verify default settings - assert extractor.optimize is False # Default behavior + assert extractor.optimizer is None assert extractor.use_sources is True # Should work for extraction diff --git a/tests/test_integration_workflows.py b/tests/test_integration_workflows.py new file mode 100644 index 0000000..9ca542b --- /dev/null +++ b/tests/test_integration_workflows.py @@ -0,0 +1,171 @@ +"""Slow integration tests that hit real LLM providers when API keys are configured.""" + +from __future__ import annotations + +import json +from typing import Dict, List, Tuple + +import pytest + +from langstruct import LangStruct +from langstruct.core.chunking import ChunkingConfig +from langstruct.core.refinement import Budget, Refine + + +@pytest.fixture(scope="module") +def optimization_dataset() -> Tuple[List[str], List[Dict[str, object]]]: + """Provide lightweight training data for integration optimization runs.""" + texts = [ + """\ + Alice Johnson is a 29-year-old data scientist based in Seattle, Washington. + She leads the analytics team at BlueSky Labs and mentors junior engineers. 
+ """.strip(), + ] + + labels = [ + {"name": "Alice Johnson", "age": 29, "location": "Seattle, Washington"}, + ] + + return texts, labels + + +@pytest.fixture +def optimized_person_extractor( + person_schema, + optimization_dataset, + requires_api_key, +): + """Create a LangStruct instance that has been optimized against the dataset.""" + texts, labels = optimization_dataset + + extractor = LangStruct( + schema=person_schema, + optimizer="miprov2", + use_sources=False, # keep requests smaller for integration runs + ) + + extractor.optimize( + texts=texts, + expected_results=labels, + validation_split=0.0, + ) + + return { + "extractor": extractor, + "train_texts": texts, + "expected_results": labels, + } + + +@pytest.mark.integration +def test_integration_optimize_smoke(optimized_person_extractor): + """End-to-end smoke test covering optimize() and extraction afterwards.""" + bundle = optimized_person_extractor + extractor: LangStruct = bundle["extractor"] + + test_text = ( + "Dr. Emily Davis is a 38-year-old physician based in Austin, Texas, " + "where she leads the cardiology program at Central Health." + ) + + result = extractor.extract(test_text, validate=False, return_sources=False) + + assert isinstance(result.entities, dict) + assert extractor.optimizer is not None + assert getattr(extractor.optimizer, "optimizer", None) is not None + assert 0.0 <= result.confidence <= 1.0 + assert any(str(v).strip() for v in result.entities.values()) + assert result.metadata.get("pipeline") == "langstruct" + + +@pytest.mark.integration +def test_integration_save_load_after_optimization(optimized_person_extractor, tmp_path): + """Ensure optimized extractors persist and reload correctly.""" + bundle = optimized_person_extractor + extractor: LangStruct = bundle["extractor"] + texts: List[str] = bundle["train_texts"] + + save_path = tmp_path / "optimized_extractor" + extractor.save(str(save_path)) + + metadata_path = save_path / "langstruct_metadata.json" + with metadata_path.open("r", encoding="utf-8") as fh: + metadata = json.load(fh) + + assert metadata["optimization_applied"] is True + assert metadata["optimizer_name"] == "miprov2" + + loaded = LangStruct.load(str(save_path)) + loaded_result = loaded.extract(texts[0], validate=False, return_sources=False) + + assert isinstance(loaded_result.entities, dict) + assert loaded.optimizer is not None + assert any(str(v).strip() for v in loaded_result.entities.values()) + + +@pytest.mark.integration +def test_integration_chunked_sources(person_schema, requires_api_key): + """Validate extraction with source grounding across multiple chunks.""" + chunk_config = ChunkingConfig( + max_tokens=12, + overlap_tokens=4, + min_chunk_tokens=3, + preserve_paragraphs=False, + preserve_sentences=False, + ) + + extractor = LangStruct(schema=person_schema, chunking_config=chunk_config) + + long_text = ( + "Charlotte Rivera is a 41-year-old neurologist based in San Diego, " + "California. She leads the neuroscience unit at Horizon Medical Center. " + "Outside of work, Charlotte mentors students at the local university." 
+ ) + + result = extractor.extract(long_text, validate=False, return_sources=True) + + assert isinstance(result.entities, dict) + assert result.sources + assert result.metadata.get("total_chunks", 1) > 1 + assert any(spans for spans in result.sources.values()) + + +@pytest.mark.integration +def test_integration_query_parsing(person_schema, requires_api_key): + """Ensure query() returns structured output using the query parser.""" + extractor = LangStruct(schema=person_schema) + + query = "cardiologists in Seattle over 30" + parsed = extractor.query(query, explain=False) + + assert parsed.raw_query == query + assert 0.0 <= parsed.confidence <= 1.0 + assert parsed.metadata.get("parsed_by") == "llm" + + +@pytest.mark.integration +def test_integration_refinement_flow(person_schema, requires_api_key): + """Exercise refinement engine with conservative budget to limit cost.""" + refine_config = Refine( + strategy="bon", + n_candidates=1, + max_refine_steps=1, + temperature=0.3, + budget=Budget(max_calls=1), + ) + + extractor = LangStruct( + schema=person_schema, + refine=refine_config, + use_sources=False, + ) + + text = ( + "Dr. Olivia Chen is a 36-year-old cardiologist working at Bayview Medical " + "Center in San Francisco, California." + ) + + result = extractor.extract(text, validate=False, return_sources=False) + + assert result.metadata.get("refinement_applied") + assert result.metadata.get("refinement_strategy") == refine_config.strategy diff --git a/tests/test_persistence.py b/tests/test_persistence.py index f539110..e9e0e64 100644 --- a/tests/test_persistence.py +++ b/tests/test_persistence.py @@ -369,7 +369,8 @@ def test_save_with_refinement_config(self): def test_metadata_with_optimization_flag(self): """Test that optimization flag is correctly saved in metadata.""" - extractor = LangStruct(example={"name": "Alice", "age": 30}, optimize=True) + extractor = LangStruct(example={"name": "Alice", "age": 30}) + extractor.optimizer = object() # simulate optimization having run with tempfile.TemporaryDirectory() as temp_dir: save_path = Path(temp_dir) / "test_extractor" From 37552044deb56f4503d5396e8a67be50cc3baf75 Mon Sep 17 00:00:00 2001 From: Quin Hoxie Date: Thu, 2 Oct 2025 14:40:39 -0700 Subject: [PATCH 2/2] Docs updates to clarify optimize usage. --- docs/src/content/docs/optimization.mdx | 25 ++++++++++++++++++------- docs/src/content/docs/quickstart.mdx | 3 +-- docs/src/content/docs/why-dspy.mdx | 8 ++++---- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/docs/src/content/docs/optimization.mdx b/docs/src/content/docs/optimization.mdx index 5ce19bf..c70ef38 100644 --- a/docs/src/content/docs/optimization.mdx +++ b/docs/src/content/docs/optimization.mdx @@ -74,8 +74,19 @@ Optimization can significantly improve accuracy on real-world tasks: ## Persisting Results -Saving/loading an optimized extractor is not yet implemented. -For now, re-run `optimize()` when you start up, or persist your training data and configuration. +Save and load optimized extractors to reuse them without re-running optimization: + +```python +# Save after optimization +extractor.save("./my_extractor") + +# Load later +from langstruct import LangStruct +loaded = LangStruct.load("./my_extractor") + +# Use immediately - optimization is preserved +result = loaded.extract("new text") +``` ## Advanced (If You Need It) @@ -110,25 +121,25 @@ extractor.optimize( ## Common Questions **Q: Do I always need training data?** -A: No! 
Optimization can work without training data, but providing examples improves results significantly. +A: You need example texts, but not necessarily expected outputs. If you don't provide `expected_results`, LangStruct uses the LLM's confidence ratings to optimize. Providing expected outputs significantly improves accuracy. **Q: How long does optimization take?** A: Usually 1-5 minutes for typical datasets (10-100 examples). **Q: Can I optimize an already optimized extractor?** -A: Yes! You can keep optimizing with new data as you get it. +A: Yes, you can continue optimizing with new data as you collect it. **Q: Will this make my extractions slower?** -A: No - optimization happens once during training. Production extraction speed is the same. +A: No - optimization happens once during training. Production extraction speed is unchanged. **Q: What happens when I switch models?** -A: Just change the model and re-optimize! Same training data, same accuracy - zero prompt rewriting needed. +A: Change the model and re-optimize with the same training data. No prompt rewriting needed. ## Next Steps - Create a LangStruct extractor and enable optimization when you need accuracy! + Create a LangStruct extractor and enable optimization when you need accuracy. [Track where information comes from](/source-grounding/) diff --git a/docs/src/content/docs/quickstart.mdx b/docs/src/content/docs/quickstart.mdx index 62930af..7195e42 100644 --- a/docs/src/content/docs/quickstart.mdx +++ b/docs/src/content/docs/quickstart.mdx @@ -87,9 +87,8 @@ extractor = LangStruct(example=schema) # See optimization in action extractor.optimize( texts=["training texts..."], - expected=[{"expected outputs..."}] + expected_results=[{"expected outputs..."}] # Optional - uses confidence if omitted ) -print(f"Optimized accuracy: {extractor.score:.1%}") ``` ## Process Multiple Documents (with quotas) diff --git a/docs/src/content/docs/why-dspy.mdx b/docs/src/content/docs/why-dspy.mdx index 6936b02..ffe40cf 100644 --- a/docs/src/content/docs/why-dspy.mdx +++ b/docs/src/content/docs/why-dspy.mdx @@ -119,7 +119,7 @@ extractor = LangStruct(example={ # 2. Let MIPROv2 optimize prompts and examples automatically extractor.optimize( - training_texts=["Apple reported $125B in Q3...", "Meta earned $40B..."], + texts=["Apple reported $125B in Q3...", "Meta earned $40B..."], expected_results=[ {"company": "Apple", "revenue": 125.0, "quarter": "Q3"}, {"company": "Meta", "revenue": 40.0, "quarter": "Q3"} @@ -148,15 +148,15 @@ extractor = LangStruct( example={"company": "Apple", "revenue": 100.0}, model="gpt-5-mini", ) -extractor.optimize(training_texts, expected_results) +extractor.optimize(texts=training_texts, expected_results=expected_results) # 6 months later, switch to Claude - just two lines! extractor.model = "claude-3-7-sonnet-latest" -extractor.optimize(training_texts, expected_results) # Auto-reoptimizes prompts +extractor.optimize(texts=training_texts, expected_results=expected_results) # Auto-reoptimizes prompts # Or use local models for privacy extractor.model = "ollama/llama3.2" -extractor.optimize(training_texts, expected_results) # Works the same way +extractor.optimize(texts=training_texts, expected_results=expected_results) # Works the same way # Same accuracy, zero prompt rewriting, zero vendor lock-in ```
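A closing usage note for reviewers: the reworked Q&A states that `optimize()` can run without `expected_results` by scoring candidates on the LLM's confidence ratings. A minimal sketch of that unsupervised path (schema fields and texts are illustrative, borrowed from the why-dspy.mdx examples above):

```python
from langstruct import LangStruct

extractor = LangStruct(example={"company": "Apple", "revenue": 100.0})

# No expected_results given: optimization falls back to confidence-based scoring
extractor.optimize(texts=[
    "Apple reported $125B in Q3...",
    "Meta earned $40B...",
])

result = extractor.extract("Microsoft announced $65B revenue for Q4")
print(result.entities)
```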