From 8a1a9f3ea29b7d99564705deeebbaf7a44abf648 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 10 Dec 2025 16:50:22 +0100 Subject: [PATCH 01/16] Track processing time --- src/parxy_core/drivers/abstract_driver.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/parxy_core/drivers/abstract_driver.py b/src/parxy_core/drivers/abstract_driver.py index c3652a2..eebdfb2 100644 --- a/src/parxy_core/drivers/abstract_driver.py +++ b/src/parxy_core/drivers/abstract_driver.py @@ -1,6 +1,7 @@ import base64 import hashlib import io +import time from abc import ABC, abstractmethod from logging import Logger from typing import Dict, Any, Self, Tuple, Optional @@ -108,8 +109,20 @@ def parse( self._validate_level(level) try: + # Start timing + start_time = time.perf_counter() + document = self._handle(file=file, level=level, **kwargs) + # Calculate elapsed time in milliseconds + end_time = time.perf_counter() + elapsed_ms = (end_time - start_time) * 1000 + + # Store elapsed time in parsing metadata + if document.parsing_metadata is None: + document.parsing_metadata = {} + document.parsing_metadata['driver_elapsed_time'] = elapsed_ms + # Increment the documents processed counter tracer.count( 'documents.processed', From 87cc4b3c52bc31be9e941fb7fa2b41505a45a92a Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 10 Dec 2025 16:50:40 +0100 Subject: [PATCH 02/16] Default to block level for parsing --- src/parxy_cli/commands/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parxy_cli/commands/parse.py b/src/parxy_cli/commands/parse.py index db14c74..5396698 100644 --- a/src/parxy_cli/commands/parse.py +++ b/src/parxy_cli/commands/parse.py @@ -215,7 +215,7 @@ def parse( '-l', help='Extraction level', ), - ] = Level.PAGE, + ] = Level.BLOCK, mode: Annotated[ OutputMode, typer.Option( From 2c75410960430f080c3c4c87b1b8d22521b25af8 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 10 Dec 2025 16:51:02 +0100 Subject: 
[PATCH 03/16] Introduce LlamaParse parsing mode configuration --- src/parxy_core/models/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/parxy_core/models/config.py b/src/parxy_core/models/config.py index 601c3cd..88a2d46 100644 --- a/src/parxy_core/models/config.py +++ b/src/parxy_core/models/config.py @@ -136,6 +136,9 @@ class LlamaParseConfig(BaseConfig): verbose: Optional[bool] = False """Whether to print the progress of the parsing.""" + parse_mode: Optional[str] = 'parse_page_with_llm' + """Parsing mode to use to process all documents.""" + # Parsing specific configurations (Alphabetical order) disable_ocr: Optional[bool] = False From b61e011145201da5ed62e8058ec39c0b458d3fce Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 10 Dec 2025 16:52:01 +0100 Subject: [PATCH 04/16] Capture all arguments and return value --- src/parxy_core/tracing/client.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/parxy_core/tracing/client.py b/src/parxy_core/tracing/client.py index bcc3601..b72f63a 100644 --- a/src/parxy_core/tracing/client.py +++ b/src/parxy_core/tracing/client.py @@ -79,7 +79,7 @@ def force_flush(self, timeout_millis: int = 30000): return self._wrapped_exporter.force_flush(timeout_millis) -def _serialize_value(value: Any, max_length: int = 10000) -> str: +def _serialize_value(value: Any) -> str: """Serialize a value for span attributes with size limits.""" try: if hasattr(value, 'model_dump_json'): @@ -89,18 +89,15 @@ def _serialize_value(value: Any, max_length: int = 10000) -> str: else: result = json.dumps(value, default=str) - if len(result) > max_length: - return result[:max_length] + '...[truncated]' return result except Exception: - return str(value)[:max_length] + return str(value) def _serialize_args( args: tuple, kwargs: dict, exclude: set[str] | None = None, - max_length: int = 1000, ) -> dict[str, str]: """Serialize function arguments for span attributes.""" exclude = exclude or 
{'self', 'cls'} @@ -109,11 +106,11 @@ def _serialize_args( for i, arg in enumerate(args): key = f'arg.{i}' if key not in exclude: - attributes[key] = _serialize_value(arg, max_length) + attributes[key] = _serialize_value(arg) for key, value in kwargs.items(): if key not in exclude: - attributes[f'arg.{key}'] = _serialize_value(value, max_length) + attributes[f'arg.{key}'] = _serialize_value(value) return attributes @@ -259,7 +256,6 @@ def instrument( capture_return: bool = True, exclude_args: set[str] | None = None, max_arg_length: int = 1000, - max_return_length: int = 10000, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Decorator to automatically instrument a function with tracing. @@ -278,8 +274,6 @@ def instrument( Argument names to exclude from capture. Always excludes 'self', 'cls'. max_arg_length : int, optional Max length for serialized arguments. Default 1000. - max_return_length : int, optional - Max length for serialized return value. Default 10000. Returns ------- @@ -315,9 +309,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: result = func(*args, **kwargs) if capture_return and result is not None: - span.set_attribute( - 'return', _serialize_value(result, max_return_length) - ) + span.set_attribute('return', _serialize_value(result)) return result From 8e054731f3a0e605a03f5f5cf5e9dbf8ea56c070 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 10 Dec 2025 17:15:26 +0100 Subject: [PATCH 05/16] Estimate costs for each page --- src/parxy_core/drivers/llamaparse.py | 57 +++++++++++++++++++++++++++- tests/drivers/test_llamaparse.py | 42 ++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py index 270a051..33455fd 100644 --- a/src/parxy_core/drivers/llamaparse.py +++ b/src/parxy_core/drivers/llamaparse.py @@ -23,6 +23,17 @@ FileNotFoundException, ) +_credits_per_parsing_mode = { + # Minimum credits per parsing mode as deduced from 
https://developers.llamaindex.ai/python/cloud/general/pricing/ + 'accurate': 3, # equivalent to Parse page with LLM as observed in their dashboard + 'parse_page_without_llm': 1, + 'parse_page_with_llm': 3, + 'parse_page_with_lvm': 6, + 'parse_page_with_agent': 10, + 'parse_document_with_llm': 30, + 'parse_document_with_agent': 30, +} + class LlamaParseDriver(Driver): """Llama Cloud Services document processing via LlamaParse API. @@ -136,7 +147,51 @@ def _handle( res.error, self.__class__, res.model_dump(exclude={'file_name'}) ) - return llamaparse_to_parxy(doc=res, level=level) + converted_document = llamaparse_to_parxy(doc=res, level=level) + + if converted_document.parsing_metadata is None: + converted_document.parsing_metadata = {} + + converted_document.parsing_metadata['job_id'] = res.job_id + converted_document.parsing_metadata['job_metadata'] = ( + res.job_metadata.model_dump_json() + ) + converted_document.parsing_metadata['job_error'] = res.error + converted_document.parsing_metadata['job_error_code'] = res.error_code + converted_document.parsing_metadata['job_status'] = res.status + + # Extract parsing modes from each page's source_data + parsing_modes = {} + parsing_mode_counts = {} + + for page in converted_document.pages: + if page.source_data and 'parsingMode' in page.source_data: + mode = page.source_data['parsingMode'] + parsing_modes[page.number] = mode + + # Count pages per parsing mode + if mode in parsing_mode_counts: + parsing_mode_counts[mode] += 1 + else: + parsing_mode_counts[mode] = 1 + + if parsing_modes: + converted_document.parsing_metadata['page_parsing_modes'] = parsing_modes + converted_document.parsing_metadata['parsing_mode_counts'] = ( + parsing_mode_counts + ) + + # Calculate cost estimation based on parsing modes + total_cost = 0 + for mode, count in parsing_mode_counts.items(): + # Use the credit cost from the dictionary, or default to 3 if not recognized + credits_per_page = _credits_per_parsing_mode.get(mode, 3) + total_cost 
+= credits_per_page * count + + converted_document.parsing_metadata['cost_estimation'] = total_cost + converted_document.parsing_metadata['cost_estimation_unit'] = 'credits' + + return converted_document @trace_with_output('converting') diff --git a/tests/drivers/test_llamaparse.py b/tests/drivers/test_llamaparse.py index a87cc8d..32c286b 100644 --- a/tests/drivers/test_llamaparse.py +++ b/tests/drivers/test_llamaparse.py @@ -161,3 +161,45 @@ def test_llamaparse_driver_tracing_exception_recorded(self, mock_tracer): count_call = mock_tracer.count.call_args assert count_call[0][0] == 'documents.failures' assert count_call[1]['driver'] == 'LlamaParseDriver' + + def test_llamaparse_driver_extracts_parsing_modes(self): + driver = LlamaParseDriver(LlamaParseConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + # Verify parsing_metadata exists and contains parsing modes + assert document.parsing_metadata is not None + + # Check if page_parsing_modes is in parsing_metadata (it may not be if LlamaParse doesn't provide it) + # This is conditional because the actual LlamaParse response may or may not include parsingMode + if 'page_parsing_modes' in document.parsing_metadata: + parsing_modes = document.parsing_metadata['page_parsing_modes'] + assert isinstance(parsing_modes, dict) + # Verify it's a mapping of page numbers to parsing modes + for page_num, mode in parsing_modes.items(): + assert isinstance(page_num, int) + assert isinstance(mode, str) + + # Verify parsing_mode_counts exists + assert 'parsing_mode_counts' in document.parsing_metadata + parsing_mode_counts = document.parsing_metadata['parsing_mode_counts'] + assert isinstance(parsing_mode_counts, dict) + + # Verify counts are correct + for mode, count in parsing_mode_counts.items(): + assert isinstance(mode, str) + assert isinstance(count, int) + assert count > 0 + + # Verify total count matches number of pages with parsing modes + assert 
sum(parsing_mode_counts.values()) == len(parsing_modes) + + # Verify cost_estimation exists + assert 'cost_estimation' in document.parsing_metadata + cost_estimation = document.parsing_metadata['cost_estimation'] + assert isinstance(cost_estimation, int) + assert cost_estimation > 0 + + # Verify cost estimation is reasonable (at least 1 credit per page minimum) + assert cost_estimation >= len(parsing_modes) From ea9594ba25bfc384118c2073ffa0b917e782d1c9 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 10 Dec 2025 17:15:40 +0100 Subject: [PATCH 06/16] Test elapsed time included in metadata --- tests/drivers/test_pymupdf.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/drivers/test_pymupdf.py b/tests/drivers/test_pymupdf.py index c5c652e..e62986e 100644 --- a/tests/drivers/test_pymupdf.py +++ b/tests/drivers/test_pymupdf.py @@ -171,3 +171,15 @@ def test_pymupdf_driver_tracing_exception_recorded(self, mock_tracer): # Verify counter was NOT incremented due to exception mock_tracer.count.assert_called_once() + + def test_pymupdf_driver_records_elapsed_time(self): + driver = PyMuPdfDriver() + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + # Verify elapsed time is recorded in parsing_metadata + assert document.parsing_metadata is not None + assert 'driver_elapsed_time' in document.parsing_metadata + assert isinstance(document.parsing_metadata['driver_elapsed_time'], float) + assert document.parsing_metadata['driver_elapsed_time'] > 0 From 05a6bbe0bb4758e68811a7c460b89c40668643a1 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Tue, 16 Dec 2025 09:08:33 +0100 Subject: [PATCH 07/16] Fetch usage information --- src/parxy_core/drivers/llamaparse.py | 176 ++++++++++++++++++++++----- 1 file changed, 145 insertions(+), 31 deletions(-) diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py index 33455fd..ae7c26e 100644 --- 
a/src/parxy_core/drivers/llamaparse.py +++ b/src/parxy_core/drivers/llamaparse.py @@ -1,5 +1,7 @@ import io -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional + +import requests from parxy_core.models.config import LlamaParseConfig from parxy_core.tracing.utils import trace_with_output @@ -76,6 +78,96 @@ def _initialize_driver(self): **self._config.model_dump() if self._config else {}, ) + def _fetch_usage_metrics(self, job_id: str) -> Optional[dict]: + """Fetch actual usage metrics from the LlamaParse beta API. + + Parameters + ---------- + job_id : str + The job ID to fetch metrics for + + Returns + ------- + Optional[dict] + Dictionary with 'total_cost', 'cost_unit', 'parsing_mode_counts', and 'mode_details' + Returns None if organization_id is not configured or if the API call fails + """ + # Only fetch if organization_id is configured + if not self._config or not self._config.organization_id: + return None + + try: + # Construct the beta API endpoint + base_url = self._config.base_url.rstrip('/') + endpoint = f'{base_url}/api/v1/beta/usage-metrics' + + # Prepare request parameters + params = { + 'organization_id': self._config.organization_id, + 'event_aggregation_key': job_id, + } + + # Prepare headers with authentication + headers = { + 'Authorization': f'Bearer {self._config.api_key.get_secret_value()}', + 'Content-Type': 'application/json', + } + + # Make the API request + response = requests.get( + endpoint, params=params, headers=headers, timeout=10 + ) + response.raise_for_status() + + data = response.json() + items = data.get('items', []) + + if not items: + return None + + # Aggregate usage data by parsing mode + parsing_mode_counts = {} + mode_details = [] + + for item in items: + if item.get('event_type') == 'pages_parsed': + mode = item.get('properties', {}).get('mode', 'unknown') + pages = item.get('value', 0) + model = item.get('properties', {}).get('model', 'unknown') + + # Count pages per mode + 
parsing_mode_counts[mode] = parsing_mode_counts.get(mode, 0) + pages + + # Store detailed info + mode_details.append( + { + 'mode': mode, + 'model': model, + 'pages': pages, + 'day': item.get('day'), + } + ) + + # Calculate total cost based on actual usage + total_cost = 0 + for mode, count in parsing_mode_counts.items(): + credits_per_page = _credits_per_parsing_mode.get(mode, 3) + total_cost += credits_per_page * count + + return { + 'total_cost': total_cost, + 'cost_unit': 'credits', + 'parsing_mode_counts': parsing_mode_counts, + 'mode_details': mode_details, + } + + except Exception as e: + # Log the error but don't fail the parsing + self._logger.warning( + f'Failed to fetch usage metrics from beta API: {str(e)}' + ) + return None + def _handle( self, file: str | io.BytesIO | bytes, @@ -160,36 +252,58 @@ def _handle( converted_document.parsing_metadata['job_error_code'] = res.error_code converted_document.parsing_metadata['job_status'] = res.status - # Extract parsing modes from each page's source_data - parsing_modes = {} - parsing_mode_counts = {} - - for page in converted_document.pages: - if page.source_data and 'parsingMode' in page.source_data: - mode = page.source_data['parsingMode'] - parsing_modes[page.number] = mode - - # Count pages per parsing mode - if mode in parsing_mode_counts: - parsing_mode_counts[mode] += 1 - else: - parsing_mode_counts[mode] = 1 - - if parsing_modes: - converted_document.parsing_metadata['page_parsing_modes'] = parsing_modes - converted_document.parsing_metadata['parsing_mode_counts'] = ( - parsing_mode_counts - ) - - # Calculate cost estimation based on parsing modes - total_cost = 0 - for mode, count in parsing_mode_counts.items(): - # Use the credit cost from the dictionary, or default to 3 if not recognized - credits_per_page = _credits_per_parsing_mode.get(mode, 3) - total_cost += credits_per_page * count - - converted_document.parsing_metadata['cost_estimation'] = total_cost - 
converted_document.parsing_metadata['cost_estimation_unit'] = 'credits' + # Try to fetch actual usage metrics from beta API if organization_id is configured + usage_metrics = self._fetch_usage_metrics(res.job_id) + + if usage_metrics: + # Use actual metrics from the API + converted_document.parsing_metadata['cost_estimation'] = usage_metrics[ + 'total_cost' + ] + converted_document.parsing_metadata['cost_estimation_unit'] = usage_metrics[ + 'cost_unit' + ] + converted_document.parsing_metadata['parsing_mode_counts'] = usage_metrics[ + 'parsing_mode_counts' + ] + converted_document.parsing_metadata['cost_data_source'] = 'beta_api' + converted_document.parsing_metadata['usage_details'] = usage_metrics[ + 'mode_details' + ] + else: + # Fall back to estimation from page source_data + parsing_modes = {} + parsing_mode_counts = {} + + for page in converted_document.pages: + if page.source_data and 'parsingMode' in page.source_data: + mode = page.source_data['parsingMode'] + parsing_modes[page.number] = mode + + # Count pages per parsing mode + if mode in parsing_mode_counts: + parsing_mode_counts[mode] += 1 + else: + parsing_mode_counts[mode] = 1 + + if parsing_modes: + converted_document.parsing_metadata['page_parsing_modes'] = ( + parsing_modes + ) + converted_document.parsing_metadata['parsing_mode_counts'] = ( + parsing_mode_counts + ) + + # Calculate cost estimation based on parsing modes + total_cost = 0 + for mode, count in parsing_mode_counts.items(): + # Use the credit cost from the dictionary, or default to 3 if not recognized + credits_per_page = _credits_per_parsing_mode.get(mode, 3) + total_cost += credits_per_page * count + + converted_document.parsing_metadata['cost_estimation'] = total_cost + converted_document.parsing_metadata['cost_estimation_unit'] = 'credits' + converted_document.parsing_metadata['cost_data_source'] = 'estimation' return converted_document From 2cd71fb7aee3044af8ca9717ab9445e68647f72f Mon Sep 17 00:00:00 2001 From: Alessio Vertemati 
Date: Fri, 2 Jan 2026 10:30:54 +0100 Subject: [PATCH 08/16] Support parsing mode for LLMWhisperer --- src/parxy_core/drivers/llmwhisperer.py | 10 +- src/parxy_core/models/config.py | 3 + tests/drivers/test_llmwhisperer.py | 192 +++++++++++++++++++++---- 3 files changed, 177 insertions(+), 28 deletions(-) diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py index ee78519..cbadda0 100644 --- a/src/parxy_core/drivers/llmwhisperer.py +++ b/src/parxy_core/drivers/llmwhisperer.py @@ -55,6 +55,10 @@ def _initialize_driver(self): "Install with 'pip install parxy[llmwhisperer]'" ) from e + # Prepare config for client initialization, excluding mode (which is used per-request) + config_dict = self._config.model_dump() if self._config else {} + config_dict.pop('mode', None) # Remove mode as it's not a client init parameter + self.__client = LLMWhispererClientV2( api_key=self._config.api_key.get_secret_value() if self._config and self._config.api_key @@ -79,7 +83,7 @@ def _handle( raw : bool, optional If True, return the raw response dict from LLMWhisperer instead of a `Document`. Default is False. **kwargs : - Additional arguments passed to the LLMWhisperer client (e.g., `wait_timeout`). + Additional arguments passed to the LLMWhisperer client (e.g., `wait_timeout`, `mode`). 
Returns ------- @@ -94,6 +98,9 @@ def _handle( self._validate_level(level) + # Determine the parsing mode: kwargs takes precedence over config + parsing_mode = kwargs.pop('mode', None) or (getattr(self._config, 'mode', 'form') if self._config else 'form') + try: filename, stream = self.handle_file_input(file) with self._trace_parse(filename, stream, **kwargs) as span: @@ -102,6 +109,7 @@ def _handle( stream=io.BytesIO(stream), wait_for_completion=True, wait_timeout=200, # TODO: Handle configuration of args + mode=parsing_mode, # wait_timeout=kwargs.get("wait_timeout", 200), # **kwargs, ) diff --git a/src/parxy_core/models/config.py b/src/parxy_core/models/config.py index 88a2d46..535025e 100644 --- a/src/parxy_core/models/config.py +++ b/src/parxy_core/models/config.py @@ -167,6 +167,9 @@ class LlmWhispererConfig(BaseConfig): logging_level: Optional[str] = 'INFO' """The logging level for the client. Can be "DEBUG", "INFO", "WARNING" or "ERROR". Default "INFO".""" + mode: Optional[str] = 'form' + """Default parsing mode. 
Can be high_quality, form, low_cost or native_text""" + model_config = SettingsConfigDict( env_prefix='parxy_llmwhisperer_', env_file='.env', extra='ignore' ) diff --git a/tests/drivers/test_llmwhisperer.py b/tests/drivers/test_llmwhisperer.py index a9b182e..e420e10 100644 --- a/tests/drivers/test_llmwhisperer.py +++ b/tests/drivers/test_llmwhisperer.py @@ -118,30 +118,168 @@ def test_llmwhisperer_driver_tracing_span_created(self, mock_tracer): assert count_call[0][0] == 'documents.processed' assert count_call[1]['driver'] == 'LlmWhispererDriver' - @patch('parxy_core.drivers.abstract_driver.tracer') - def test_llmwhisperer_driver_tracing_exception_recorded(self, mock_tracer): - # Setup mocks - mock_span = MagicMock() - mock_span.__enter__ = Mock(return_value=mock_span) - mock_span.__exit__ = Mock(return_value=False) - mock_tracer.span = Mock(return_value=mock_span) - mock_tracer.count = Mock() - mock_tracer.error = Mock() - - driver = LlmWhispererDriver(LlmWhispererConfig()) - path = self.__fixture_path('non-existing-file.pdf') - - # Attempt to parse non-existing file - with pytest.raises(FileNotFoundException): - driver.parse(path) - - # Verify error was logged via tracer.error - mock_tracer.error.assert_called_once() - error_call = mock_tracer.error.call_args - assert error_call[0][0] == 'Parsing failed' - - # Verify documents.failures counter was incremented - mock_tracer.count.assert_called_once() - count_call = mock_tracer.count.call_args - assert count_call[0][0] == 'documents.failures' - assert count_call[1]['driver'] == 'LlmWhispererDriver' + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_tracing_exception_recorded(self, mock_tracer): + # Setup mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = 
LlmWhispererDriver(LlmWhispererConfig()) + path = self.__fixture_path('non-existing-file.pdf') + + # Attempt to parse non-existing file + with pytest.raises(FileNotFoundException): + driver.parse(path) + + # Verify error was logged via tracer.error + mock_tracer.error.assert_called_once() + error_call = mock_tracer.error.call_args + assert error_call[0][0] == 'Parsing failed' + + # Verify documents.failures counter was incremented + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.failures' + assert count_call[1]['driver'] == 'LlmWhispererDriver' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_mode_from_kwargs(self, mock_tracer, mock_client_class): + """Test that mode parameter from kwargs is passed to whisper method""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response + mock_response = { + 'extraction': { + 'result_text': 'Test content\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}} + } + } + mock_client.whisper.return_value = mock_response + mock_client.get_usage_info.return_value = None + + # Create driver with default mode in config + config = LlmWhispererConfig(mode='form') + driver = LlmWhispererDriver(config) + + # Use bytes input instead of file path to avoid file I/O + test_data = b'%PDF-1.4 test content' + + # Parse with mode override in kwargs + document = driver.parse(test_data, level='page', mode='high_quality') + + # Verify whisper was called with the mode from kwargs + mock_client.whisper.assert_called_once() + call_kwargs = 
mock_client.whisper.call_args[1] + assert call_kwargs['mode'] == 'high_quality' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_mode_from_config(self, mock_tracer, mock_client_class): + """Test that mode parameter from config is used when not in kwargs""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response + mock_response = { + 'extraction': { + 'result_text': 'Test content\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}} + } + } + mock_client.whisper.return_value = mock_response + mock_client.get_usage_info.return_value = None + + # Create driver with specific mode in config + config = LlmWhispererConfig(mode='low_cost') + driver = LlmWhispererDriver(config) + + # Use bytes input instead of file path to avoid file I/O + test_data = b'%PDF-1.4 test content' + + # Parse without mode in kwargs + document = driver.parse(test_data, level='page') + + # Verify whisper was called with the mode from config + mock_client.whisper.assert_called_once() + call_kwargs = mock_client.whisper.call_args[1] + assert call_kwargs['mode'] == 'low_cost' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_mode_cost_estimation(self, mock_tracer, mock_client_class): + """Test that cost estimation uses the correct parsing mode""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() 
+ + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response with 2 pages + mock_response = { + 'extraction': { + 'result_text': 'Page 1\n<<<\x0cPage 2\n<<<\x0c', + 'metadata': { + '0': {'page_number': 0}, + '1': {'page_number': 1} + } + } + } + mock_client.whisper.return_value = mock_response + + # Mock usage info + mock_usage_info = { + 'quota': 1000, + 'used': 50, + 'remaining': 950 + } + mock_client.get_usage_info.return_value = mock_usage_info + + # Create driver with native_text mode (1/1000 credits per page) + config = LlmWhispererConfig(mode='native_text') + driver = LlmWhispererDriver(config) + + # Use bytes input instead of file path to avoid file I/O + test_data = b'%PDF-1.4 test content' + + # Parse document + document = driver.parse(test_data, level='page') + + # Verify cost estimation metadata + assert document.parsing_metadata is not None + assert 'parsing_mode' in document.parsing_metadata + assert document.parsing_metadata['parsing_mode'] == 'native_text' + assert 'cost_estimation' in document.parsing_metadata + # 2 pages * (1/1000) credits per page = 0.002 credits + assert document.parsing_metadata['cost_estimation'] == 0.002 + assert document.parsing_metadata['cost_estimation_unit'] == 'credits' + assert document.parsing_metadata['pages_processed'] == 2 From b985a7cd521107614c4b45c10074ad1584bb1abb Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 2 Jan 2026 10:35:36 +0100 Subject: [PATCH 09/16] Estimate costs in LLMWhisperer based on mode --- src/parxy_core/drivers/llmwhisperer.py | 53 +++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py index cbadda0..1c2e9de 100644 --- a/src/parxy_core/drivers/llmwhisperer.py +++ b/src/parxy_core/drivers/llmwhisperer.py @@ -25,6 +25,17 @@ from parxy_core.models import Document, Page +_credits_per_parsing_mode_per_page = { 
+ # https://unstract.com/pricing/ + # https://docs.unstract.com/llmwhisperer/llm_whisperer/llm_whisperer_modes/ + 'native_text': 1 / 1000, + 'low_cost': 5 / 1000, + 'high_quality': 10 / 1000, + 'form': 15 / 1000, + 'table': 15 / 1000, # assumed to be the same as form +} + + class LlmWhispererDriver(Driver): """Unstract LLMWhisperer API driver implementation. @@ -63,9 +74,28 @@ def _initialize_driver(self): api_key=self._config.api_key.get_secret_value() if self._config and self._config.api_key else None, - **self._config.model_dump() if self._config else {}, + **config_dict, ) + def _fetch_usage_info(self) -> dict | None: + """Fetch usage information from the LLMWhisperer API. + + Returns + ------- + dict | None + Dictionary with usage information including quota, page counts, and subscription plan. + Returns None if the API call fails. + """ + try: + usage_info = self.__client.get_usage_info() + return usage_info + except Exception as e: + # Log the error but don't fail the parsing + self._logger.warning( + f'Failed to fetch usage information from LLMWhisperer API: {str(e)}' + ) + return None + def _handle( self, file: str | io.BytesIO | bytes, @@ -132,6 +162,27 @@ def _handle( doc = llmwhisperer_to_parxy(res) doc.filename = filename + + # Initialize parsing_metadata if needed + if doc.parsing_metadata is None: + doc.parsing_metadata = {} + + # Calculate cost based on number of pages and parsing mode + num_pages = len(doc.pages) + credits_per_page = _credits_per_parsing_mode_per_page.get(parsing_mode, 10 / 1000) + estimated_cost = credits_per_page * num_pages + + doc.parsing_metadata['parsing_mode'] = parsing_mode + doc.parsing_metadata['cost_estimation'] = estimated_cost + doc.parsing_metadata['cost_estimation_unit'] = 'credits' + doc.parsing_metadata['pages_processed'] = num_pages + + # Fetch usage information from the API + usage_info = self._fetch_usage_info() + + if usage_info: + doc.parsing_metadata['usage_info'] = usage_info + return doc From 
3d02a18338c26ee913faf6a6e6cc8add3600979c Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 2 Jan 2026 10:46:27 +0100 Subject: [PATCH 10/16] Include whisper metadata --- src/parxy_core/drivers/llmwhisperer.py | 36 ++++++++++- tests/drivers/test_llmwhisperer.py | 90 +++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 4 deletions(-) diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py index 1c2e9de..4dfad08 100644 --- a/src/parxy_core/drivers/llmwhisperer.py +++ b/src/parxy_core/drivers/llmwhisperer.py @@ -167,12 +167,44 @@ def _handle( if doc.parsing_metadata is None: doc.parsing_metadata = {} + # Extract whisper-specific metadata from the response + if 'whisper_hash' in res: + doc.parsing_metadata['whisper_hash'] = res['whisper_hash'] + + if 'mode' in res: + doc.parsing_metadata['parsing_mode'] = res['mode'] + else: + doc.parsing_metadata['parsing_mode'] = parsing_mode + + # Extract processing details + whisper_details = {} + if 'completed_at' in res: + whisper_details['completed_at'] = res['completed_at'] + if 'processing_started_at' in res: + whisper_details['processing_started_at'] = res['processing_started_at'] + if 'processing_time_in_seconds' in res: + whisper_details['processing_time_in_seconds'] = res['processing_time_in_seconds'] + if 'total_pages' in res: + whisper_details['total_pages'] = res['total_pages'] + if 'requested_pages' in res: + whisper_details['requested_pages'] = res['requested_pages'] + if 'processed_pages' in res: + whisper_details['processed_pages'] = res['processed_pages'] + if 'upload_file_size_in_kb' in res: + whisper_details['upload_file_size_in_kb'] = res['upload_file_size_in_kb'] + if 'tag' in res: + whisper_details['tag'] = res['tag'] + + if whisper_details: + doc.parsing_metadata['whisper_details'] = whisper_details + # Calculate cost based on number of pages and parsing mode + # Use the actual mode from the response if available, otherwise use the requested mode + 
actual_mode = res.get('mode', parsing_mode) num_pages = len(doc.pages) - credits_per_page = _credits_per_parsing_mode_per_page.get(parsing_mode, 10 / 1000) + credits_per_page = _credits_per_parsing_mode_per_page.get(actual_mode, 10 / 1000) estimated_cost = credits_per_page * num_pages - doc.parsing_metadata['parsing_mode'] = parsing_mode doc.parsing_metadata['cost_estimation'] = estimated_cost doc.parsing_metadata['cost_estimation_unit'] = 'credits' doc.parsing_metadata['pages_processed'] = num_pages diff --git a/tests/drivers/test_llmwhisperer.py b/tests/drivers/test_llmwhisperer.py index e420e10..0a2881e 100644 --- a/tests/drivers/test_llmwhisperer.py +++ b/tests/drivers/test_llmwhisperer.py @@ -244,7 +244,7 @@ def test_llmwhisperer_driver_mode_cost_estimation(self, mock_tracer, mock_client mock_client = MagicMock() mock_client_class.return_value = mock_client - # Mock whisper response with 2 pages + # Mock whisper response with 2 pages including detail fields mock_response = { 'extraction': { 'result_text': 'Page 1\n<<<\x0cPage 2\n<<<\x0c', @@ -252,7 +252,17 @@ def test_llmwhisperer_driver_mode_cost_estimation(self, mock_tracer, mock_client '0': {'page_number': 0}, '1': {'page_number': 1} } - } + }, + 'whisper_hash': 'abc123def456', + 'mode': 'native_text', + 'completed_at': 'Mon, 10 Feb 2025 10:40:58 GMT', + 'processing_started_at': 'Mon, 10 Feb 2025 10:40:53 GMT', + 'processing_time_in_seconds': 5.0, + 'total_pages': 2, + 'requested_pages': 2, + 'processed_pages': 2, + 'upload_file_size_in_kb': 618.488, + 'tag': 'test_tag' } mock_client.whisper.return_value = mock_response @@ -283,3 +293,79 @@ def test_llmwhisperer_driver_mode_cost_estimation(self, mock_tracer, mock_client assert document.parsing_metadata['cost_estimation'] == 0.002 assert document.parsing_metadata['cost_estimation_unit'] == 'credits' assert document.parsing_metadata['pages_processed'] == 2 + + # Verify whisper-specific metadata + assert 'whisper_hash' in document.parsing_metadata + assert 
document.parsing_metadata['whisper_hash'] == 'abc123def456' + + # Verify whisper details + assert 'whisper_details' in document.parsing_metadata + whisper_details = document.parsing_metadata['whisper_details'] + assert whisper_details['completed_at'] == 'Mon, 10 Feb 2025 10:40:58 GMT' + assert whisper_details['processing_started_at'] == 'Mon, 10 Feb 2025 10:40:53 GMT' + assert whisper_details['processing_time_in_seconds'] == 5.0 + assert whisper_details['total_pages'] == 2 + assert whisper_details['requested_pages'] == 2 + assert whisper_details['processed_pages'] == 2 + assert whisper_details['upload_file_size_in_kb'] == 618.488 + assert whisper_details['tag'] == 'test_tag' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_metadata_extraction(self, mock_tracer, mock_client_class): + """Test that whisper metadata is properly extracted from response""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response with partial metadata (some fields missing) + mock_response = { + 'extraction': { + 'result_text': 'Test content\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}} + }, + 'whisper_hash': 'xyz789', + 'mode': 'high_quality', + 'processing_time_in_seconds': 3.5, + 'total_pages': 1 + # Other fields intentionally missing to test robustness + } + mock_client.whisper.return_value = mock_response + mock_client.get_usage_info.return_value = None + + # Create driver + config = LlmWhispererConfig(mode='form') + driver = LlmWhispererDriver(config) + + # Use bytes input + test_data = b'%PDF-1.4 test content' + + # Parse document + document = 
driver.parse(test_data, level='page') + + # Verify whisper hash is extracted + assert 'whisper_hash' in document.parsing_metadata + assert document.parsing_metadata['whisper_hash'] == 'xyz789' + + # Verify mode from response takes precedence over config + assert document.parsing_metadata['parsing_mode'] == 'high_quality' + + # Verify whisper details only contains fields that were present + assert 'whisper_details' in document.parsing_metadata + whisper_details = document.parsing_metadata['whisper_details'] + assert whisper_details['processing_time_in_seconds'] == 3.5 + assert whisper_details['total_pages'] == 1 + + # These fields should not be present since they weren't in the response + assert 'completed_at' not in whisper_details + assert 'processing_started_at' not in whisper_details + assert 'requested_pages' not in whisper_details + assert 'tag' not in whisper_details From 5ae60c9d31e42f9e5f401d798d2c90ca9e097aa6 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 2 Jan 2026 12:19:36 +0100 Subject: [PATCH 11/16] Add credits usage estimation to landing ai driver --- src/parxy_core/drivers/landingai.py | 39 +++++++++++- tests/drivers/test_landingai.py | 98 ++++++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 3 deletions(-) diff --git a/src/parxy_core/drivers/landingai.py b/src/parxy_core/drivers/landingai.py index cb180ef..f58fe83 100644 --- a/src/parxy_core/drivers/landingai.py +++ b/src/parxy_core/drivers/landingai.py @@ -55,7 +55,44 @@ def _handle( service=self.__class__, ) from aex - return landingaiade_to_parxy(parse_response) + doc = landingaiade_to_parxy(parse_response) + + # Initialize parsing_metadata if needed + if doc.parsing_metadata is None: + doc.parsing_metadata = {} + + # Extract cost information from metadata + # According to https://docs.landing.ai/ade/ade-json-response.md + # metadata contains: credit_usage, duration_ms, filename, job_id, page_count, version + if parse_response.metadata: + metadata = 
parse_response.metadata + + # Extract cost estimation from credit_usage + if hasattr(metadata, 'credit_usage') and metadata.credit_usage is not None: + doc.parsing_metadata['cost_estimation'] = metadata.credit_usage + doc.parsing_metadata['cost_estimation_unit'] = 'credits' + + # Extract processing details + ade_details = {} + if hasattr(metadata, 'duration_ms') and metadata.duration_ms is not None: + ade_details['duration_ms'] = metadata.duration_ms + if hasattr(metadata, 'job_id') and metadata.job_id is not None: + ade_details['job_id'] = metadata.job_id + if hasattr(metadata, 'page_count') and metadata.page_count is not None: + ade_details['page_count'] = metadata.page_count + if hasattr(metadata, 'version') and metadata.version is not None: + ade_details['version'] = metadata.version + if hasattr(metadata, 'filename') and metadata.filename is not None: + ade_details['filename'] = metadata.filename + + # Add failed_pages if present (for partial content responses) + if hasattr(metadata, 'failed_pages') and metadata.failed_pages is not None: + ade_details['failed_pages'] = metadata.failed_pages + + if ade_details: + doc.parsing_metadata['ade_details'] = ade_details + + return doc @trace_with_output('converting') diff --git a/tests/drivers/test_landingai.py b/tests/drivers/test_landingai.py index 84f8d25..61f948e 100644 --- a/tests/drivers/test_landingai.py +++ b/tests/drivers/test_landingai.py @@ -88,7 +88,10 @@ def test_landingai_driver_read_empty_document_page_level(self): assert document.metadata is None assert len(document.pages) == 1 assert isinstance(document.pages[0], Page) - assert document.pages[0].text == '1' + stripped_text = ( + re.sub(r'<[^>]+>', '', document.pages[0].text).replace('\n', '').strip() + ) + assert stripped_text == '1' def test_landingai_driver_read_document(self): driver = LandingAIADEDriver(LandingAIConfig()) @@ -108,7 +111,7 @@ def test_landingai_driver_read_document(self): stripped_text = re.sub(r'<[^>]+>', '', 
document.pages[0].text).strip() assert ( stripped_text - == 'This is the header\n\n\nThis is a test PDF to be used as input in unit\ntests\n\n\n## This is a heading 1\nThis is a paragraph below heading 1\n\n\n1' + == 'This is the header\n\n\nThis is a test PDF to be used as input in unit tests\n\n\n# This is a heading 1\nThis is a paragraph below heading 1\n\n\n1' ) @patch('parxy_core.drivers.abstract_driver.tracer') @@ -171,3 +174,94 @@ def test_landingai_driver_tracing_exception_recorded(self, mock_tracer): count_call = mock_tracer.count.call_args assert count_call[0][0] == 'documents.failures' assert count_call[1]['driver'] == 'LandingAIADEDriver' + + @patch('landingai_ade.LandingAIADE') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class): + """Test that cost estimation is extracted from parse response metadata""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock parse response with metadata including credit usage + # Based on https://docs.landing.ai/ade/ade-json-response.md + mock_metadata = MagicMock() + mock_metadata.credit_usage = 6.0 + mock_metadata.duration_ms = 24382 + mock_metadata.filename = 'test-document.pdf' + mock_metadata.job_id = 'td8wu72tq2g9l9tfgkwn3q3kp' + mock_metadata.page_count = 2 + mock_metadata.version = 'dpt-2-20251103' + mock_metadata.model_dump = Mock(return_value={ + 'credit_usage': 6.0, + 'duration_ms': 24382, + 'filename': 'test-document.pdf', + 'job_id': 'td8wu72tq2g9l9tfgkwn3q3kp', + 'page_count': 2, + 'version': 'dpt-2-20251103' + }) + + mock_chunk_1 = MagicMock() + mock_chunk_1.markdown = 'Page 1 content' + mock_chunk_1.type = 'text' 
+ mock_chunk_1.grounding = MagicMock() + mock_chunk_1.grounding.page = 0 + mock_chunk_1.grounding.box = MagicMock() + mock_chunk_1.grounding.box.left = 0.1 + mock_chunk_1.grounding.box.top = 0.1 + mock_chunk_1.grounding.box.right = 0.9 + mock_chunk_1.grounding.box.bottom = 0.5 + mock_chunk_1.model_dump = Mock(return_value={}) + + mock_chunk_2 = MagicMock() + mock_chunk_2.markdown = 'Page 2 content' + mock_chunk_2.type = 'text' + mock_chunk_2.grounding = MagicMock() + mock_chunk_2.grounding.page = 1 + mock_chunk_2.grounding.box = MagicMock() + mock_chunk_2.grounding.box.left = 0.1 + mock_chunk_2.grounding.box.top = 0.1 + mock_chunk_2.grounding.box.right = 0.9 + mock_chunk_2.grounding.box.bottom = 0.5 + mock_chunk_2.model_dump = Mock(return_value={}) + + mock_response_metadata = MagicMock() + mock_response_metadata.filename = 'test-document.pdf' + mock_response_metadata.model_dump = Mock(return_value={}) + + mock_response = MagicMock() + mock_response.chunks = [mock_chunk_1, mock_chunk_2] + mock_response.metadata = mock_metadata + mock_response.model_dump_json = Mock(return_value='{}') + + mock_client.parse.return_value = mock_response + + # Create driver + driver = LandingAIADEDriver(LandingAIConfig()) + + # Parse document + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path) + + # Verify cost estimation metadata + assert document.parsing_metadata is not None + assert 'cost_estimation' in document.parsing_metadata + assert document.parsing_metadata['cost_estimation'] == 6.0 + assert document.parsing_metadata['cost_estimation_unit'] == 'credits' + + # Verify ADE details + assert 'ade_details' in document.parsing_metadata + ade_details = document.parsing_metadata['ade_details'] + assert ade_details['duration_ms'] == 24382 + assert ade_details['filename'] == 'test-document.pdf' + assert ade_details['job_id'] == 'td8wu72tq2g9l9tfgkwn3q3kp' + assert ade_details['page_count'] == 2 + assert ade_details['version'] == 'dpt-2-20251103' From 
b5e09802b2c83701d3395e3ad92bb97e39d17021 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 2 Jan 2026 12:20:08 +0100 Subject: [PATCH 12/16] Ignore git and possible worktree directories when running tests --- pytest.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 3904651..74e381a 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,7 @@ [pytest] filterwarnings = ignore:.*Swig.* - ignore:.*no current event loop.* \ No newline at end of file + ignore:.*no current event loop.* + + +norecursedirs = .git worktrees parxy.worktrees From 762845f524ae86043dcde47e390140096fe6149d Mon Sep 17 00:00:00 2001 From: avvertix <5672748+avvertix@users.noreply.github.com> Date: Fri, 2 Jan 2026 11:21:07 +0000 Subject: [PATCH 13/16] Fix styling --- src/parxy_core/drivers/landingai.py | 14 +- src/parxy_core/drivers/llmwhisperer.py | 24 +- tests/drivers/test_landingai.py | 34 +- tests/drivers/test_llmwhisperer.py | 501 ++++++++++++------------- 4 files changed, 290 insertions(+), 283 deletions(-) diff --git a/src/parxy_core/drivers/landingai.py b/src/parxy_core/drivers/landingai.py index f58fe83..270cd24 100644 --- a/src/parxy_core/drivers/landingai.py +++ b/src/parxy_core/drivers/landingai.py @@ -56,22 +56,22 @@ def _handle( ) from aex doc = landingaiade_to_parxy(parse_response) - + # Initialize parsing_metadata if needed if doc.parsing_metadata is None: doc.parsing_metadata = {} - + # Extract cost information from metadata # According to https://docs.landing.ai/ade/ade-json-response.md # metadata contains: credit_usage, duration_ms, filename, job_id, page_count, version if parse_response.metadata: metadata = parse_response.metadata - + # Extract cost estimation from credit_usage if hasattr(metadata, 'credit_usage') and metadata.credit_usage is not None: doc.parsing_metadata['cost_estimation'] = metadata.credit_usage doc.parsing_metadata['cost_estimation_unit'] = 'credits' - + # Extract processing details ade_details = {} if 
hasattr(metadata, 'duration_ms') and metadata.duration_ms is not None: @@ -84,14 +84,14 @@ def _handle( ade_details['version'] = metadata.version if hasattr(metadata, 'filename') and metadata.filename is not None: ade_details['filename'] = metadata.filename - + # Add failed_pages if present (for partial content responses) if hasattr(metadata, 'failed_pages') and metadata.failed_pages is not None: ade_details['failed_pages'] = metadata.failed_pages - + if ade_details: doc.parsing_metadata['ade_details'] = ade_details - + return doc diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py index 4dfad08..adc96df 100644 --- a/src/parxy_core/drivers/llmwhisperer.py +++ b/src/parxy_core/drivers/llmwhisperer.py @@ -28,11 +28,11 @@ _credits_per_parsing_mode_per_page = { # https://unstract.com/pricing/ # https://docs.unstract.com/llmwhisperer/llm_whisperer/llm_whisperer_modes/ - 'native_text': 1 / 1000, + 'native_text': 1 / 1000, 'low_cost': 5 / 1000, 'high_quality': 10 / 1000, 'form': 15 / 1000, - 'table': 15 / 1000, # assumed to be the same as form + 'table': 15 / 1000, # assumed to be the same as form } @@ -69,7 +69,7 @@ def _initialize_driver(self): # Prepare config for client initialization, excluding mode (which is used per-request) config_dict = self._config.model_dump() if self._config else {} config_dict.pop('mode', None) # Remove mode as it's not a client init parameter - + self.__client = LLMWhispererClientV2( api_key=self._config.api_key.get_secret_value() if self._config and self._config.api_key @@ -129,7 +129,9 @@ def _handle( self._validate_level(level) # Determine the parsing mode: kwargs takes precedence over config - parsing_mode = kwargs.pop('mode', None) or (getattr(self._config, 'mode', 'form') if self._config else 'form') + parsing_mode = kwargs.pop('mode', None) or ( + getattr(self._config, 'mode', 'form') if self._config else 'form' + ) try: filename, stream = self.handle_file_input(file) @@ -170,12 +172,12 @@ def 
_handle( # Extract whisper-specific metadata from the response if 'whisper_hash' in res: doc.parsing_metadata['whisper_hash'] = res['whisper_hash'] - + if 'mode' in res: doc.parsing_metadata['parsing_mode'] = res['mode'] else: doc.parsing_metadata['parsing_mode'] = parsing_mode - + # Extract processing details whisper_details = {} if 'completed_at' in res: @@ -183,7 +185,9 @@ def _handle( if 'processing_started_at' in res: whisper_details['processing_started_at'] = res['processing_started_at'] if 'processing_time_in_seconds' in res: - whisper_details['processing_time_in_seconds'] = res['processing_time_in_seconds'] + whisper_details['processing_time_in_seconds'] = res[ + 'processing_time_in_seconds' + ] if 'total_pages' in res: whisper_details['total_pages'] = res['total_pages'] if 'requested_pages' in res: @@ -194,7 +198,7 @@ def _handle( whisper_details['upload_file_size_in_kb'] = res['upload_file_size_in_kb'] if 'tag' in res: whisper_details['tag'] = res['tag'] - + if whisper_details: doc.parsing_metadata['whisper_details'] = whisper_details @@ -202,7 +206,9 @@ def _handle( # Use the actual mode from the response if available, otherwise use the requested mode actual_mode = res.get('mode', parsing_mode) num_pages = len(doc.pages) - credits_per_page = _credits_per_parsing_mode_per_page.get(actual_mode, 10 / 1000) + credits_per_page = _credits_per_parsing_mode_per_page.get( + actual_mode, 10 / 1000 + ) estimated_cost = credits_per_page * num_pages doc.parsing_metadata['cost_estimation'] = estimated_cost diff --git a/tests/drivers/test_landingai.py b/tests/drivers/test_landingai.py index 61f948e..c371eb6 100644 --- a/tests/drivers/test_landingai.py +++ b/tests/drivers/test_landingai.py @@ -190,7 +190,7 @@ def test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class): # Setup client mock mock_client = MagicMock() mock_client_class.return_value = mock_client - + # Mock parse response with metadata including credit usage # Based on 
https://docs.landing.ai/ade/ade-json-response.md mock_metadata = MagicMock() @@ -200,15 +200,17 @@ def test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class): mock_metadata.job_id = 'td8wu72tq2g9l9tfgkwn3q3kp' mock_metadata.page_count = 2 mock_metadata.version = 'dpt-2-20251103' - mock_metadata.model_dump = Mock(return_value={ - 'credit_usage': 6.0, - 'duration_ms': 24382, - 'filename': 'test-document.pdf', - 'job_id': 'td8wu72tq2g9l9tfgkwn3q3kp', - 'page_count': 2, - 'version': 'dpt-2-20251103' - }) - + mock_metadata.model_dump = Mock( + return_value={ + 'credit_usage': 6.0, + 'duration_ms': 24382, + 'filename': 'test-document.pdf', + 'job_id': 'td8wu72tq2g9l9tfgkwn3q3kp', + 'page_count': 2, + 'version': 'dpt-2-20251103', + } + ) + mock_chunk_1 = MagicMock() mock_chunk_1.markdown = 'Page 1 content' mock_chunk_1.type = 'text' @@ -220,7 +222,7 @@ def test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class): mock_chunk_1.grounding.box.right = 0.9 mock_chunk_1.grounding.box.bottom = 0.5 mock_chunk_1.model_dump = Mock(return_value={}) - + mock_chunk_2 = MagicMock() mock_chunk_2.markdown = 'Page 2 content' mock_chunk_2.type = 'text' @@ -232,21 +234,21 @@ def test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class): mock_chunk_2.grounding.box.right = 0.9 mock_chunk_2.grounding.box.bottom = 0.5 mock_chunk_2.model_dump = Mock(return_value={}) - + mock_response_metadata = MagicMock() mock_response_metadata.filename = 'test-document.pdf' mock_response_metadata.model_dump = Mock(return_value={}) - + mock_response = MagicMock() mock_response.chunks = [mock_chunk_1, mock_chunk_2] mock_response.metadata = mock_metadata mock_response.model_dump_json = Mock(return_value='{}') - + mock_client.parse.return_value = mock_response # Create driver driver = LandingAIADEDriver(LandingAIConfig()) - + # Parse document path = self.__fixture_path('test-doc.pdf') document = driver.parse(path) @@ -256,7 +258,7 @@ def 
test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class): assert 'cost_estimation' in document.parsing_metadata assert document.parsing_metadata['cost_estimation'] == 6.0 assert document.parsing_metadata['cost_estimation_unit'] == 'credits' - + # Verify ADE details assert 'ade_details' in document.parsing_metadata ade_details = document.parsing_metadata['ade_details'] diff --git a/tests/drivers/test_llmwhisperer.py b/tests/drivers/test_llmwhisperer.py index 0a2881e..fe4d6fe 100644 --- a/tests/drivers/test_llmwhisperer.py +++ b/tests/drivers/test_llmwhisperer.py @@ -118,254 +118,253 @@ def test_llmwhisperer_driver_tracing_span_created(self, mock_tracer): assert count_call[0][0] == 'documents.processed' assert count_call[1]['driver'] == 'LlmWhispererDriver' - @patch('parxy_core.drivers.abstract_driver.tracer') - def test_llmwhisperer_driver_tracing_exception_recorded(self, mock_tracer): - # Setup mocks - mock_span = MagicMock() - mock_span.__enter__ = Mock(return_value=mock_span) - mock_span.__exit__ = Mock(return_value=False) - mock_tracer.span = Mock(return_value=mock_span) - mock_tracer.count = Mock() - mock_tracer.error = Mock() - - driver = LlmWhispererDriver(LlmWhispererConfig()) - path = self.__fixture_path('non-existing-file.pdf') - - # Attempt to parse non-existing file - with pytest.raises(FileNotFoundException): - driver.parse(path) - - # Verify error was logged via tracer.error - mock_tracer.error.assert_called_once() - error_call = mock_tracer.error.call_args - assert error_call[0][0] == 'Parsing failed' - - # Verify documents.failures counter was incremented - mock_tracer.count.assert_called_once() - count_call = mock_tracer.count.call_args - assert count_call[0][0] == 'documents.failures' - assert count_call[1]['driver'] == 'LlmWhispererDriver' - - @patch('unstract.llmwhisperer.LLMWhispererClientV2') - @patch('parxy_core.drivers.abstract_driver.tracer') - def test_llmwhisperer_driver_mode_from_kwargs(self, mock_tracer, 
mock_client_class): - """Test that mode parameter from kwargs is passed to whisper method""" - # Setup tracing mocks - mock_span = MagicMock() - mock_span.__enter__ = Mock(return_value=mock_span) - mock_span.__exit__ = Mock(return_value=False) - mock_tracer.span = Mock(return_value=mock_span) - mock_tracer.count = Mock() - mock_tracer.info = Mock() - - # Setup client mock - mock_client = MagicMock() - mock_client_class.return_value = mock_client - - # Mock whisper response - mock_response = { - 'extraction': { - 'result_text': 'Test content\n<<<\x0c', - 'metadata': {'0': {'page_number': 0}} - } - } - mock_client.whisper.return_value = mock_response - mock_client.get_usage_info.return_value = None - - # Create driver with default mode in config - config = LlmWhispererConfig(mode='form') - driver = LlmWhispererDriver(config) - - # Use bytes input instead of file path to avoid file I/O - test_data = b'%PDF-1.4 test content' - - # Parse with mode override in kwargs - document = driver.parse(test_data, level='page', mode='high_quality') - - # Verify whisper was called with the mode from kwargs - mock_client.whisper.assert_called_once() - call_kwargs = mock_client.whisper.call_args[1] - assert call_kwargs['mode'] == 'high_quality' - - @patch('unstract.llmwhisperer.LLMWhispererClientV2') - @patch('parxy_core.drivers.abstract_driver.tracer') - def test_llmwhisperer_driver_mode_from_config(self, mock_tracer, mock_client_class): - """Test that mode parameter from config is used when not in kwargs""" - # Setup tracing mocks - mock_span = MagicMock() - mock_span.__enter__ = Mock(return_value=mock_span) - mock_span.__exit__ = Mock(return_value=False) - mock_tracer.span = Mock(return_value=mock_span) - mock_tracer.count = Mock() - mock_tracer.info = Mock() - - # Setup client mock - mock_client = MagicMock() - mock_client_class.return_value = mock_client - - # Mock whisper response - mock_response = { - 'extraction': { - 'result_text': 'Test content\n<<<\x0c', - 'metadata': {'0': 
{'page_number': 0}} - } - } - mock_client.whisper.return_value = mock_response - mock_client.get_usage_info.return_value = None - - # Create driver with specific mode in config - config = LlmWhispererConfig(mode='low_cost') - driver = LlmWhispererDriver(config) - - # Use bytes input instead of file path to avoid file I/O - test_data = b'%PDF-1.4 test content' - - # Parse without mode in kwargs - document = driver.parse(test_data, level='page') - - # Verify whisper was called with the mode from config - mock_client.whisper.assert_called_once() - call_kwargs = mock_client.whisper.call_args[1] - assert call_kwargs['mode'] == 'low_cost' - - @patch('unstract.llmwhisperer.LLMWhispererClientV2') - @patch('parxy_core.drivers.abstract_driver.tracer') - def test_llmwhisperer_driver_mode_cost_estimation(self, mock_tracer, mock_client_class): - """Test that cost estimation uses the correct parsing mode""" - # Setup tracing mocks - mock_span = MagicMock() - mock_span.__enter__ = Mock(return_value=mock_span) - mock_span.__exit__ = Mock(return_value=False) - mock_tracer.span = Mock(return_value=mock_span) - mock_tracer.count = Mock() - mock_tracer.info = Mock() - - # Setup client mock - mock_client = MagicMock() - mock_client_class.return_value = mock_client - - # Mock whisper response with 2 pages including detail fields - mock_response = { - 'extraction': { - 'result_text': 'Page 1\n<<<\x0cPage 2\n<<<\x0c', - 'metadata': { - '0': {'page_number': 0}, - '1': {'page_number': 1} - } - }, - 'whisper_hash': 'abc123def456', - 'mode': 'native_text', - 'completed_at': 'Mon, 10 Feb 2025 10:40:58 GMT', - 'processing_started_at': 'Mon, 10 Feb 2025 10:40:53 GMT', - 'processing_time_in_seconds': 5.0, - 'total_pages': 2, - 'requested_pages': 2, - 'processed_pages': 2, - 'upload_file_size_in_kb': 618.488, - 'tag': 'test_tag' - } - mock_client.whisper.return_value = mock_response - - # Mock usage info - mock_usage_info = { - 'quota': 1000, - 'used': 50, - 'remaining': 950 - } - 
mock_client.get_usage_info.return_value = mock_usage_info - - # Create driver with native_text mode (1/1000 credits per page) - config = LlmWhispererConfig(mode='native_text') - driver = LlmWhispererDriver(config) - - # Use bytes input instead of file path to avoid file I/O - test_data = b'%PDF-1.4 test content' - - # Parse document - document = driver.parse(test_data, level='page') - - # Verify cost estimation metadata - assert document.parsing_metadata is not None - assert 'parsing_mode' in document.parsing_metadata - assert document.parsing_metadata['parsing_mode'] == 'native_text' - assert 'cost_estimation' in document.parsing_metadata - # 2 pages * (1/1000) credits per page = 0.002 credits - assert document.parsing_metadata['cost_estimation'] == 0.002 - assert document.parsing_metadata['cost_estimation_unit'] == 'credits' - assert document.parsing_metadata['pages_processed'] == 2 - - # Verify whisper-specific metadata - assert 'whisper_hash' in document.parsing_metadata - assert document.parsing_metadata['whisper_hash'] == 'abc123def456' - - # Verify whisper details - assert 'whisper_details' in document.parsing_metadata - whisper_details = document.parsing_metadata['whisper_details'] - assert whisper_details['completed_at'] == 'Mon, 10 Feb 2025 10:40:58 GMT' - assert whisper_details['processing_started_at'] == 'Mon, 10 Feb 2025 10:40:53 GMT' - assert whisper_details['processing_time_in_seconds'] == 5.0 - assert whisper_details['total_pages'] == 2 - assert whisper_details['requested_pages'] == 2 - assert whisper_details['processed_pages'] == 2 - assert whisper_details['upload_file_size_in_kb'] == 618.488 - assert whisper_details['tag'] == 'test_tag' - - @patch('unstract.llmwhisperer.LLMWhispererClientV2') - @patch('parxy_core.drivers.abstract_driver.tracer') - def test_llmwhisperer_driver_metadata_extraction(self, mock_tracer, mock_client_class): - """Test that whisper metadata is properly extracted from response""" - # Setup tracing mocks - mock_span = 
MagicMock() - mock_span.__enter__ = Mock(return_value=mock_span) - mock_span.__exit__ = Mock(return_value=False) - mock_tracer.span = Mock(return_value=mock_span) - mock_tracer.count = Mock() - mock_tracer.info = Mock() - - # Setup client mock - mock_client = MagicMock() - mock_client_class.return_value = mock_client - - # Mock whisper response with partial metadata (some fields missing) - mock_response = { - 'extraction': { - 'result_text': 'Test content\n<<<\x0c', - 'metadata': {'0': {'page_number': 0}} - }, - 'whisper_hash': 'xyz789', - 'mode': 'high_quality', - 'processing_time_in_seconds': 3.5, - 'total_pages': 1 - # Other fields intentionally missing to test robustness - } - mock_client.whisper.return_value = mock_response - mock_client.get_usage_info.return_value = None - - # Create driver - config = LlmWhispererConfig(mode='form') - driver = LlmWhispererDriver(config) - - # Use bytes input - test_data = b'%PDF-1.4 test content' - - # Parse document - document = driver.parse(test_data, level='page') - - # Verify whisper hash is extracted - assert 'whisper_hash' in document.parsing_metadata - assert document.parsing_metadata['whisper_hash'] == 'xyz789' - - # Verify mode from response takes precedence over config - assert document.parsing_metadata['parsing_mode'] == 'high_quality' - - # Verify whisper details only contains fields that were present - assert 'whisper_details' in document.parsing_metadata - whisper_details = document.parsing_metadata['whisper_details'] - assert whisper_details['processing_time_in_seconds'] == 3.5 - assert whisper_details['total_pages'] == 1 - - # These fields should not be present since they weren't in the response - assert 'completed_at' not in whisper_details - assert 'processing_started_at' not in whisper_details - assert 'requested_pages' not in whisper_details - assert 'tag' not in whisper_details + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_tracing_exception_recorded(self, 
mock_tracer): + # Setup mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = LlmWhispererDriver(LlmWhispererConfig()) + path = self.__fixture_path('non-existing-file.pdf') + + # Attempt to parse non-existing file + with pytest.raises(FileNotFoundException): + driver.parse(path) + + # Verify error was logged via tracer.error + mock_tracer.error.assert_called_once() + error_call = mock_tracer.error.call_args + assert error_call[0][0] == 'Parsing failed' + + # Verify documents.failures counter was incremented + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.failures' + assert count_call[1]['driver'] == 'LlmWhispererDriver' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_mode_from_kwargs(self, mock_tracer, mock_client_class): + """Test that mode parameter from kwargs is passed to whisper method""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response + mock_response = { + 'extraction': { + 'result_text': 'Test content\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}}, + } + } + mock_client.whisper.return_value = mock_response + mock_client.get_usage_info.return_value = None + + # Create driver with default mode in config + config = LlmWhispererConfig(mode='form') + driver = LlmWhispererDriver(config) + + # Use bytes input instead of file path to avoid file I/O + test_data = 
b'%PDF-1.4 test content' + + # Parse with mode override in kwargs + document = driver.parse(test_data, level='page', mode='high_quality') + + # Verify whisper was called with the mode from kwargs + mock_client.whisper.assert_called_once() + call_kwargs = mock_client.whisper.call_args[1] + assert call_kwargs['mode'] == 'high_quality' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_mode_from_config(self, mock_tracer, mock_client_class): + """Test that mode parameter from config is used when not in kwargs""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response + mock_response = { + 'extraction': { + 'result_text': 'Test content\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}}, + } + } + mock_client.whisper.return_value = mock_response + mock_client.get_usage_info.return_value = None + + # Create driver with specific mode in config + config = LlmWhispererConfig(mode='low_cost') + driver = LlmWhispererDriver(config) + + # Use bytes input instead of file path to avoid file I/O + test_data = b'%PDF-1.4 test content' + + # Parse without mode in kwargs + document = driver.parse(test_data, level='page') + + # Verify whisper was called with the mode from config + mock_client.whisper.assert_called_once() + call_kwargs = mock_client.whisper.call_args[1] + assert call_kwargs['mode'] == 'low_cost' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_mode_cost_estimation( + self, mock_tracer, mock_client_class + ): + """Test that cost estimation uses the correct parsing 
mode""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response with 2 pages including detail fields + mock_response = { + 'extraction': { + 'result_text': 'Page 1\n<<<\x0cPage 2\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}, '1': {'page_number': 1}}, + }, + 'whisper_hash': 'abc123def456', + 'mode': 'native_text', + 'completed_at': 'Mon, 10 Feb 2025 10:40:58 GMT', + 'processing_started_at': 'Mon, 10 Feb 2025 10:40:53 GMT', + 'processing_time_in_seconds': 5.0, + 'total_pages': 2, + 'requested_pages': 2, + 'processed_pages': 2, + 'upload_file_size_in_kb': 618.488, + 'tag': 'test_tag', + } + mock_client.whisper.return_value = mock_response + + # Mock usage info + mock_usage_info = {'quota': 1000, 'used': 50, 'remaining': 950} + mock_client.get_usage_info.return_value = mock_usage_info + + # Create driver with native_text mode (1/1000 credits per page) + config = LlmWhispererConfig(mode='native_text') + driver = LlmWhispererDriver(config) + + # Use bytes input instead of file path to avoid file I/O + test_data = b'%PDF-1.4 test content' + + # Parse document + document = driver.parse(test_data, level='page') + + # Verify cost estimation metadata + assert document.parsing_metadata is not None + assert 'parsing_mode' in document.parsing_metadata + assert document.parsing_metadata['parsing_mode'] == 'native_text' + assert 'cost_estimation' in document.parsing_metadata + # 2 pages * (1/1000) credits per page = 0.002 credits + assert document.parsing_metadata['cost_estimation'] == 0.002 + assert document.parsing_metadata['cost_estimation_unit'] == 'credits' + assert document.parsing_metadata['pages_processed'] == 2 + + # Verify whisper-specific 
metadata + assert 'whisper_hash' in document.parsing_metadata + assert document.parsing_metadata['whisper_hash'] == 'abc123def456' + + # Verify whisper details + assert 'whisper_details' in document.parsing_metadata + whisper_details = document.parsing_metadata['whisper_details'] + assert whisper_details['completed_at'] == 'Mon, 10 Feb 2025 10:40:58 GMT' + assert ( + whisper_details['processing_started_at'] == 'Mon, 10 Feb 2025 10:40:53 GMT' + ) + assert whisper_details['processing_time_in_seconds'] == 5.0 + assert whisper_details['total_pages'] == 2 + assert whisper_details['requested_pages'] == 2 + assert whisper_details['processed_pages'] == 2 + assert whisper_details['upload_file_size_in_kb'] == 618.488 + assert whisper_details['tag'] == 'test_tag' + + @patch('unstract.llmwhisperer.LLMWhispererClientV2') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_llmwhisperer_driver_metadata_extraction( + self, mock_tracer, mock_client_class + ): + """Test that whisper metadata is properly extracted from response""" + # Setup tracing mocks + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + # Setup client mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock whisper response with partial metadata (some fields missing) + mock_response = { + 'extraction': { + 'result_text': 'Test content\n<<<\x0c', + 'metadata': {'0': {'page_number': 0}}, + }, + 'whisper_hash': 'xyz789', + 'mode': 'high_quality', + 'processing_time_in_seconds': 3.5, + 'total_pages': 1, + # Other fields intentionally missing to test robustness + } + mock_client.whisper.return_value = mock_response + mock_client.get_usage_info.return_value = None + + # Create driver + config = LlmWhispererConfig(mode='form') + driver = LlmWhispererDriver(config) + + # Use bytes input + 
test_data = b'%PDF-1.4 test content' + + # Parse document + document = driver.parse(test_data, level='page') + + # Verify whisper hash is extracted + assert 'whisper_hash' in document.parsing_metadata + assert document.parsing_metadata['whisper_hash'] == 'xyz789' + + # Verify mode from response takes precedence over config + assert document.parsing_metadata['parsing_mode'] == 'high_quality' + + # Verify whisper details only contains fields that were present + assert 'whisper_details' in document.parsing_metadata + whisper_details = document.parsing_metadata['whisper_details'] + assert whisper_details['processing_time_in_seconds'] == 3.5 + assert whisper_details['total_pages'] == 1 + + # These fields should not be present since they weren't in the response + assert 'completed_at' not in whisper_details + assert 'processing_started_at' not in whisper_details + assert 'requested_pages' not in whisper_details + assert 'tag' not in whisper_details From aa3ce8625f990adb2c62a1b76119deeca84200c8 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 2 Jan 2026 12:26:43 +0100 Subject: [PATCH 14/16] Remove max_arg_length --- src/parxy_core/tracing/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parxy_core/tracing/client.py b/src/parxy_core/tracing/client.py index b72f63a..4016e92 100644 --- a/src/parxy_core/tracing/client.py +++ b/src/parxy_core/tracing/client.py @@ -297,10 +297,10 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: attributes: dict[str, Any] = {'function': func.__qualname__} - if capture_args: - attributes.update( - _serialize_args(args, kwargs, exclude, max_arg_length) - ) + if capture_args: + attributes.update( + _serialize_args(args, kwargs, exclude) + ) with self._tracer.start_as_current_span( span_name, attributes=attributes From 5a91995c84ff54a09b01d0c71068010b6a62497c Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Fri, 2 Jan 2026 12:30:10 +0100 Subject: [PATCH 15/16] update test to use default 
value for level --- tests/commands/test_parse.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/commands/test_parse.py b/tests/commands/test_parse.py index d64fc81..e434676 100644 --- a/tests/commands/test_parse.py +++ b/tests/commands/test_parse.py @@ -40,12 +40,12 @@ def test_parse_command_calls_facade_correctly(runner, mock_document, tmp_path): # Assert the command executed successfully assert result.exit_code == 0 - # Assert Parxy.parse was called with the correct arguments - mock_parxy.parse.assert_called_once_with( - file=str(test_file), - level='page', # default level - driver_name='pymupdf', # default driver - ) + # Assert Parxy.parse was called with the correct arguments + mock_parxy.parse.assert_called_once_with( + file=str(test_file), + level='block', # default level + driver_name='pymupdf', # default driver + ) def test_parse_command_with_custom_options(runner, mock_document, tmp_path): From 05f39ede03a8b529ed5888eae99731814d0aa0ff Mon Sep 17 00:00:00 2001 From: avvertix <5672748+avvertix@users.noreply.github.com> Date: Fri, 2 Jan 2026 11:30:28 +0000 Subject: [PATCH 16/16] Fix styling --- src/parxy_core/tracing/client.py | 6 ++---- tests/commands/test_parse.py | 12 ++++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/parxy_core/tracing/client.py b/src/parxy_core/tracing/client.py index 4016e92..7a09ff6 100644 --- a/src/parxy_core/tracing/client.py +++ b/src/parxy_core/tracing/client.py @@ -297,10 +297,8 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: attributes: dict[str, Any] = {'function': func.__qualname__} - if capture_args: - attributes.update( - _serialize_args(args, kwargs, exclude) - ) + if capture_args: + attributes.update(_serialize_args(args, kwargs, exclude)) with self._tracer.start_as_current_span( span_name, attributes=attributes diff --git a/tests/commands/test_parse.py b/tests/commands/test_parse.py index e434676..de8a57d 100644 --- a/tests/commands/test_parse.py 
+++ b/tests/commands/test_parse.py @@ -40,12 +40,12 @@ def test_parse_command_calls_facade_correctly(runner, mock_document, tmp_path): # Assert the command executed successfully assert result.exit_code == 0 - # Assert Parxy.parse was called with the correct arguments - mock_parxy.parse.assert_called_once_with( - file=str(test_file), - level='block', # default level - driver_name='pymupdf', # default driver - ) + # Assert Parxy.parse was called with the correct arguments + mock_parxy.parse.assert_called_once_with( + file=str(test_file), + level='block', # default level + driver_name='pymupdf', # default driver + ) def test_parse_command_with_custom_options(runner, mock_document, tmp_path):