diff --git a/pytest.ini b/pytest.ini
index 3904651..74e381a 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,4 +1,7 @@
 [pytest]
 filterwarnings =
     ignore:.*Swig.*
-    ignore:.*no current event loop.*
\ No newline at end of file
+    ignore:.*no current event loop.*
+
+
+norecursedirs = .git worktrees parxy.worktrees
diff --git a/src/parxy_cli/commands/parse.py b/src/parxy_cli/commands/parse.py
index db14c74..5396698 100644
--- a/src/parxy_cli/commands/parse.py
+++ b/src/parxy_cli/commands/parse.py
@@ -215,7 +215,7 @@ def parse(
             '-l',
             help='Extraction level',
         ),
-    ] = Level.PAGE,
+    ] = Level.BLOCK,
     mode: Annotated[
         OutputMode,
         typer.Option(
diff --git a/src/parxy_core/drivers/abstract_driver.py b/src/parxy_core/drivers/abstract_driver.py
index c3652a2..eebdfb2 100644
--- a/src/parxy_core/drivers/abstract_driver.py
+++ b/src/parxy_core/drivers/abstract_driver.py
@@ -1,6 +1,7 @@
 import base64
 import hashlib
 import io
+import time
 from abc import ABC, abstractmethod
 from logging import Logger
 from typing import Dict, Any, Self, Tuple, Optional
@@ -108,8 +109,20 @@ def parse(
         self._validate_level(level)
 
         try:
+            # Start timing
+            start_time = time.perf_counter()
+
             document = self._handle(file=file, level=level, **kwargs)
 
+            # Calculate elapsed time in milliseconds
+            end_time = time.perf_counter()
+            elapsed_ms = (end_time - start_time) * 1000
+
+            # Store elapsed time in parsing metadata
+            if document.parsing_metadata is None:
+                document.parsing_metadata = {}
+            document.parsing_metadata['driver_elapsed_time'] = elapsed_ms
+
             # Increment the documents processed counter
             tracer.count(
                 'documents.processed',
diff --git a/src/parxy_core/drivers/landingai.py b/src/parxy_core/drivers/landingai.py
index cb180ef..270cd24 100644
--- a/src/parxy_core/drivers/landingai.py
+++ b/src/parxy_core/drivers/landingai.py
@@ -55,7 +55,44 @@ def _handle(
                 service=self.__class__,
             ) from aex
 
-        return landingaiade_to_parxy(parse_response)
+        doc = landingaiade_to_parxy(parse_response)
+
+        # Initialize parsing_metadata if needed
+        if doc.parsing_metadata is None:
+            doc.parsing_metadata = {}
+
+        # Extract cost information from metadata.
+        # According to https://docs.landing.ai/ade/ade-json-response.md the
+        # metadata contains: credit_usage, duration_ms, filename, job_id, page_count, version
+        if parse_response.metadata:
+            metadata = parse_response.metadata
+
+            # Extract cost estimation from credit_usage
+            if hasattr(metadata, 'credit_usage') and metadata.credit_usage is not None:
+                doc.parsing_metadata['cost_estimation'] = metadata.credit_usage
+                doc.parsing_metadata['cost_estimation_unit'] = 'credits'
+
+            # Extract processing details
+            ade_details = {}
+            if hasattr(metadata, 'duration_ms') and metadata.duration_ms is not None:
+                ade_details['duration_ms'] = metadata.duration_ms
+            if hasattr(metadata, 'job_id') and metadata.job_id is not None:
+                ade_details['job_id'] = metadata.job_id
+            if hasattr(metadata, 'page_count') and metadata.page_count is not None:
+                ade_details['page_count'] = metadata.page_count
+            if hasattr(metadata, 'version') and metadata.version is not None:
+                ade_details['version'] = metadata.version
+            if hasattr(metadata, 'filename') and metadata.filename is not None:
+                ade_details['filename'] = metadata.filename
+
+            # Add failed_pages if present (for partial content responses)
+            if hasattr(metadata, 'failed_pages') and metadata.failed_pages is not None:
+                ade_details['failed_pages'] = metadata.failed_pages
+
+            if ade_details:
+                doc.parsing_metadata['ade_details'] = ade_details
+
+        return doc
 
 
 @trace_with_output('converting')
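The abstract-driver hunk above wraps every `_handle` call in a `time.perf_counter()` pair and stores the elapsed milliseconds under `parsing_metadata['driver_elapsed_time']`, while the LandingAI hunk adds `cost_estimation` (in credits) plus an `ade_details` dict. A minimal sketch of how a caller might read these fields after this change; the import paths and the fixture filename are assumptions based on the file layout, not part of the diff:

    from parxy_core.drivers.landingai import LandingAIADEDriver
    from parxy_core.models.config import LandingAIConfig

    # Hypothetical usage: parse a file and inspect the new parsing_metadata keys.
    driver = LandingAIADEDriver(LandingAIConfig())
    document = driver.parse('test-doc.pdf')

    meta = document.parsing_metadata or {}
    print(meta.get('driver_elapsed_time'))  # wall-clock time spent in _handle, in ms
    print(meta.get('cost_estimation'), meta.get('cost_estimation_unit'))  # e.g. 6.0 credits
    print(meta.get('ade_details', {}).get('page_count'))  # processing details from ADE metadata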
diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py
index 270a051..ae7c26e 100644
--- a/src/parxy_core/drivers/llamaparse.py
+++ b/src/parxy_core/drivers/llamaparse.py
@@ -1,5 +1,7 @@
 import io
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
+
+import requests
 
 from parxy_core.models.config import LlamaParseConfig
 from parxy_core.tracing.utils import trace_with_output
@@ -23,6 +25,17 @@
     FileNotFoundException,
 )
 
+_credits_per_parsing_mode = {
+    # Minimum credits per parsing mode, as deduced from https://developers.llamaindex.ai/python/cloud/general/pricing/
+    'accurate': 3,  # equivalent to "Parse page with LLM", as observed in their dashboard
+    'parse_page_without_llm': 1,
+    'parse_page_with_llm': 3,
+    'parse_page_with_lvm': 6,
+    'parse_page_with_agent': 10,
+    'parse_document_with_llm': 30,
+    'parse_document_with_agent': 30,
+}
+
 
 class LlamaParseDriver(Driver):
     """Llama Cloud Services document processing via LlamaParse API.
@@ -65,6 +78,96 @@ def _initialize_driver(self):
             **self._config.model_dump() if self._config else {},
         )
 
+    def _fetch_usage_metrics(self, job_id: str) -> Optional[dict]:
+        """Fetch actual usage metrics from the LlamaParse beta API.
+
+        Parameters
+        ----------
+        job_id : str
+            The job ID to fetch metrics for.
+
+        Returns
+        -------
+        Optional[dict]
+            Dictionary with 'total_cost', 'cost_unit', 'parsing_mode_counts', and 'mode_details'.
+            Returns None if organization_id is not configured or if the API call fails.
+        """
+        # Only fetch if organization_id is configured
+        if not self._config or not self._config.organization_id:
+            return None
+
+        try:
+            # Construct the beta API endpoint
+            base_url = self._config.base_url.rstrip('/')
+            endpoint = f'{base_url}/api/v1/beta/usage-metrics'
+
+            # Prepare request parameters
+            params = {
+                'organization_id': self._config.organization_id,
+                'event_aggregation_key': job_id,
+            }
+
+            # Prepare headers with authentication
+            headers = {
+                'Authorization': f'Bearer {self._config.api_key.get_secret_value()}',
+                'Content-Type': 'application/json',
+            }
+
+            # Make the API request
+            response = requests.get(
+                endpoint, params=params, headers=headers, timeout=10
+            )
+            response.raise_for_status()
+
+            data = response.json()
+            items = data.get('items', [])
+
+            if not items:
+                return None
+
+            # Aggregate usage data by parsing mode
+            parsing_mode_counts = {}
+            mode_details = []
+
+            for item in items:
+                if item.get('event_type') == 'pages_parsed':
+                    mode = item.get('properties', {}).get('mode', 'unknown')
+                    pages = item.get('value', 0)
+                    model = item.get('properties', {}).get('model', 'unknown')
+
+                    # Count pages per mode
+                    parsing_mode_counts[mode] = parsing_mode_counts.get(mode, 0) + pages
+
+                    # Store detailed info
+                    mode_details.append(
+                        {
+                            'mode': mode,
+                            'model': model,
+                            'pages': pages,
+                            'day': item.get('day'),
+                        }
+                    )
+
+            # Calculate total cost based on actual usage
+            total_cost = 0
+            for mode, count in parsing_mode_counts.items():
+                credits_per_page = _credits_per_parsing_mode.get(mode, 3)
+                total_cost += credits_per_page * count
+
+            return {
+                'total_cost': total_cost,
+                'cost_unit': 'credits',
+                'parsing_mode_counts': parsing_mode_counts,
+                'mode_details': mode_details,
+            }
+
+        except Exception as e:
+            # Log the error but don't fail the parsing
+            self._logger.warning(
+                f'Failed to fetch usage metrics from the beta API: {str(e)}'
+            )
+            return None
+
     def _handle(
         self,
         file: str | io.BytesIO | bytes,
@@ -136,7 +239,73 @@ def _handle(
                 res.error,
                 self.__class__,
                 res.model_dump(exclude={'file_name'})
             )
-        return llamaparse_to_parxy(doc=res, level=level)
+        converted_document = llamaparse_to_parxy(doc=res, level=level)
+
+        if converted_document.parsing_metadata is None:
+            converted_document.parsing_metadata = {}
+
+        converted_document.parsing_metadata['job_id'] = res.job_id
+        converted_document.parsing_metadata['job_metadata'] = (
+            res.job_metadata.model_dump_json()
+        )
+        converted_document.parsing_metadata['job_error'] = res.error
+        converted_document.parsing_metadata['job_error_code'] = res.error_code
+        converted_document.parsing_metadata['job_status'] = res.status
+
+        # Try to fetch actual usage metrics from the beta API if organization_id is configured
+        usage_metrics = self._fetch_usage_metrics(res.job_id)
+
+        if usage_metrics:
+            # Use actual metrics from the API
+            converted_document.parsing_metadata['cost_estimation'] = usage_metrics[
+                'total_cost'
+            ]
+            converted_document.parsing_metadata['cost_estimation_unit'] = usage_metrics[
+                'cost_unit'
+            ]
+            converted_document.parsing_metadata['parsing_mode_counts'] = usage_metrics[
+                'parsing_mode_counts'
+            ]
+            converted_document.parsing_metadata['cost_data_source'] = 'beta_api'
+            converted_document.parsing_metadata['usage_details'] = usage_metrics[
+                'mode_details'
+            ]
+        else:
+            # Fall back to estimation from page source_data
+            parsing_modes = {}
+            parsing_mode_counts = {}
+
+            for page in converted_document.pages:
+                if page.source_data and 'parsingMode' in page.source_data:
+                    mode = page.source_data['parsingMode']
+                    parsing_modes[page.number] = mode
+
+                    # Count pages per parsing mode
+                    if mode in parsing_mode_counts:
+                        parsing_mode_counts[mode] += 1
+                    else:
+                        parsing_mode_counts[mode] = 1
+
+            if parsing_modes:
+                converted_document.parsing_metadata['page_parsing_modes'] = (
+                    parsing_modes
+                )
+                converted_document.parsing_metadata['parsing_mode_counts'] = (
+                    parsing_mode_counts
+                )
+
+                # Calculate cost estimation based on parsing modes
+                total_cost = 0
+                for mode, count in parsing_mode_counts.items():
+                    # Use the credit cost from the dictionary, or default to 3 if not recognized
+                    credits_per_page = _credits_per_parsing_mode.get(mode, 3)
+                    total_cost += credits_per_page * count
+
+                converted_document.parsing_metadata['cost_estimation'] = total_cost
+                converted_document.parsing_metadata['cost_estimation_unit'] = 'credits'
+                converted_document.parsing_metadata['cost_data_source'] = 'estimation'
+
+        return converted_document
 
 
 @trace_with_output('converting')
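When the beta usage-metrics call is unavailable, the fallback above multiplies the per-mode page counts by the `_credits_per_parsing_mode` table, defaulting unknown modes to 3 credits per page. A standalone sketch of that arithmetic, with the table inlined from the diff; the example counts are made up:

    # Sketch of the fallback credit estimation (rates copied from _credits_per_parsing_mode).
    credits_per_parsing_mode = {
        'accurate': 3,
        'parse_page_without_llm': 1,
        'parse_page_with_llm': 3,
        'parse_page_with_lvm': 6,
        'parse_page_with_agent': 10,
        'parse_document_with_llm': 30,
        'parse_document_with_agent': 30,
    }

    def estimate_cost(parsing_mode_counts: dict[str, int]) -> int:
        # Unknown modes fall back to 3 credits per page, as in the driver.
        return sum(
            credits_per_parsing_mode.get(mode, 3) * count
            for mode, count in parsing_mode_counts.items()
        )

    # e.g. 4 pages parsed with an LLM and 1 with an agent: 4 * 3 + 1 * 10 = 22 credits
    assert estimate_cost({'parse_page_with_llm': 4, 'parse_page_with_agent': 1}) == 22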
diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py
index ee78519..adc96df 100644
--- a/src/parxy_core/drivers/llmwhisperer.py
+++ b/src/parxy_core/drivers/llmwhisperer.py
@@ -25,6 +25,17 @@
 from parxy_core.models import Document, Page
 
 
+_credits_per_parsing_mode_per_page = {
+    # https://unstract.com/pricing/
+    # https://docs.unstract.com/llmwhisperer/llm_whisperer/llm_whisperer_modes/
+    'native_text': 1 / 1000,
+    'low_cost': 5 / 1000,
+    'high_quality': 10 / 1000,
+    'form': 15 / 1000,
+    'table': 15 / 1000,  # assumed to be the same as form
+}
+
+
 class LlmWhispererDriver(Driver):
     """Unstract LLMWhisperer API driver implementation.
@@ -55,13 +66,36 @@ def _initialize_driver(self):
                 "Install with 'pip install parxy[llmwhisperer]'"
             ) from e
 
+        # Prepare config for client initialization, excluding mode (which is used per-request)
+        config_dict = self._config.model_dump() if self._config else {}
+        config_dict.pop('mode', None)  # Remove mode, as it is not a client init parameter
+
         self.__client = LLMWhispererClientV2(
             api_key=self._config.api_key.get_secret_value()
             if self._config and self._config.api_key
             else None,
-            **self._config.model_dump() if self._config else {},
+            **config_dict,
         )
 
+    def _fetch_usage_info(self) -> dict | None:
+        """Fetch usage information from the LLMWhisperer API.
+
+        Returns
+        -------
+        dict | None
+            Dictionary with usage information including quota, page counts, and subscription plan.
+            Returns None if the API call fails.
+        """
+        try:
+            usage_info = self.__client.get_usage_info()
+            return usage_info
+        except Exception as e:
+            # Log the error but don't fail the parsing
+            self._logger.warning(
+                f'Failed to fetch usage information from the LLMWhisperer API: {str(e)}'
+            )
+            return None
+
     def _handle(
         self,
         file: str | io.BytesIO | bytes,
@@ -79,7 +113,7 @@ def _handle(
         raw : bool, optional
             If True, return the raw response dict from LLMWhisperer instead of a `Document`. Default is False.
         **kwargs :
-            Additional arguments passed to the LLMWhisperer client (e.g., `wait_timeout`).
+            Additional arguments passed to the LLMWhisperer client (e.g., `wait_timeout`, `mode`).
 
         Returns
         -------
@@ -94,6 +128,11 @@ def _handle(
 
         self._validate_level(level)
 
+        # Determine the parsing mode: kwargs takes precedence over config
+        parsing_mode = kwargs.pop('mode', None) or (
+            getattr(self._config, 'mode', 'form') if self._config else 'form'
+        )
+
         try:
             filename, stream = self.handle_file_input(file)
             with self._trace_parse(filename, stream, **kwargs) as span:
@@ -102,6 +141,7 @@ def _handle(
                     stream=io.BytesIO(stream),
                     wait_for_completion=True,
                     wait_timeout=200,  # TODO: Handle configuration of args
+                    mode=parsing_mode,
                     # wait_timeout=kwargs.get("wait_timeout", 200),
                     # **kwargs,
                 )
@@ -124,6 +164,63 @@ def _handle(
 
             doc = llmwhisperer_to_parxy(res)
             doc.filename = filename
+
+            # Initialize parsing_metadata if needed
+            if doc.parsing_metadata is None:
+                doc.parsing_metadata = {}
+
+            # Extract whisper-specific metadata from the response
+            if 'whisper_hash' in res:
+                doc.parsing_metadata['whisper_hash'] = res['whisper_hash']
+
+            if 'mode' in res:
+                doc.parsing_metadata['parsing_mode'] = res['mode']
+            else:
+                doc.parsing_metadata['parsing_mode'] = parsing_mode
+
+            # Extract processing details
+            whisper_details = {}
+            if 'completed_at' in res:
+                whisper_details['completed_at'] = res['completed_at']
+            if 'processing_started_at' in res:
+                whisper_details['processing_started_at'] = res['processing_started_at']
+            if 'processing_time_in_seconds' in res:
+                whisper_details['processing_time_in_seconds'] = res[
+                    'processing_time_in_seconds'
+                ]
+            if 'total_pages' in res:
+                whisper_details['total_pages'] = res['total_pages']
+            if 'requested_pages' in res:
+                whisper_details['requested_pages'] = res['requested_pages']
+            if 'processed_pages' in res:
+                whisper_details['processed_pages'] = res['processed_pages']
+            if 'upload_file_size_in_kb' in res:
+                whisper_details['upload_file_size_in_kb'] = res['upload_file_size_in_kb']
+            if 'tag' in res:
+                whisper_details['tag'] = res['tag']
+
+            if whisper_details:
+                doc.parsing_metadata['whisper_details'] = whisper_details
+
+            # Calculate cost based on the number of pages and the parsing mode.
+            # Use the actual mode from the response if available, otherwise use the requested mode.
+            actual_mode = res.get('mode', parsing_mode)
+            num_pages = len(doc.pages)
+            credits_per_page = _credits_per_parsing_mode_per_page.get(
+                actual_mode, 10 / 1000
+            )
+            estimated_cost = credits_per_page * num_pages
+
+            doc.parsing_metadata['cost_estimation'] = estimated_cost
+            doc.parsing_metadata['cost_estimation_unit'] = 'credits'
+            doc.parsing_metadata['pages_processed'] = num_pages
+
+            # Fetch usage information from the API
+            usage_info = self._fetch_usage_info()
+
+            if usage_info:
+                doc.parsing_metadata['usage_info'] = usage_info
+
             return doc
diff --git a/src/parxy_core/models/config.py b/src/parxy_core/models/config.py
index 601c3cd..535025e 100644
--- a/src/parxy_core/models/config.py
+++ b/src/parxy_core/models/config.py
@@ -136,6 +136,9 @@ class LlamaParseConfig(BaseConfig):
     verbose: Optional[bool] = False
     """Whether to print the progress of the parsing."""
 
+    parse_mode: Optional[str] = 'parse_page_with_llm'
+    """Parsing mode used to process all documents."""
+
     # Parsing specific configurations (Alphabetical order)
 
     disable_ocr: Optional[bool] = False
@@ -164,6 +167,9 @@ class LlmWhispererConfig(BaseConfig):
     logging_level: Optional[str] = 'INFO'
     """The logging level for the client. Can be "DEBUG", "INFO", "WARNING" or "ERROR". Default "INFO"."""
 
+    mode: Optional[str] = 'form'
+    """Default parsing mode. Can be "high_quality", "form", "low_cost" or "native_text"."""
+
     model_config = SettingsConfigDict(
         env_prefix='parxy_llmwhisperer_', env_file='.env', extra='ignore'
     )
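For LLMWhisperer the estimate is per page and per mode, using the `_credits_per_parsing_mode_per_page` rates above (unknown modes fall back to 10/1000). A standalone sketch of the same arithmetic; the numbers mirror the cost-estimation test further down:

    # Sketch of the per-page estimate (rates copied from the diff; unknown
    # modes fall back to 10/1000 credits per page, as in the driver).
    credits_per_mode_per_page = {
        'native_text': 1 / 1000,
        'low_cost': 5 / 1000,
        'high_quality': 10 / 1000,
        'form': 15 / 1000,
        'table': 15 / 1000,
    }

    def estimate_cost(mode: str, num_pages: int) -> float:
        return credits_per_mode_per_page.get(mode, 10 / 1000) * num_pages

    assert estimate_cost('native_text', 2) == 0.002  # matches the driver test below
    assert estimate_cost('form', 100) == 1.5         # 100 form pages ~ 1.5 credits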
diff --git a/src/parxy_core/tracing/client.py b/src/parxy_core/tracing/client.py
index bcc3601..7a09ff6 100644
--- a/src/parxy_core/tracing/client.py
+++ b/src/parxy_core/tracing/client.py
@@ -79,7 +79,7 @@ def force_flush(self, timeout_millis: int = 30000):
         return self._wrapped_exporter.force_flush(timeout_millis)
 
 
-def _serialize_value(value: Any, max_length: int = 10000) -> str:
+def _serialize_value(value: Any) -> str:
     """Serialize a value for span attributes with size limits."""
@@ -89,18 +89,15 @@
         else:
            result = json.dumps(value, default=str)
 
-        if len(result) > max_length:
-            return result[:max_length] + '...[truncated]'
         return result
     except Exception:
-        return str(value)[:max_length]
+        return str(value)
 
 
 def _serialize_args(
     args: tuple,
     kwargs: dict,
     exclude: set[str] | None = None,
-    max_length: int = 1000,
 ) -> dict[str, str]:
     """Serialize function arguments for span attributes."""
     exclude = exclude or {'self', 'cls'}
@@ -109,11 +106,11 @@
     for i, arg in enumerate(args):
         key = f'arg.{i}'
         if key not in exclude:
-            attributes[key] = _serialize_value(arg, max_length)
+            attributes[key] = _serialize_value(arg)
 
     for key, value in kwargs.items():
         if key not in exclude:
-            attributes[f'arg.{key}'] = _serialize_value(value, max_length)
+            attributes[f'arg.{key}'] = _serialize_value(value)
 
     return attributes
@@ -259,7 +256,6 @@ def instrument(
     capture_return: bool = True,
     exclude_args: set[str] | None = None,
     max_arg_length: int = 1000,
-    max_return_length: int = 10000,
 ) -> Callable[[Callable[P, R]], Callable[P, R]]:
     """Decorator to automatically instrument a function with tracing.
 
@@ -278,8 +274,6 @@
         Argument names to exclude from capture. Always excludes 'self', 'cls'.
     max_arg_length : int, optional
         Max length for serialized arguments. Default 1000.
-    max_return_length : int, optional
-        Max length for serialized return value. Default 10000.
 
     Returns
     -------
@@ -304,9 +298,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
             attributes: dict[str, Any] = {'function': func.__qualname__}
 
             if capture_args:
-                attributes.update(
-                    _serialize_args(args, kwargs, exclude, max_arg_length)
-                )
+                attributes.update(_serialize_args(args, kwargs, exclude))
 
             with self._tracer.start_as_current_span(
                 span_name, attributes=attributes
@@ -315,9 +307,7 @@
                     result = func(*args, **kwargs)
 
                     if capture_return and result is not None:
-                        span.set_attribute(
-                            'return', _serialize_value(result, max_return_length)
-                        )
+                        span.set_attribute('return', _serialize_value(result))
 
                     return result
diff --git a/tests/commands/test_parse.py b/tests/commands/test_parse.py
index d64fc81..de8a57d 100644
--- a/tests/commands/test_parse.py
+++ b/tests/commands/test_parse.py
@@ -43,7 +43,7 @@ def test_parse_command_calls_facade_correctly(runner, mock_document, tmp_path):
     # Assert Parxy.parse was called with the correct arguments
     mock_parxy.parse.assert_called_once_with(
         file=str(test_file),
-        level='page',  # default level
+        level='block',  # default level
         driver_name='pymupdf',  # default driver
     )
diff --git a/tests/drivers/test_landingai.py b/tests/drivers/test_landingai.py
index 84f8d25..c371eb6 100644
--- a/tests/drivers/test_landingai.py
+++ b/tests/drivers/test_landingai.py
@@ -88,7 +88,10 @@ def test_landingai_driver_read_empty_document_page_level(self):
         assert document.metadata is None
         assert len(document.pages) == 1
         assert isinstance(document.pages[0], Page)
-        assert document.pages[0].text == '1'
+        stripped_text = (
+            re.sub(r'<[^>]+>', '', document.pages[0].text).replace('\n', '').strip()
+        )
+        assert stripped_text == '1'
 
     def test_landingai_driver_read_document(self):
         driver = LandingAIADEDriver(LandingAIConfig())
@@ -108,7 +111,7 @@ def test_landingai_driver_read_document(self):
         stripped_text = re.sub(r'<[^>]+>', '', document.pages[0].text).strip()
         assert (
             stripped_text
-            == 'This is the header\n\n\nThis is a test PDF to be used as input in unit\ntests\n\n\n## This is a heading 1\nThis is a paragraph below heading 1\n\n\n1'
+            == 'This is the header\n\n\nThis is a test PDF to be used as input in unit tests\n\n\n# This is a heading 1\nThis is a paragraph below heading 1\n\n\n1'
         )
@@ -171,3 +174,96 @@ def test_landingai_driver_tracing_exception_recorded(self, mock_tracer):
         count_call = mock_tracer.count.call_args
         assert count_call[0][0] == 'documents.failures'
         assert count_call[1]['driver'] == 'LandingAIADEDriver'
+
+    @patch('landingai_ade.LandingAIADE')
+    @patch('parxy_core.drivers.abstract_driver.tracer')
+    def test_landingai_driver_cost_estimation(self, mock_tracer, mock_client_class):
+        """Test that cost estimation is extracted from the parse response metadata."""
+        # Set up tracing mocks
+        mock_span = MagicMock()
+        mock_span.__enter__ = Mock(return_value=mock_span)
+        mock_span.__exit__ = Mock(return_value=False)
+        mock_tracer.span = Mock(return_value=mock_span)
+        mock_tracer.count = Mock()
+        mock_tracer.info = Mock()
+
+        # Set up client mock
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+
+        # Mock parse response with metadata including credit usage.
+        # Based on https://docs.landing.ai/ade/ade-json-response.md
+        mock_metadata = MagicMock()
+        mock_metadata.credit_usage = 6.0
+        mock_metadata.duration_ms = 24382
+        mock_metadata.filename = 'test-document.pdf'
+        mock_metadata.job_id = 'td8wu72tq2g9l9tfgkwn3q3kp'
+        mock_metadata.page_count = 2
+        mock_metadata.version = 'dpt-2-20251103'
+        mock_metadata.model_dump = Mock(
+            return_value={
+                'credit_usage': 6.0,
+                'duration_ms': 24382,
+                'filename': 'test-document.pdf',
+                'job_id': 'td8wu72tq2g9l9tfgkwn3q3kp',
+                'page_count': 2,
+                'version': 'dpt-2-20251103',
+            }
+        )
+
+        mock_chunk_1 = MagicMock()
+        mock_chunk_1.markdown = 'Page 1 content'
+        mock_chunk_1.type = 'text'
+        mock_chunk_1.grounding = MagicMock()
+        mock_chunk_1.grounding.page = 0
+        mock_chunk_1.grounding.box = MagicMock()
+        mock_chunk_1.grounding.box.left = 0.1
+        mock_chunk_1.grounding.box.top = 0.1
+        mock_chunk_1.grounding.box.right = 0.9
+        mock_chunk_1.grounding.box.bottom = 0.5
+        mock_chunk_1.model_dump = Mock(return_value={})
+
+        mock_chunk_2 = MagicMock()
+        mock_chunk_2.markdown = 'Page 2 content'
+        mock_chunk_2.type = 'text'
+        mock_chunk_2.grounding = MagicMock()
+        mock_chunk_2.grounding.page = 1
+        mock_chunk_2.grounding.box = MagicMock()
+        mock_chunk_2.grounding.box.left = 0.1
+        mock_chunk_2.grounding.box.top = 0.1
+        mock_chunk_2.grounding.box.right = 0.9
+        mock_chunk_2.grounding.box.bottom = 0.5
+        mock_chunk_2.model_dump = Mock(return_value={})
+
+        mock_response_metadata = MagicMock()
+        mock_response_metadata.filename = 'test-document.pdf'
+        mock_response_metadata.model_dump = Mock(return_value={})
+
+        mock_response = MagicMock()
+        mock_response.chunks = [mock_chunk_1, mock_chunk_2]
+        mock_response.metadata = mock_metadata
+        mock_response.model_dump_json = Mock(return_value='{}')
+
+        mock_client.parse.return_value = mock_response
+
+        # Create the driver
+        driver = LandingAIADEDriver(LandingAIConfig())
+
+        # Parse a document
+        path = self.__fixture_path('test-doc.pdf')
+        document = driver.parse(path)
+
+        # Verify cost estimation metadata
+        assert document.parsing_metadata is not None
+        assert 'cost_estimation' in document.parsing_metadata
+        assert document.parsing_metadata['cost_estimation'] == 6.0
+        assert document.parsing_metadata['cost_estimation_unit'] == 'credits'
+
+        # Verify ADE details
+        assert 'ade_details' in document.parsing_metadata
+        ade_details = document.parsing_metadata['ade_details']
+        assert ade_details['duration_ms'] == 24382
+        assert ade_details['filename'] == 'test-document.pdf'
+        assert ade_details['job_id'] == 'td8wu72tq2g9l9tfgkwn3q3kp'
+        assert ade_details['page_count'] == 2
+        assert ade_details['version'] == 'dpt-2-20251103'
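The reworked assertions above stop comparing raw ADE markup and instead strip tag-like spans with `re.sub(r'<[^>]+>', ...)` before comparing text. A tiny illustration of that normalization on a made-up string (the tag names are invented for the example):

    import re

    # Hypothetical ADE-style output; only the normalization is from the tests above.
    raw = '<header>This is the header</header>\n<page_number>1</page_number>'
    stripped = re.sub(r'<[^>]+>', '', raw).replace('\n', '').strip()
    assert stripped == 'This is the header1'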
diff --git a/tests/drivers/test_llamaparse.py b/tests/drivers/test_llamaparse.py
index a87cc8d..32c286b 100644
--- a/tests/drivers/test_llamaparse.py
+++ b/tests/drivers/test_llamaparse.py
@@ -161,3 +161,45 @@ def test_llamaparse_driver_tracing_exception_recorded(self, mock_tracer):
         count_call = mock_tracer.count.call_args
         assert count_call[0][0] == 'documents.failures'
         assert count_call[1]['driver'] == 'LlamaParseDriver'
+
+    def test_llamaparse_driver_extracts_parsing_modes(self):
+        driver = LlamaParseDriver(LlamaParseConfig())
+
+        path = self.__fixture_path('test-doc.pdf')
+        document = driver.parse(path, level='block')
+
+        # Verify parsing_metadata exists and contains parsing modes
+        assert document.parsing_metadata is not None
+
+        # page_parsing_modes is only present when the actual LlamaParse response
+        # includes parsingMode for the pages, so the following checks are conditional.
+        if 'page_parsing_modes' in document.parsing_metadata:
+            parsing_modes = document.parsing_metadata['page_parsing_modes']
+            assert isinstance(parsing_modes, dict)
+            # Verify it is a mapping of page numbers to parsing modes
+            for page_num, mode in parsing_modes.items():
+                assert isinstance(page_num, int)
+                assert isinstance(mode, str)
+
+            # Verify parsing_mode_counts exists
+            assert 'parsing_mode_counts' in document.parsing_metadata
+            parsing_mode_counts = document.parsing_metadata['parsing_mode_counts']
+            assert isinstance(parsing_mode_counts, dict)
+
+            # Verify the counts are correct
+            for mode, count in parsing_mode_counts.items():
+                assert isinstance(mode, str)
+                assert isinstance(count, int)
+                assert count > 0
+
+            # Verify the total count matches the number of pages with parsing modes
+            assert sum(parsing_mode_counts.values()) == len(parsing_modes)
+
+            # Verify cost_estimation exists
+            assert 'cost_estimation' in document.parsing_metadata
+            cost_estimation = document.parsing_metadata['cost_estimation']
+            assert isinstance(cost_estimation, int)
+            assert cost_estimation > 0
+
+            # Verify the cost estimation is reasonable (at least 1 credit per page)
+            assert cost_estimation >= len(parsing_modes)
diff --git a/tests/drivers/test_llmwhisperer.py b/tests/drivers/test_llmwhisperer.py
index a9b182e..fe4d6fe 100644
--- a/tests/drivers/test_llmwhisperer.py
+++ b/tests/drivers/test_llmwhisperer.py
@@ -145,3 +145,226 @@ def test_llmwhisperer_driver_tracing_exception_recorded(self, mock_tracer):
         count_call = mock_tracer.count.call_args
         assert count_call[0][0] == 'documents.failures'
         assert count_call[1]['driver'] == 'LlmWhispererDriver'
+
+    @patch('unstract.llmwhisperer.LLMWhispererClientV2')
+    @patch('parxy_core.drivers.abstract_driver.tracer')
+    def test_llmwhisperer_driver_mode_from_kwargs(self, mock_tracer, mock_client_class):
+        """Test that the mode parameter from kwargs is passed to the whisper method."""
+        # Set up tracing mocks
+        mock_span = MagicMock()
+        mock_span.__enter__ = Mock(return_value=mock_span)
+        mock_span.__exit__ = Mock(return_value=False)
+        mock_tracer.span = Mock(return_value=mock_span)
+        mock_tracer.count = Mock()
+        mock_tracer.info = Mock()
+
+        # Set up client mock
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+
+        # Mock whisper response
+        mock_response = {
+            'extraction': {
+                'result_text': 'Test content\n<<<\x0c',
+                'metadata': {'0': {'page_number': 0}},
+            }
+        }
+        mock_client.whisper.return_value = mock_response
+        mock_client.get_usage_info.return_value = None
+
+        # Create a driver with the default mode in the config
+        config = LlmWhispererConfig(mode='form')
+        driver = LlmWhispererDriver(config)
+
+        # Use bytes input instead of a file path to avoid file I/O
+        test_data = b'%PDF-1.4 test content'
+
+        # Parse with a mode override in kwargs
+        document = driver.parse(test_data, level='page', mode='high_quality')
+
+        # Verify whisper was called with the mode from kwargs
+        mock_client.whisper.assert_called_once()
+        call_kwargs = mock_client.whisper.call_args[1]
+        assert call_kwargs['mode'] == 'high_quality'
+
+    @patch('unstract.llmwhisperer.LLMWhispererClientV2')
+    @patch('parxy_core.drivers.abstract_driver.tracer')
+    def test_llmwhisperer_driver_mode_from_config(self, mock_tracer, mock_client_class):
+        """Test that the mode parameter from config is used when not in kwargs."""
+        # Set up tracing mocks
+        mock_span = MagicMock()
+        mock_span.__enter__ = Mock(return_value=mock_span)
+        mock_span.__exit__ = Mock(return_value=False)
+        mock_tracer.span = Mock(return_value=mock_span)
+        mock_tracer.count = Mock()
+        mock_tracer.info = Mock()
+
+        # Set up client mock
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+
+        # Mock whisper response
+        mock_response = {
+            'extraction': {
+                'result_text': 'Test content\n<<<\x0c',
+                'metadata': {'0': {'page_number': 0}},
+            }
+        }
+        mock_client.whisper.return_value = mock_response
+        mock_client.get_usage_info.return_value = None
+
+        # Create a driver with a specific mode in the config
+        config = LlmWhispererConfig(mode='low_cost')
+        driver = LlmWhispererDriver(config)
+
+        # Use bytes input instead of a file path to avoid file I/O
+        test_data = b'%PDF-1.4 test content'
+
+        # Parse without mode in kwargs
+        document = driver.parse(test_data, level='page')
+
+        # Verify whisper was called with the mode from config
+        mock_client.whisper.assert_called_once()
+        call_kwargs = mock_client.whisper.call_args[1]
+        assert call_kwargs['mode'] == 'low_cost'
+
+    @patch('unstract.llmwhisperer.LLMWhispererClientV2')
+    @patch('parxy_core.drivers.abstract_driver.tracer')
+    def test_llmwhisperer_driver_mode_cost_estimation(
+        self, mock_tracer, mock_client_class
+    ):
+        """Test that cost estimation uses the correct parsing mode."""
+        # Set up tracing mocks
+        mock_span = MagicMock()
+        mock_span.__enter__ = Mock(return_value=mock_span)
+        mock_span.__exit__ = Mock(return_value=False)
+        mock_tracer.span = Mock(return_value=mock_span)
+        mock_tracer.count = Mock()
+        mock_tracer.info = Mock()
+
+        # Set up client mock
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+
+        # Mock whisper response with 2 pages, including detail fields
+        mock_response = {
+            'extraction': {
+                'result_text': 'Page 1\n<<<\x0cPage 2\n<<<\x0c',
+                'metadata': {'0': {'page_number': 0}, '1': {'page_number': 1}},
+            },
+            'whisper_hash': 'abc123def456',
+            'mode': 'native_text',
+            'completed_at': 'Mon, 10 Feb 2025 10:40:58 GMT',
+            'processing_started_at': 'Mon, 10 Feb 2025 10:40:53 GMT',
+            'processing_time_in_seconds': 5.0,
+            'total_pages': 2,
+            'requested_pages': 2,
+            'processed_pages': 2,
+            'upload_file_size_in_kb': 618.488,
+            'tag': 'test_tag',
+        }
+        mock_client.whisper.return_value = mock_response
+
+        # Mock usage info
+        mock_usage_info = {'quota': 1000, 'used': 50, 'remaining': 950}
+        mock_client.get_usage_info.return_value = mock_usage_info
+
+        # Create a driver with native_text mode (1/1000 credits per page)
+        config = LlmWhispererConfig(mode='native_text')
+        driver = LlmWhispererDriver(config)
+
+        # Use bytes input instead of a file path to avoid file I/O
+        test_data = b'%PDF-1.4 test content'
+
+        # Parse the document
+        document = driver.parse(test_data, level='page')
+
+        # Verify cost estimation metadata
+        assert document.parsing_metadata is not None
+        assert 'parsing_mode' in document.parsing_metadata
+        assert document.parsing_metadata['parsing_mode'] == 'native_text'
+        assert 'cost_estimation' in document.parsing_metadata
+        # 2 pages * (1/1000) credits per page = 0.002 credits
+        assert document.parsing_metadata['cost_estimation'] == 0.002
+        assert document.parsing_metadata['cost_estimation_unit'] == 'credits'
+        assert document.parsing_metadata['pages_processed'] == 2
+
+        # Verify whisper-specific metadata
+        assert 'whisper_hash' in document.parsing_metadata
+        assert document.parsing_metadata['whisper_hash'] == 'abc123def456'
+
+        # Verify whisper details
+        assert 'whisper_details' in document.parsing_metadata
+        whisper_details = document.parsing_metadata['whisper_details']
+        assert whisper_details['completed_at'] == 'Mon, 10 Feb 2025 10:40:58 GMT'
+        assert (
+            whisper_details['processing_started_at'] == 'Mon, 10 Feb 2025 10:40:53 GMT'
+        )
+        assert whisper_details['processing_time_in_seconds'] == 5.0
+        assert whisper_details['total_pages'] == 2
+        assert whisper_details['requested_pages'] == 2
+        assert whisper_details['processed_pages'] == 2
+        assert whisper_details['upload_file_size_in_kb'] == 618.488
+        assert whisper_details['tag'] == 'test_tag'
+
+    @patch('unstract.llmwhisperer.LLMWhispererClientV2')
+    @patch('parxy_core.drivers.abstract_driver.tracer')
+    def test_llmwhisperer_driver_metadata_extraction(
+        self, mock_tracer, mock_client_class
+    ):
+        """Test that whisper metadata is properly extracted from the response."""
+        # Set up tracing mocks
+        mock_span = MagicMock()
+        mock_span.__enter__ = Mock(return_value=mock_span)
+        mock_span.__exit__ = Mock(return_value=False)
+        mock_tracer.span = Mock(return_value=mock_span)
+        mock_tracer.count = Mock()
+        mock_tracer.info = Mock()
+
+        # Set up client mock
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+
+        # Mock whisper response with partial metadata (some fields missing)
+        mock_response = {
+            'extraction': {
+                'result_text': 'Test content\n<<<\x0c',
+                'metadata': {'0': {'page_number': 0}},
+            },
+            'whisper_hash': 'xyz789',
+            'mode': 'high_quality',
+            'processing_time_in_seconds': 3.5,
+            'total_pages': 1,
+            # Other fields intentionally missing to test robustness
+        }
+        mock_client.whisper.return_value = mock_response
+        mock_client.get_usage_info.return_value = None
+
+        # Create the driver
+        config = LlmWhispererConfig(mode='form')
+        driver = LlmWhispererDriver(config)
+
+        # Use bytes input
+        test_data = b'%PDF-1.4 test content'
+
+        # Parse the document
+        document = driver.parse(test_data, level='page')
+
+        # Verify the whisper hash is extracted
+        assert 'whisper_hash' in document.parsing_metadata
+        assert document.parsing_metadata['whisper_hash'] == 'xyz789'
+
+        # Verify the mode from the response takes precedence over the config
+        assert document.parsing_metadata['parsing_mode'] == 'high_quality'
+
+        # Verify whisper details only contain fields that were present
+        assert 'whisper_details' in document.parsing_metadata
+        whisper_details = document.parsing_metadata['whisper_details']
+        assert whisper_details['processing_time_in_seconds'] == 3.5
+        assert whisper_details['total_pages'] == 1
+
+        # These fields should not be present, since they were not in the response
+        assert 'completed_at' not in whisper_details
+        assert 'processing_started_at' not in whisper_details
+        assert 'requested_pages' not in whisper_details
+        assert 'tag' not in whisper_details
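The `driver_elapsed_time` value asserted in the PyMuPDF test below comes from the `time.perf_counter()` pair added to `Driver.parse` earlier in this diff. A minimal standalone sketch of that timing pattern; the `slow_handle` stand-in is hypothetical:

    import time

    def slow_handle() -> str:
        # Stand-in for self._handle(...); any work to be timed.
        return 'parsed'

    start_time = time.perf_counter()
    result = slow_handle()
    elapsed_ms = (time.perf_counter() - start_time) * 1000  # monotonic clock, in ms

    parsing_metadata = {'driver_elapsed_time': elapsed_ms}
    assert isinstance(parsing_metadata['driver_elapsed_time'], float)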
diff --git a/tests/drivers/test_pymupdf.py b/tests/drivers/test_pymupdf.py
index c5c652e..e62986e 100644
--- a/tests/drivers/test_pymupdf.py
+++ b/tests/drivers/test_pymupdf.py
@@ -171,3 +171,15 @@ def test_pymupdf_driver_tracing_exception_recorded(self, mock_tracer):
 
         # Verify counter was NOT incremented due to exception
         mock_tracer.count.assert_called_once()
+
+    def test_pymupdf_driver_records_elapsed_time(self):
+        driver = PyMuPdfDriver()
+
+        path = self.__fixture_path('test-doc.pdf')
+        document = driver.parse(path, level='page')
+
+        # Verify elapsed time is recorded in parsing_metadata
+        assert document.parsing_metadata is not None
+        assert 'driver_elapsed_time' in document.parsing_metadata
+        assert isinstance(document.parsing_metadata['driver_elapsed_time'], float)
+        assert document.parsing_metadata['driver_elapsed_time'] > 0