diff --git a/src/parxy_core/drivers/landingai.py b/src/parxy_core/drivers/landingai.py index 00eb697..9c689af 100644 --- a/src/parxy_core/drivers/landingai.py +++ b/src/parxy_core/drivers/landingai.py @@ -110,6 +110,19 @@ def landingaiade_to_parxy(parsed_data: ParseResponse) -> Document: page_chunks[page_num] = [] page_chunks[page_num].append(chunk) + # Determine total page count from metadata + total_pages = ( + parsed_data.metadata.page_count + if parsed_data.metadata and parsed_data.metadata.page_count + else 0 + ) + + # Insert empty pages for any gaps in page_chunks + existing_pages = set(page_chunks.keys()) + for page_num in range(total_pages): + if page_num not in existing_pages: + page_chunks[page_num] = [] + # Convert to pages pages = [] for page_num in sorted(page_chunks.keys()): diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py index 6931939..426f53b 100644 --- a/src/parxy_core/drivers/llamaparse.py +++ b/src/parxy_core/drivers/llamaparse.py @@ -449,11 +449,15 @@ def _convert_text_block(text_block: PageItem, page_number: int) -> TextBlock: x1=text_block.bBox.x + text_block.bBox.w, y1=text_block.bBox.y + text_block.bBox.h, ) + # Handle empty page marker + text_value = text_block.value if text_block.value else '' + if text_value == 'NO_CONTENT_HERE': + text_value = '' return TextBlock( type='text', category=text_block.type, level=text_block.lvl, - text=text_block.value if text_block.value else '', + text=text_value, bbox=bbox, page=page_number, source_data=text_block.model_dump(exclude={'bBox', 'value', 'type', 'lvl'}), @@ -485,7 +489,7 @@ def _convert_page( number=page.page - 1, width=page.width, height=page.height, - text=page.text, + text=page.text if page.text != 'NO_CONTENT_HERE' else '', blocks=text_blocks, source_data=page.model_dump( exclude={'page', 'text', 'items', 'width', 'height'}