Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/parxy_core/drivers/landingai.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -110,6 +110,19 @@ def landingaiade_to_parxy(parsed_data: ParseResponse) -> Document:
page_chunks[page_num] = []
page_chunks[page_num].append(chunk)

# Determine total page count from metadata
total_pages = (
parsed_data.metadata.page_count
if parsed_data.metadata and parsed_data.metadata.page_count
else 0
)

# Insert empty pages for any gaps in page_chunks
existing_pages = set(page_chunks.keys())
for page_num in range(total_pages):
if page_num not in existing_pages:
page_chunks[page_num] = []

# Convert to pages
pages = []
for page_num in sorted(page_chunks.keys()):
Expand Down
8 changes: 6 additions & 2 deletions src/parxy_core/drivers/llamaparse.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -449,11 +449,15 @@ def _convert_text_block(text_block: PageItem, page_number: int) -> TextBlock:
x1=text_block.bBox.x + text_block.bBox.w,
y1=text_block.bBox.y + text_block.bBox.h,
)
# Handle empty page marker
text_value = text_block.value if text_block.value else ''
if text_value == 'NO_CONTENT_HERE':
text_value = ''
return TextBlock(
type='text',
category=text_block.type,
level=text_block.lvl,
text=text_block.value if text_block.value else '',
text=text_value,
bbox=bbox,
page=page_number,
source_data=text_block.model_dump(exclude={'bBox', 'value', 'type', 'lvl'}),
Expand Down Expand Up @@ -485,7 +489,7 @@ def _convert_page(
number=page.page - 1,
width=page.width,
height=page.height,
text=page.text,
text=page.text if page.text != 'NO_CONTENT_HERE' else '',
blocks=text_blocks,
source_data=page.model_dump(
exclude={'page', 'text', 'items', 'width', 'height'}
Expand Down