Skip to content

Commit

Permalink
Add markdown renderer, swap how ids are named
Browse files: browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 15, 2024
1 parent 76ea3e5 commit 2d6256c
Show file tree
Hide file tree
Showing 21 changed files with 141 additions and 120 deletions.
4 changes: 2 additions & 2 deletions marker/v2/builders/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def group_caption_blocks(self, page: PageGroup):
group_block.structure = block_structure

# Update the structure of the page to reflect the new block
page.update_structure_item(block_id, group_block._id)
page.update_structure_item(block_id, group_block.id)
page.remove_structure_items(block_structure)

def group_lists(self, page: PageGroup):
Expand Down Expand Up @@ -86,5 +86,5 @@ def group_lists(self, page: PageGroup):
group_block.structure = block_structure

# Update the structure of the page to reflect the new block
page.update_structure_item(block_id, group_block._id)
page.update_structure_item(block_id, group_block.id)
page.remove_structure_items(block_structure)
9 changes: 4 additions & 5 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
from marker.v2.providers.pdf import PdfProvider
from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
setup_detection_model
from marker.v2.renderers.line import LineRenderer
from marker.v2.renderers.span import SpanRenderer
from marker.v2.renderers.markdown import MarkdownRenderer


class PdfConverter(BaseConverter):
Expand All @@ -41,9 +40,9 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
#table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
#table_processor(document)

renderer_lst = [SpanRenderer(), LineRenderer()]
rendered = document.render(renderer_lst)
return rendered
renderer = MarkdownRenderer()
document_output = document.render()
return renderer(document_output)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion marker/v2/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __call__(self, document: Document):

equation_data.append({
"image": image,
"block_id": block._id,
"block_id": block.id,
"token_count": token_count
})

Expand Down
2 changes: 1 addition & 1 deletion marker/v2/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __call__(self, document: Document):
)

table_data.append({
"block_id": block._id,
"block_id": block.id,
"table_image": image,
"table_bbox": image_poly.bbox,
"text_lines": text_lines,
Expand Down
3 changes: 2 additions & 1 deletion marker/v2/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

from marker.v2.providers import BaseProvider
from marker.v2.schema.polygon import PolygonBox
from marker.v2.schema.text.line import Line, Span
from marker.v2.schema.text.line import Line
from marker.v2.schema.text.span import Span

PdfPageProviderLine = Tuple[List[Line], List[List[Span]]]
PdfPageProviderLines = Dict[int, PdfPageProviderLine]
Expand Down
2 changes: 1 addition & 1 deletion marker/v2/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ def __init__(self, config: Optional[BaseModel | dict] = None):
for k in config.model_fields:
setattr(self, k, config[k])

def __call__(self, document, block, children=None):
def __call__(self, document_output):
# Children are in reading order
raise NotImplementedError
19 changes: 0 additions & 19 deletions marker/v2/renderers/default.py

This file was deleted.

41 changes: 0 additions & 41 deletions marker/v2/renderers/line.py

This file was deleted.

26 changes: 26 additions & 0 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from bs4 import BeautifulSoup
from markdownify import markdownify
from marker.v2.renderers import BaseRenderer


class MarkdownRenderer(BaseRenderer):
    """Render a document output tree to Markdown via an intermediate HTML string."""

    def extract_html(self, document_output):
        """Return the HTML of ``document_output`` with all child references inlined.

        The ``html`` of an output contains ``<content-ref src='...'>``
        placeholders, one per child. Each placeholder is replaced by the
        recursively expanded HTML of the child whose ``id`` equals the
        placeholder's ``src`` attribute.

        Args:
            document_output: Object exposing ``html`` and ``children``;
                each child exposes ``id``, ``html`` and ``children``.

        Returns:
            The fully expanded HTML as a string.

        Raises:
            ValueError: If a placeholder refers to an id that is not among
                the children of ``document_output``.
        """
        soup = BeautifulSoup(document_output.html, 'html.parser')
        # ``children`` may be None on outputs without children.
        children = document_output.children or []

        for ref in soup.find_all('content-ref'):
            src = ref.get('src')
            # Resolve each reference on its own so that a miss can never
            # reuse the HTML resolved for a previous reference.
            child = next((item for item in children if item.id == src), None)
            if child is None:
                raise ValueError(
                    f"No child output found for content-ref src={src!r}"
                )

            child_html = self.extract_html(child)
            ref.replace_with(BeautifulSoup(child_html, 'html.parser'))

        return str(soup)

    def __call__(self, document_output):
        """Expand ``document_output`` to full HTML and convert it to Markdown."""
        full_html = self.extract_html(document_output)
        return markdownify(full_html)


9 changes: 0 additions & 9 deletions marker/v2/renderers/span.py

This file was deleted.

8 changes: 0 additions & 8 deletions marker/v2/renderers/util.py

This file was deleted.

2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from marker.v2.schema.blocks.base import Block, BlockId
from marker.v2.schema.blocks.base import Block, BlockId, BlockOutput
from marker.v2.schema.blocks.caption import Caption
from marker.v2.schema.blocks.code import Code
from marker.v2.schema.blocks.figure import Figure
Expand Down
54 changes: 40 additions & 14 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@

from typing import Optional, List, Any

from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, field_validator

from marker.v2.renderers.util import renderer_for_block
from marker.v2.schema.polygon import PolygonBox


class BlockOutput(BaseModel):
    """Rendered form of a single block, as returned by ``Block.render``."""

    # HTML template for this block; children appear as <content-ref> placeholders.
    html: str
    # Polygon of the block that produced this output.
    polygon: PolygonBox
    # Identifier of the source block; matched against content-ref ``src`` values.
    id: BlockId
    # Rendered outputs of the block's children, or None when not provided.
    children: Optional[List[BlockOutput]] = None


class BlockId(BaseModel):
page_id: int
block_id: int | None = None
Expand All @@ -22,9 +28,21 @@ def __repr__(self):
return str(self)

def __eq__(self, other):
if not isinstance(other, BlockId):
if not isinstance(other, (BlockId, str)):
return NotImplemented
return self.page_id == other.page_id and self.block_id == other.block_id and self.block_type == other.block_type

if isinstance(other, str):
return str(self) == other
else:
return self.page_id == other.page_id and self.block_id == other.block_id and self.block_type == other.block_type

@field_validator("block_type")
@classmethod
def validate_block_type(cls, v):
    """Accept ``v`` only if ``BlockTypes`` has an attribute of that name.

    Raises:
        ValueError: If ``v`` is not an attribute of ``BlockTypes``.
    """
    # Imported inside the function, as in the original
    # (presumably to avoid a circular import -- confirm).
    from marker.v2.schema import BlockTypes

    if hasattr(BlockTypes, v):
        return v
    raise ValueError(f"Invalid block type: {v}")


class Block(BaseModel):
Expand All @@ -38,7 +56,7 @@ class Block(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

@property
def _id(self) -> BlockId:
def id(self) -> BlockId:
return BlockId(
page_id=self.page_id,
block_id=self.block_id,
Expand All @@ -49,9 +67,9 @@ def add_structure(self, block: Block):
self.polygon = self.polygon.merge([block.polygon])

if self.structure is None:
self.structure = [block._id]
self.structure = [block.id]
else:
self.structure.append(block._id)
self.structure.append(block.id)

def update_structure_item(self, old_id: BlockId, new_id: BlockId):
if self.structure is not None:
Expand Down Expand Up @@ -82,14 +100,22 @@ def raw_text(self, document) -> str:
text += "\n"
return text

def render(self, document, renderer_list: list):
child_blocks = []
def assemble_html(self, child_blocks):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
return template

def render(self, document):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
if not block.rendered:
block.render(document, renderer_list)
child_blocks.append(block)
child_content.append(block.render(document))

renderer = renderer_for_block(self, renderer_list)
self.rendered = renderer(document, self, child_blocks)
return BlockOutput(
html=self.assemble_html(child_content),
polygon=self.polygon,
id=self.id,
children=child_content
)
4 changes: 4 additions & 0 deletions marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@

class ListItem(Block):
    """Block whose assembled HTML is wrapped in an ``<li>`` element."""

    block_type: str = "ListItem"

    def assemble_html(self, child_blocks):
        """Return the base template for ``child_blocks`` wrapped in ``<li>``."""
        inner = super().assemble_html(child_blocks)
        return f"<li>{inner}</li>"
4 changes: 4 additions & 0 deletions marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@

class Text(Block):
    """Block whose assembled HTML is wrapped in a ``<p>`` element."""

    block_type: str = "Text"

    def assemble_html(self, child_blocks):
        """Return the base template for ``child_blocks`` wrapped in ``<p>``."""
        inner = super().assemble_html(child_blocks)
        return f"<p>{inner}</p>"
27 changes: 19 additions & 8 deletions marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@

from pydantic import BaseModel

from marker.v2.schema.blocks import BlockId
from marker.v2.schema.blocks import BlockId, BlockOutput
from marker.v2.schema.groups.page import PageGroup
from marker.v2.renderers.util import renderer_for_block


class DocumentOutput(BaseModel):
    """Rendered form of a whole document, as returned by ``Document.render``."""

    # One rendered output per page, in the order the pages were rendered.
    children: List[BlockOutput]
    # Document-level HTML template made of <content-ref> placeholders.
    html: str
    # Type tag of this output; defaults to "Document".
    block_type: str = "Document"


class Document(BaseModel):
Expand All @@ -21,12 +26,18 @@ def get_block(self, block_id: BlockId):
return block
return None

def render(self, renderer_lst: list | None = None):
if renderer_lst is None:
renderer_lst = []
def assemble_html(self, child_blocks):
    """Build the document template: one ``<content-ref>`` per child.

    Each placeholder's ``src`` attribute is the string form of the child's
    ``id``; placeholders are emitted in the order of ``child_blocks``.
    """
    return "".join(
        f"<content-ref src='{child.id}'></content-ref>"
        for child in child_blocks
    )

def render(self):
child_content = []
for page in self.pages:
page.render(self, renderer_lst)
child_content.append(page.render(self))

doc_renderer = renderer_for_block(self, renderer_lst)
return doc_renderer(self, self, self.pages)
return DocumentOutput(
children=child_content,
html=self.assemble_html(child_content)
)
3 changes: 2 additions & 1 deletion marker/v2/schema/groups/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from marker.v2.schema.groups.table import TableGroup
from marker.v2.schema.groups.list import ListGroup
from marker.v2.schema.groups.picture import PictureGroup
from marker.v2.schema.groups.page import PageGroup

GROUP_BLOCK_REGISTRY = {
v.model_fields['block_type'].default: v for k, v in locals().items()
if isinstance(v, type)
and issubclass(v, Block)
and v != Block # Exclude the base Block class
and v.model_fields['block_type'].default.endswith("Group")
and (v.model_fields['block_type'].default.endswith("Group") or v.model_fields['block_type'].default == "Page")
}

6 changes: 5 additions & 1 deletion marker/v2/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@


class ListGroup(Block):
    """Group block whose assembled HTML is wrapped in a ``<ul>`` element."""

    block_type: str = "ListGroup"

    def assemble_html(self, child_blocks):
        """Return the base template for ``child_blocks`` wrapped in ``<ul>``."""
        inner = super().assemble_html(child_blocks)
        return f"<ul>{inner}</ul>"
2 changes: 1 addition & 1 deletion marker/v2/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ def add_full_block(self, block: Block) -> Block:

def get_block(self, block_id: BlockId) -> Block | None:
for block in self.children:
if block._id == block_id:
if block.id == block_id:
return block
Loading

0 comments on commit 2d6256c

Please sign in to comment.