From 2d6256cba46316a0298af7f25cb0c1d7ed1172b0 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 15 Nov 2024 13:05:28 -0500 Subject: [PATCH] Add markdown renderer, swap how ids are named --- marker/v2/builders/structure.py | 4 +-- marker/v2/converters/pdf.py | 9 +++-- marker/v2/processors/equation.py | 2 +- marker/v2/processors/table.py | 2 +- marker/v2/providers/pdf.py | 3 +- marker/v2/renderers/__init__.py | 2 +- marker/v2/renderers/default.py | 19 ---------- marker/v2/renderers/line.py | 41 ---------------------- marker/v2/renderers/markdown.py | 26 ++++++++++++++ marker/v2/renderers/span.py | 9 ----- marker/v2/renderers/util.py | 8 ----- marker/v2/schema/blocks/__init__.py | 2 +- marker/v2/schema/blocks/base.py | 54 +++++++++++++++++++++-------- marker/v2/schema/blocks/listitem.py | 4 +++ marker/v2/schema/blocks/text.py | 4 +++ marker/v2/schema/document.py | 27 ++++++++++----- marker/v2/schema/groups/__init__.py | 3 +- marker/v2/schema/groups/list.py | 6 +++- marker/v2/schema/groups/page.py | 2 +- marker/v2/schema/text/line.py | 26 ++++++++++---- marker/v2/schema/text/span.py | 8 +++++ 21 files changed, 141 insertions(+), 120 deletions(-) delete mode 100644 marker/v2/renderers/default.py delete mode 100644 marker/v2/renderers/line.py create mode 100644 marker/v2/renderers/markdown.py delete mode 100644 marker/v2/renderers/span.py delete mode 100644 marker/v2/renderers/util.py diff --git a/marker/v2/builders/structure.py b/marker/v2/builders/structure.py index f73d7be..ab59b82 100644 --- a/marker/v2/builders/structure.py +++ b/marker/v2/builders/structure.py @@ -58,7 +58,7 @@ def group_caption_blocks(self, page: PageGroup): group_block.structure = block_structure # Update the structure of the page to reflect the new block - page.update_structure_item(block_id, group_block._id) + page.update_structure_item(block_id, group_block.id) page.remove_structure_items(block_structure) def group_lists(self, page: PageGroup): @@ -86,5 +86,5 @@ def group_lists(self, page: PageGroup): group_block.structure = block_structure # Update the structure of the page to reflect the new block - page.update_structure_item(block_id, group_block._id) + page.update_structure_item(block_id, group_block.id) page.remove_structure_items(block_structure) diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 27ea2a4..c9160f7 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -13,8 +13,7 @@ from marker.v2.providers.pdf import PdfProvider from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \ setup_detection_model -from marker.v2.renderers.line import LineRenderer -from marker.v2.renderers.span import SpanRenderer +from marker.v2.renderers.markdown import MarkdownRenderer class PdfConverter(BaseConverter): @@ -41,9 +40,9 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): #table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model) #table_processor(document) - renderer_lst = [SpanRenderer(), LineRenderer()] - rendered = document.render(renderer_lst) - return rendered + renderer = MarkdownRenderer() + document_output = document.render() + return renderer(document_output) if __name__ == "__main__": diff --git a/marker/v2/processors/equation.py b/marker/v2/processors/equation.py index 17e039f..e3988dd 100644 --- a/marker/v2/processors/equation.py +++ b/marker/v2/processors/equation.py @@ -35,7 +35,7 @@ def __call__(self, document: Document): equation_data.append({ "image": image, - "block_id": block._id, + "block_id": block.id, "token_count": token_count }) diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py index fa62df0..cdffbf2 100644 --- a/marker/v2/processors/table.py +++ b/marker/v2/processors/table.py @@ -47,7 +47,7 @@ def __call__(self, document: Document): ) table_data.append({ - "block_id": block._id, + "block_id": block.id, "table_image": image, "table_bbox": image_poly.bbox, "text_lines": text_lines, diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py index 782d6ea..47004cb 100644 --- a/marker/v2/providers/pdf.py +++ b/marker/v2/providers/pdf.py @@ -9,7 +9,8 @@ from marker.v2.providers import BaseProvider from marker.v2.schema.polygon import PolygonBox -from marker.v2.schema.text.line import Line, Span +from marker.v2.schema.text.line import Line +from marker.v2.schema.text.span import Span PdfPageProviderLine = Tuple[List[Line], List[List[Span]]] PdfPageProviderLines = Dict[int, PdfPageProviderLine] diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index 00616c9..089ae8f 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -17,6 +17,6 @@ def __init__(self, config: Optional[BaseModel | dict] = None): for k in config.model_fields: setattr(self, k, config[k]) - def __call__(self, document, block, children=None): + def __call__(self, document_output): # Children are in reading order raise NotImplementedError \ No newline at end of file diff --git a/marker/v2/renderers/default.py b/marker/v2/renderers/default.py deleted file mode 100644 index 56c64d1..0000000 --- a/marker/v2/renderers/default.py +++ /dev/null @@ -1,19 +0,0 @@ -from marker.schema.block import Line -from marker.v2.renderers import BaseRenderer -from marker.v2.schema.text.span import Span - - -class DefaultRenderer(BaseRenderer): - def __call__(self, document, block, children=None): - text = "" - if children is not None and len(children) > 0: - for child in children: - text += child.rendered - if isinstance(block, Span): - text = block.text - elif isinstance(block, Line): - text = text.rstrip() + "\n" - else: - text += "\n" - - return text diff --git a/marker/v2/renderers/line.py b/marker/v2/renderers/line.py deleted file mode 100644 index fb22227..0000000 --- a/marker/v2/renderers/line.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from typing import List, Optional - -from marker.v2.renderers import BaseRenderer -from marker.v2.schema import BlockTypes -from marker.v2.schema.text import Span - - -def surround_text(s, char_to_insert): - leading_whitespace = re.match(r'^(\s*)', s).group(1) - trailing_whitespace = re.search(r'(\s*)$', s).group(1) - stripped_string = s.strip() - modified_string = char_to_insert + stripped_string + char_to_insert - final_string = leading_whitespace + modified_string + trailing_whitespace - return final_string - - -class LineRenderer(BaseRenderer): - block_type = BlockTypes.Line - - def __call__(self, document, block, children: Optional[List[Span]] = None): - text = "" - for i, child in enumerate(children): - next_span = None - next_idx = i + 1 - while len(children) > next_idx: - next_span = children[next_idx] - next_idx += 1 - if len(next_span.text.strip()) > 0: - break - span_text = child.rendered - - # Don't bold or italicize very short sequences - # Avoid bolding first and last sequence so lines can be joined properly - if len(span_text) > 3 and 0 < i < len(children) - 1: - if child.italic and (not next_span or not next_span.italic): - span_text = surround_text(span_text, "*") - elif child.bold and (not next_span or not next_span.bold): - span_text = surround_text(span_text, "**") - text += span_text - return text \ No newline at end of file diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py new file mode 100644 index 0000000..950e863 --- /dev/null +++ b/marker/v2/renderers/markdown.py @@ -0,0 +1,26 @@ +from bs4 import BeautifulSoup +from markdownify import markdownify +from marker.v2.renderers import BaseRenderer + + +class MarkdownRenderer(BaseRenderer): + def extract_html(self, document_output): + soup = BeautifulSoup(document_output.html, 'html.parser') + + content_refs = soup.find_all('content-ref') + for ref in content_refs: + src = ref.get('src') + for item in document_output.children: + if item.id == src: + content = self.extract_html(item) + break + + ref.replace_with(BeautifulSoup(content, 'html.parser')) + + return str(soup) + + def __call__(self, document_output): + full_html = self.extract_html(document_output) + return markdownify(full_html) + + diff --git a/marker/v2/renderers/span.py b/marker/v2/renderers/span.py deleted file mode 100644 index ada64ff..0000000 --- a/marker/v2/renderers/span.py +++ /dev/null @@ -1,9 +0,0 @@ -from marker.v2.renderers import BaseRenderer -from marker.v2.schema import BlockTypes - - -class SpanRenderer(BaseRenderer): - block_type = BlockTypes.Span - - def __call__(self, document, block, children=None): - return block.text \ No newline at end of file diff --git a/marker/v2/renderers/util.py b/marker/v2/renderers/util.py deleted file mode 100644 index e9a301a..0000000 --- a/marker/v2/renderers/util.py +++ /dev/null @@ -1,8 +0,0 @@ -def renderer_for_block(block, renderer_list: list): - from marker.v2.renderers.default import DefaultRenderer - - for renderer in renderer_list: - if renderer.block_type == block.block_type: - return renderer - - return DefaultRenderer() diff --git a/marker/v2/schema/blocks/__init__.py b/marker/v2/schema/blocks/__init__.py index c6ee258..20d753d 100644 --- a/marker/v2/schema/blocks/__init__.py +++ b/marker/v2/schema/blocks/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from marker.v2.schema.blocks.base import Block, BlockId +from marker.v2.schema.blocks.base import Block, BlockId, BlockOutput from marker.v2.schema.blocks.caption import Caption from marker.v2.schema.blocks.code import Code from marker.v2.schema.blocks.figure import Figure diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index 5331a7a..4218300 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -2,12 +2,18 @@ from typing import Optional, List, Any -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, field_validator -from marker.v2.renderers.util import renderer_for_block from marker.v2.schema.polygon import PolygonBox +class BlockOutput(BaseModel): + html: str + polygon: PolygonBox + id: BlockId + children: List[BlockOutput] | None = None + + class BlockId(BaseModel): page_id: int block_id: int | None = None @@ -22,9 +28,21 @@ def __repr__(self): return str(self) def __eq__(self, other): - if not isinstance(other, BlockId): + if not isinstance(other, (BlockId, str)): return NotImplemented - return self.page_id == other.page_id and self.block_id == other.block_id and self.block_type == other.block_type + + if isinstance(other, str): + return str(self) == other + else: + return self.page_id == other.page_id and self.block_id == other.block_id and self.block_type == other.block_type + + @field_validator("block_type") + @classmethod + def validate_block_type(cls, v): + from marker.v2.schema import BlockTypes + if not hasattr(BlockTypes, v): + raise ValueError(f"Invalid block type: {v}") + return v class Block(BaseModel): @@ -38,7 +56,7 @@ class Block(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) @property - def _id(self) -> BlockId: + def id(self) -> BlockId: return BlockId( page_id=self.page_id, block_id=self.block_id, @@ -49,9 +67,9 @@ def add_structure(self, block: Block): self.polygon = self.polygon.merge([block.polygon]) if self.structure is None: - self.structure = [block._id] + self.structure = [block.id] else: - self.structure.append(block._id) + self.structure.append(block.id) def update_structure_item(self, old_id: BlockId, new_id: BlockId): if self.structure is not None: @@ -82,14 +100,22 @@ def raw_text(self, document) -> str: text += "\n" return text - def render(self, document, renderer_list: list): - child_blocks = [] + def assemble_html(self, child_blocks): + template = "" + for c in child_blocks: + template += f"" + return template + + def render(self, document): + child_content = [] if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - if not block.rendered: - block.render(document, renderer_list) - child_blocks.append(block) + child_content.append(block.render(document)) - renderer = renderer_for_block(self, renderer_list) - self.rendered = renderer(document, self, child_blocks) + return BlockOutput( + html=self.assemble_html(child_content), + polygon=self.polygon, + id=self.id, + children=child_content + ) diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py index 3f13849..d4c511d 100644 --- a/marker/v2/schema/blocks/listitem.py +++ b/marker/v2/schema/blocks/listitem.py @@ -3,3 +3,7 @@ class ListItem(Block): block_type: str = "ListItem" + + def assemble_html(self, child_blocks): + template = super().assemble_html(child_blocks) + return f"
  • {template}
  • " diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py index 3bee947..e95d729 100644 --- a/marker/v2/schema/blocks/text.py +++ b/marker/v2/schema/blocks/text.py @@ -3,3 +3,7 @@ class Text(Block): block_type: str = "Text" + + def assemble_html(self, child_blocks): + template = super().assemble_html(child_blocks) + return f"

    {template}

    " diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index ef03764..c8c837c 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -4,9 +4,14 @@ from pydantic import BaseModel -from marker.v2.schema.blocks import BlockId +from marker.v2.schema.blocks import BlockId, BlockOutput from marker.v2.schema.groups.page import PageGroup -from marker.v2.renderers.util import renderer_for_block + + +class DocumentOutput(BaseModel): + children: List[BlockOutput] + html: str + block_type: str = "Document" class Document(BaseModel): @@ -21,12 +26,18 @@ def get_block(self, block_id: BlockId): return block return None - def render(self, renderer_lst: list | None = None): - if renderer_lst is None: - renderer_lst = [] + def assemble_html(self, child_blocks): + template = "" + for c in child_blocks: + template += f"" + return template + def render(self): + child_content = [] for page in self.pages: - page.render(self, renderer_lst) + child_content.append(page.render(self)) - doc_renderer = renderer_for_block(self, renderer_lst) - return doc_renderer(self, self, self.pages) + return DocumentOutput( + children=child_content, + html=self.assemble_html(child_content) + ) diff --git a/marker/v2/schema/groups/__init__.py b/marker/v2/schema/groups/__init__.py index dcf6a17..dbd47f2 100644 --- a/marker/v2/schema/groups/__init__.py +++ b/marker/v2/schema/groups/__init__.py @@ -3,12 +3,13 @@ from marker.v2.schema.groups.table import TableGroup from marker.v2.schema.groups.list import ListGroup from marker.v2.schema.groups.picture import PictureGroup +from marker.v2.schema.groups.page import PageGroup GROUP_BLOCK_REGISTRY = { v.model_fields['block_type'].default: v for k, v in locals().items() if isinstance(v, type) and issubclass(v, Block) and v != Block # Exclude the base Block class - and v.model_fields['block_type'].default.endswith("Group") + and (v.model_fields['block_type'].default.endswith("Group") or v.model_fields['block_type'].default == "Page") } diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py index c480a76..5220975 100644 --- a/marker/v2/schema/groups/list.py +++ b/marker/v2/schema/groups/list.py @@ -2,4 +2,8 @@ class ListGroup(Block): - block_type: str = "ListGroup" \ No newline at end of file + block_type: str = "ListGroup" + + def assemble_html(self, child_blocks): + template = super().assemble_html(child_blocks) + return f"" \ No newline at end of file diff --git a/marker/v2/schema/groups/page.py b/marker/v2/schema/groups/page.py index dddce32..349c5c8 100644 --- a/marker/v2/schema/groups/page.py +++ b/marker/v2/schema/groups/page.py @@ -43,5 +43,5 @@ def add_full_block(self, block: Block) -> Block: def get_block(self, block_id: BlockId) -> Block | None: for block in self.children: - if block._id == block_id: + if block.id == block_id: return block diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py index 545930c..c61756d 100644 --- a/marker/v2/schema/text/line.py +++ b/marker/v2/schema/text/line.py @@ -1,11 +1,25 @@ -from typing import List - -from marker.v2.schema.blocks import Block -from marker.v2.schema.text.span import Span +from marker.v2.schema.blocks import Block, BlockOutput class Line(Block): block_type: str = "Line" - def is_continuation(self, other): - pass + def assemble_html(self, child_blocks): + template = "" + for c in child_blocks: + template += c.html + return template + + def render(self, document): + child_content = [] + if self.structure is not None and len(self.structure) > 0: + for block_id in self.structure: + block = document.get_block(block_id) + child_content.append(block.render(document)) + + return BlockOutput( + html=self.assemble_html(child_content), + polygon=self.polygon, + id=self.id, + children=[] + ) diff --git a/marker/v2/schema/text/span.py b/marker/v2/schema/text/span.py index cce09b4..4fbb40d 100644 --- a/marker/v2/schema/text/span.py +++ b/marker/v2/schema/text/span.py @@ -21,3 +21,11 @@ def bold(self): @property def italic(self): return 'italic' in self.formats + + def assemble_html(self, child_blocks): + if len(self.text) > 3: + if self.italic: + return f"{self.text}" + elif self.bold: + return f"{self.text}" + return self.text