Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into feat-picture-description
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfim-ibm committed Feb 7, 2025
2 parents dbb35c7 + 02faf53 commit 80e0bef
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 53 deletions.
108 changes: 62 additions & 46 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@


class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"

Expand Down Expand Up @@ -81,15 +81,15 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
# very long sequences of underscores will lead to unnecessarily long processing times.
# In any proper Markdown file, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(text_stream)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
# remove invalid sequences
# very long sequences of underscores will lead to unnecessarily long processing times.
# In any proper Markdown file, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(md_content)
self.markdown = self._shorten_underscore_sequences(md_content)
self.valid = True

_log.debug(self.markdown)
Expand All @@ -99,7 +99,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) from e
return

def close_table(self, doc: DoclingDocument):
def _close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
Expand Down Expand Up @@ -156,30 +156,35 @@ def close_table(self, doc: DoclingDocument):
doc.add_table(data=table_data)
return

def process_inline_text(
self, parent_element: Optional[NodeItem], doc: DoclingDocument
def _process_inline_text(
self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_element,
parent=parent_item,
text=txt,
)
self.inline_texts = []

def iterate_elements(
def _iterate_elements(
self,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
):

if element in visited:
return

# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
Expand Down Expand Up @@ -207,8 +212,8 @@ def traverse(node: marko.block.BlockElement):
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
)

elif isinstance(element, marko.block.List):
Expand All @@ -218,35 +223,37 @@ def traverse(node: marko.block.BlockElement):
has_non_empty_list_items = True
break

self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_element = doc.add_group(
label=label, name=f"list", parent=parent_element
parent_item = doc.add_group(
label=label, name=f"list", parent=parent_item
)

elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")

snippet_text = str(element.children[0].children[0].children) # type: ignore
first_child = element.children[0]
snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
parent_element is not None
and isinstance(parent_element, DocItem)
and parent_element.label == GroupLabel.ORDERED_LIST
parent_item is not None
and isinstance(parent_item, DocItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
enumerated=is_numbered, parent=parent_item, text=snippet_text
)
visited.add(first_child)

elif isinstance(element, marko.inline.Image):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

fig_caption: Optional[TextItem] = None
Expand All @@ -255,10 +262,10 @@ def traverse(node: marko.block.BlockElement):
label=DocItemLabel.CAPTION, text=element.title
)

doc.add_picture(parent=parent_element, caption=fig_caption)
doc.add_picture(parent=parent_item, caption=fig_caption)

elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self.process_inline_text(parent_element, doc)
self._process_inline_text(parent_item, doc)

elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
Expand All @@ -272,28 +279,27 @@ def traverse(node: marko.block.BlockElement):
else:
self.md_table_buffer.append(snippet_text)
else:
self.close_table(doc)
self.in_table = False
self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))

elif isinstance(element, marko.inline.CodeSpan):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_element, text=snippet_text)
doc.add_code(parent=parent_item, text=snippet_text)

elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
doc.add_code(parent=parent_element, text=snippet_text)
doc.add_code(parent=parent_item, text=snippet_text)

elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
Expand All @@ -302,8 +308,8 @@ def traverse(node: marko.block.BlockElement):

elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self.process_inline_text(parent_element, doc)
self.close_table(doc)
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.body) > 0
Expand All @@ -312,18 +318,16 @@ def traverse(node: marko.block.BlockElement):

# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_element, text=text_to_add)
doc.add_code(parent=parent_item, text=text_to_add)
else:
if not isinstance(element, str):
self.close_table(doc)
self._close_table(doc)
_log.debug("Some other element: {}".format(element))

processed_block_types = (
marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
# marko.block.Paragraph,
marko.inline.RawText,
)

Expand All @@ -332,7 +336,13 @@ def traverse(node: marko.block.BlockElement):
element, processed_block_types
):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
self._iterate_elements(
element=child,
depth=depth + 1,
doc=doc,
visited=visited,
parent_item=parent_item,
)

def is_valid(self) -> bool:
return self.valid
Expand Down Expand Up @@ -366,9 +376,15 @@ def convert(self) -> DoclingDocument:
marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
self.close_table(doc=doc) # handle any last hanging table
self._iterate_elements(
element=parsed_ast,
depth=0,
doc=doc,
parent_item=None,
visited=set(),
)
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table

# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
Expand Down
2 changes: 1 addition & 1 deletion docling/models/code_formula_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
Processes the given batch of elements and enriches them with predictions.
"""

_model_repo_folder = "CodeFormula"
_model_repo_folder = "ds4sd--CodeFormula"
elements_batch_size = 5
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
expansion_factor = 0.03
Expand Down
2 changes: 1 addition & 1 deletion docling/models/document_picture_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
Processes a batch of elements and adds classification annotations.
"""

_model_repo_folder = "DocumentFigureClassifier"
_model_repo_folder = "ds4sd--DocumentFigureClassifier"
images_scale = 2

def __init__(
Expand Down
2 changes: 1 addition & 1 deletion docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


class LayoutModel(BasePageModel):
_model_repo_folder = "docling-models"
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/layout"

TEXT_ELEM_LABELS = [
Expand Down
2 changes: 1 addition & 1 deletion docling/models/table_structure_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


class TableStructureModel(BasePageModel):
_model_repo_folder = "docling-models"
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"

def __init__(
Expand Down
31 changes: 31 additions & 0 deletions tests/data/groundtruth/docling_v2/nested.md.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Nesting

A list featuring nesting:

- abc
- abc123
- abc1234
- abc12345
- a.
- b.
- abcd1234:
- abcd12345:
- a.
- b.
- def:
- def1234:
- def12345。
- after one empty line
- foo
- afer two empty lines
- bar

- changing symbol

A nested HTML list:

- First item
- Second item with subitems:
- Subitem 1
- Subitem 2
- Last list item
66 changes: 66 additions & 0 deletions tests/data/md/nested.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Nesting

A list featuring nesting:

- abc
- abc123
- abc1234
- abc12345
- a.
- b.
- abcd1234:
- abcd12345:
- a.
- b.
- def:
- def1234:
- def12345。

- after one empty line
- foo


- afer two empty lines
- bar
* changing symbol

A nested HTML list:

<ul>
<li>First item</li>
<li>Second item with subitems:
<ul>
<li>Subitem 1</li>
<li>Subitem 2</li>
</ul>
</li>
<li>Last list item</li>
</ul>

<!--
Table nesting apparently not yet supported by the HTML backend:
<table>
<tr>
<td>Cell</td>
<td>Nested Table
<table>
<tr>
<td>Cell 1</td>
<>
</tr>
<tr>
<td>Cell 2</td>
</tr>
<tr>
<td>Cell 3</td>
</tr>
<tr>
<td>Cell 4</td>
</tr>
</table>
</td>
</tr>
<tr><td>additional row</td></tr>
</table>
-->
Loading

0 comments on commit 80e0bef

Please sign in to comment.