Skip to content

Commit

Permalink
Improve merge_columns documentation, pass metadata to TRs
Browse files: browse the repository at this point in the history
  • Loading branch information
marijnkoolen committed Sep 27, 2023
1 parent 602c4ca commit eb444ef
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
6 changes: 4 additions & 2 deletions pagexml/column_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,11 @@ def make_derived_column(lines: List[pdm.PageXMLTextLine], metadata: dict, page_i

def merge_columns(columns: List[pdm.PageXMLColumn],
doc_id: str, metadata: dict) -> pdm.PageXMLColumn:
"""Merge two columns into one, sorting lines by baseline height."""
"""Merge a list of columns into one. First, all text regions of all columns are
checked for spatial overlap, whereby overlapping text regions are merged.
Within the merged text regions, lines are sorted by baseline height."""
trs = [tr for col in columns for tr in col.text_regions]
merged_tr = pagexml_helper.merge_textregions(trs)
merged_tr = pagexml_helper.merge_textregions(trs, metadata)
merged_coords = copy.deepcopy(merged_tr.coords)
merged_col = pdm.PageXMLColumn(doc_id=doc_id, doc_type='index_column',
metadata=metadata, coords=merged_coords,
Expand Down
2 changes: 1 addition & 1 deletion pagexml/helper/pagexml_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def merge_sets(sets: List[Set[any]], min_overlap: int = 1) -> List[Set[any]]:


def merge_textregions(text_regions: List[pdm.PageXMLTextRegion],
metadata: dict, doc_id: str = None) -> Union[pdm.PageXMLTextRegion, None]:
metadata: dict = None, doc_id: str = None) -> Union[pdm.PageXMLTextRegion, None]:
"""Merge a list of text regions into one, sorting lines by baseline height."""
if len(text_regions) == 0:
return None
Expand Down

0 comments on commit eb444ef

Please sign in to comment.