Merge pull request #12 from knaw-huc/merge_columns
brambg committed Sep 27, 2023
2 parents 9ac725e + eb444ef commit 4896ae7
Showing 9 changed files with 253 additions and 57 deletions.
99 changes: 78 additions & 21 deletions pagexml/analysis/layout_stats.py
@@ -95,6 +95,17 @@ def interpolate_baseline_points(points: List[Tuple[int, int]],
return interpolated_baseline_points


def compute_points_distances(points1: List[Tuple[int, int]], points2: List[Tuple[int, int]],
step: int = 50):
if points1 is None or points2 is None:
return np.array([])
b1_points = interpolate_baseline_points(points1, step=step)
b2_points = interpolate_baseline_points(points2, step=step)
distances = np.array([abs(b2_points[curr_x] - b1_points[curr_x]) for curr_x in b1_points
if curr_x in b2_points])
return distances


def compute_baseline_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
line2: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
step: int = 50) -> np.ndarray:
@@ -124,19 +135,29 @@ def compute_baseline_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.PageXM
points2 = line2.baseline.points if line2.baseline.points is not None else []
else:
points2 = [point for line in line2 for point in line.baseline.points if line.baseline.points is not None]
-    if points1 is None or points2 is None:
-        return np.array([])
-    b1_points = interpolate_baseline_points(points1, step=step)
-    b2_points = interpolate_baseline_points(points2, step=step)
-    distances = np.array([abs(b2_points[curr_x] - b1_points[curr_x]) for curr_x in b1_points
-                          if curr_x in b2_points])
+    distances = compute_points_distances(points1, points2, step=step)
if len(distances) == 0:
avg1 = average_baseline_height(line1)
avg2 = average_baseline_height(line2)
distances = np.array([abs(avg1 - avg2)])
return distances
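
As a side note on this hunk: the distance computation that compute_points_distances now encapsulates can be illustrated with a standalone sketch. Both baselines are sampled at a fixed horizontal step and the absolute vertical difference is taken at every shared x position. interpolate_at_step below is a simplified stand-in for interpolate_baseline_points (assumed to return a dict of x -> y), and the baseline points are invented.

import numpy as np
from typing import Dict, List, Tuple


def interpolate_at_step(points: List[Tuple[int, int]], step: int = 50) -> Dict[int, int]:
    # Simplified stand-in: sample the baseline polyline at multiples of `step`.
    xs = [x for x, _ in points]
    ys = [y for _, y in points]
    start = ((min(xs) + step - 1) // step) * step
    return {x: int(np.interp(x, xs, ys)) for x in range(start, max(xs) + 1, step)}


baseline1 = [(100, 420), (900, 428)]   # invented baseline of line 1
baseline2 = [(120, 495), (880, 502)]   # invented baseline of line 2
b1, b2 = interpolate_at_step(baseline1), interpolate_at_step(baseline2)
distances = np.array([abs(b2[x] - b1[x]) for x in b1 if x in b2])
print(distances.mean())   # rough vertical gap between the two baselines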


def get_bottom_points(line: pdm.PageXMLTextLine) -> List[Tuple[int, int]]:
right_most = [p for p in line.coords.points if p[0] == line.coords.right][0]
right_most_index = line.coords.points.index(right_most)
return line.coords.points[right_most_index:]


def compute_bounding_box_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
line2: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
step: int = 50):
points1 = get_bottom_points(line1)
points2 = get_bottom_points(line2)
distances = compute_points_distances(points1, points2, step=step)
return distances
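
The fallback added here (get_bottom_points plus compute_bounding_box_distances) uses the bottom edge of a line's bounding polygon when no baseline is available. A small illustration with an invented polygon, assuming the usual PageXML convention of listing coordinates clockwise from the top-left corner, so the points from the right-most vertex onward trace the bottom edge:

polygon = [(100, 380), (880, 382), (900, 440), (500, 443), (100, 441)]   # invented bounding polygon
right = max(x for x, _ in polygon)
right_most_index = next(i for i, p in enumerate(polygon) if p[0] == right)
bottom_points = polygon[right_most_index:]
print(bottom_points)   # [(900, 440), (500, 443), (100, 441)] -- usable as a baseline substitute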


def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]) -> int:
"""Compute the average (mean) baseline height for comparing lines that
are not horizontally aligned.
@@ -179,8 +200,9 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
return int(total_avg)


-def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, debug: int = 0) -> Tuple[List[Tuple[int, int]],
-                                                                                          List[Tuple[int, int]]]:
+def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
+                                     debug: int = 0) -> Tuple[List[Tuple[int, int]],
+                                                               List[Tuple[int, int]]]:
"""Split the list of bounding polygon coordinates of a line in sets of points above and below
the baseline. When a line has no baseline or no bounding polygon, empty lists are
returned
@@ -195,18 +217,25 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, debug: int = 0)
ci_c = 0
below_baseline = []
above_baseline = []
-    if line.baseline is None:
+    if line.baseline is None or line.coords is None:
        return above_baseline, below_baseline
+    if not line.baseline or not line.coords:
+        return above_baseline, below_baseline
+    if line.coords.right < line.baseline.left:
+        return above_baseline, below_baseline
+    if line.coords.left > line.baseline.right:
+        return above_baseline, below_baseline
interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=50).items()]
if debug > 2:
print('baseline_points:', line.baseline.points)
print('interpolated_baseline_points:', interpolated_baseline_points)
sorted_coord_points = sorted(line.coords.points, key=lambda p: p[0])
if debug > 0:
print('sorted_coord_points:', sorted_coord_points)
print('len(sorted_coord_points):', len(sorted_coord_points))
if debug > 1:
print('ci_c:', ci_c)
-    num_baseline_points = len(line.baseline.points)
+    num_baseline_points = len(interpolated_baseline_points)
num_coord_points = len(sorted_coord_points)
for ci_b, curr_b in enumerate(interpolated_baseline_points):
curr_bx, curr_by = curr_b
@@ -223,24 +252,28 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, debug: int = 0)
if debug > 0:
print(f'sort_above_below - curr_c ({ci_c}): {curr_c}')
ci_c += 1
-            if curr_cy > curr_by:
-                if debug > 0:
-                    print(f'sort_above_below - below')
-                below_baseline.append(curr_c)
-            elif curr_cy < curr_by:
+            if curr_cy < curr_by:
                if debug > 0:
                    print(f'sort_above_below - above')
                above_baseline.append(curr_c)
            else:
                if debug > 0:
-                    print(f'sort_above_below - neither')
-                pass
+                    print(f'sort_above_below - below')
+                below_baseline.append(curr_c)

return above_baseline, below_baseline
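
The reordered branches above also fold the "point exactly on the baseline" case into below_baseline. A minimal sketch of the split with invented numbers, assuming PageXML image coordinates where y grows downward (a smaller y means the point lies above the baseline):

baseline_y_at = {100: 420, 150: 420, 200: 421, 250: 421}    # x -> interpolated baseline y (invented)
polygon = [(100, 380), (250, 382), (250, 440), (100, 441)]  # invented bounding polygon points
above = [(x, y) for x, y in polygon if y < baseline_y_at[x]]
below = [(x, y) for x, y in polygon if y >= baseline_y_at[x]]
print(above)   # [(100, 380), (250, 382)]
print(below)   # [(250, 440), (100, 441)]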


-def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50) -> np.array:
-    above_baseline, below_baseline = sort_coords_above_below_baseline(line)
+def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50,
+                     ignore_errors: bool = True, debug: int = 0) -> np.array:
+    above_baseline, below_baseline = sort_coords_above_below_baseline(line, debug=debug)
if len(above_baseline) == 0:
if ignore_errors is False:
ValueError(f'line {line.id} has no bounding coordinates above baseline')
return None
if len(below_baseline) == 0:
if ignore_errors is False:
ValueError(f'Warning: line {line.id} has no bounding coordinates below baseline')
int_base = interpolate_baseline_points(line.baseline.points, step=step)
int_above = interpolate_baseline_points(above_baseline, step=step)

@@ -249,10 +282,13 @@ def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50) -> np.array:
if x in int_above:
height[x] = int_base[x] - int_above[x]

if len(height) == 0:
print()
return None
return np.array(list(height.values()))
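
Reduced to its core, get_text_heights measures, at every interpolated x, the distance between the baseline and the upper edge of the bounding polygon, and returns the per-x values as a numpy array. A tiny worked example with invented interpolation results:

import numpy as np

int_base = {100: 420, 150: 420, 200: 421}    # x -> interpolated baseline y (invented)
int_above = {100: 381, 150: 382, 200: 382}   # x -> interpolated top edge y (invented)
height = {x: int_base[x] - int_above[x] for x in int_base if x in int_above}
print(np.array(list(height.values())))       # [39 38 39]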


-def get_height_stats(line_heights: np.array) -> Dict[str, int]:
+def compute_height_stats(line_heights: np.array) -> Dict[str, int]:
return {
'max': line_heights.max(),
'min': line_heights.min(),
@@ -261,14 +297,35 @@ def get_height_stats(line_heights: np.array) -> Dict[str, int]:
}


def get_line_height_stats(line: pdm.PageXMLTextLine, step: int = 50,
ignore_errors: bool = False, debug: int = 0) -> Union[Dict[str, int], None]:
try:
line_heights = get_text_heights(line, step=step, ignore_errors=ignore_errors, debug=debug)
if debug > 0:
print('get_line_height_stats - line_heights:', line_heights)
if line_heights is None:
return None
return compute_height_stats(line_heights)
except IndexError:
print('ERROR INFO:')
print('get_line_height_stats - line.baseline:', line.baseline)
print('get_line_height_stats - line.coords:', line.coords)
raise
except AttributeError:
return None
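
A hypothetical way to call the new helper once this branch is installed, assuming pagexml.parser exposes parse_pagexml_file, that the parsed scan offers get_lines(), and that 'example-page.xml' is a placeholder path. With ignore_errors=True, lines without usable coordinates simply produce None instead of raising:

import pagexml.parser as parser
from pagexml.analysis.layout_stats import get_line_height_stats

scan = parser.parse_pagexml_file('example-page.xml')   # placeholder path (assumption)
for line in scan.get_lines():                          # get_lines() on the scan is assumed
    stats = get_line_height_stats(line, step=50, ignore_errors=True)
    if stats is not None:
        print(line.id, stats)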


def get_line_distances(lines: List[pdm.PageXMLTextLine]) -> List[np.ndarray]:
all_distances = []
for li, curr_line in enumerate(lines):
next_line = None
if li + 1 < len(lines):
next_line = lines[li + 1]
if next_line:
-            distances = compute_baseline_distances(curr_line, next_line)
+            if curr_line.baseline and next_line.baseline:
+                distances = compute_baseline_distances(curr_line, next_line)
+            else:
+                distances = compute_bounding_box_distances(curr_line, next_line)
all_distances.append(distances)
return all_distances
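
The shape of the new fallback in get_line_distances, reduced to plain dicts standing in for pdm.PageXMLTextLine objects: every line is compared to its successor, and the bounding-box distance is only used when one of the two baselines is missing.

lines = [{'id': 'l1', 'baseline': [(0, 100), (500, 102)]},   # invented lines
         {'id': 'l2', 'baseline': None},
         {'id': 'l3', 'baseline': [(0, 200), (500, 201)]}]
for curr, nxt in zip(lines, lines[1:]):
    method = 'baseline' if curr['baseline'] and nxt['baseline'] else 'bounding box'
    print(curr['id'], '->', nxt['id'], 'compared via', method)
# l1 -> l2 compared via bounding box
# l2 -> l3 compared via bounding box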

1 change: 0 additions & 1 deletion pagexml/analysis/text_stats.py
@@ -548,7 +548,6 @@ def _set_merged_with(self, lines: Iterable[Union[str, Dict[str, str]]],
min_common_freq: int = 1000) -> None:
prev_words = []
typical_start_words, typical_end_words = get_typical_start_end_words(self)
-    li = 0
for li, line in enumerate(lines):
if line["text"] is None:
continue
14 changes: 8 additions & 6 deletions pagexml/column_parser.py
@@ -187,14 +187,16 @@ def make_derived_column(lines: List[pdm.PageXMLTextLine], metadata: dict, page_i

def merge_columns(columns: List[pdm.PageXMLColumn],
doc_id: str, metadata: dict) -> pdm.PageXMLColumn:
"""Merge two columns into one, sorting lines by baseline height."""
merged_lines = [line for col in columns for line in col.get_lines()]
merged_lines = list(set(merged_lines))
sorted_lines = sorted(merged_lines, key=lambda x: x.baseline.y)
merged_coords = pdm.parse_derived_coords(sorted_lines)
"""Merge a list of columns into one. First, all text regions of all columns are
checked for spatial overlap, whereby overlapping text regions are merged.
Within the merged text regions, lines are sorted by baseline height."""
trs = [tr for col in columns for tr in col.text_regions]
merged_tr = pagexml_helper.merge_textregions(trs, metadata)
merged_coords = copy.deepcopy(merged_tr.coords)
merged_col = pdm.PageXMLColumn(doc_id=doc_id, doc_type='index_column',
metadata=metadata, coords=merged_coords,
lines=merged_lines)
text_regions=[merged_tr])
merged_col.set_as_parent([merged_tr])
return merged_col


54 changes: 51 additions & 3 deletions pagexml/helper/pagexml_helper.py
@@ -24,6 +24,8 @@ def elements_overlap(element1: pdm.PageXMLDoc, element2: pdm.PageXMLDoc,
if v_overlap / element2.coords.height > threshold:
if h_overlap / element2.coords.width > threshold:
return True
else:
return False
else:
return False
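
The added else branch makes the inner check return False explicitly instead of falling through. A self-contained sketch of the same rule on plain (left, top, right, bottom) boxes; boxes_overlap is a hypothetical stand-in for elements_overlap, which operates on pdm.PageXMLDoc objects, and the 0.5 default threshold is only illustrative:

def boxes_overlap(box1, box2, threshold: float = 0.5) -> bool:
    # Both the vertical and the horizontal overlap must cover more than
    # `threshold` of box2's height and width, mirroring the check above.
    l1, t1, r1, b1 = box1
    l2, t2, r2, b2 = box2
    v_overlap = max(0, min(b1, b2) - max(t1, t2))
    h_overlap = max(0, min(r1, r2) - max(l1, l2))
    return v_overlap / (b2 - t2) > threshold and h_overlap / (r2 - l2) > threshold


print(boxes_overlap((0, 0, 100, 100), (50, 50, 150, 150)))   # False: exactly 50% overlap in both directions
print(boxes_overlap((0, 0, 100, 100), (20, 20, 120, 120)))   # True: 80% overlap in both directions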

@@ -81,6 +83,43 @@ def horizontal_group_lines(lines: List[pdm.PageXMLTextLine]) -> List[List[pdm.Pa
return horizontally_grouped_lines


def merge_sets(sets: List[Set[any]], min_overlap: int = 1) -> List[Set[any]]:
merged_sets = []

while len(sets) > 0:
current_set = sets.pop(0)
merged_set = set(current_set)

i = 0
while i < len(sets):
if len(merged_set.intersection(sets[i])) >= min_overlap:
merged_set.update(sets[i])
sets.pop(i)
else:
i += 1

merged_sets.append(merged_set)

return merged_sets
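
A worked example of the new merge_sets helper, assuming this branch is installed: sets that share at least min_overlap elements are folded together transitively, so a~b and b~c end up in one set. Note that the function consumes the list it is given (it pops from it), so pass a copy if the original is still needed.

from pagexml.helper.pagexml_helper import merge_sets

groups = [{'a', 'b'}, {'b', 'c'}, {'x', 'y'}, {'c', 'd'}]
print(merge_sets(list(groups)))
# [{'a', 'b', 'c', 'd'}, {'x', 'y'}]  (element order inside each set may vary)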


def merge_textregions(text_regions: List[pdm.PageXMLTextRegion],
metadata: dict = None, doc_id: str = None) -> Union[pdm.PageXMLTextRegion, None]:
"""Merge two text_regions into one, sorting lines by baseline height."""
if len(text_regions) == 0:
return None
merged_lines = [line for tr in text_regions for line in tr.get_lines()]
merged_lines = list(set(merged_lines))
sorted_lines = sorted(merged_lines, key=lambda x: x.baseline.y)
merged_coords = pdm.parse_derived_coords(sorted_lines)
merged_tr = pdm.PageXMLTextRegion(doc_id=doc_id, doc_type='index_text_region',
metadata=metadata, coords=merged_coords,
lines=sorted_lines)
if doc_id is None:
merged_tr.set_derived_id(text_regions[0].parent.id)
return merged_tr
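
merge_textregions deduplicates the lines of all regions, sorts them by baseline height and derives new coordinates for the merged region. The derived-coordinates step can be thought of as taking a bounding box over everything that was merged; a stand-in sketch with plain (left, top, right, bottom) tuples instead of pdm coordinate objects:

line_boxes = [(100, 400, 900, 460), (110, 470, 890, 530), (100, 400, 900, 460)]   # invented, with one duplicate
unique = set(line_boxes)
left, top = min(b[0] for b in unique), min(b[1] for b in unique)
right, bottom = max(b[2] for b in unique), max(b[3] for b in unique)
print(left, top, right, bottom)   # 100 400 900 530 -- the merged region's bounding box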


def horizontally_merge_lines(lines: List[pdm.PageXMLTextLine]) -> List[pdm.PageXMLTextLine]:
"""Sort lines vertically and merge horizontally adjacent lines."""
horizontally_grouped_lines = horizontal_group_lines(lines)
@@ -368,7 +407,8 @@ def __iter__(self):


def make_line_text(line: pdm.PageXMLTextLine, do_merge: bool,
-                   end_word: str, merge_word: str, word_break_chars: Union[str, Set[str]] = '-') -> str:
+                   end_word: str, merge_word: str,
+                   word_break_chars: Union[str, Set[str], List[str]] = '-') -> str:
line_text = line.text
if len(line_text) >= 2 and line_text[-1] in word_break_chars and line_text[-2] in word_break_chars:
# remove the redundant line break char
@@ -402,7 +442,7 @@ def make_line_range(text: str, line: pdm.PageXMLTextLine, line_text: str) -> Dic


def make_text_region_text(lines: List[pdm.PageXMLTextLine],
-                          word_break_chars: List[str] = '-',
+                          word_break_chars: Union[str, Set[str], List[str]] = '-',
wbd: text_stats.WordBreakDetector = None) -> Tuple[Union[str, None], List[Dict[str, any]]]:
"""Turn the text lines in a region into a single paragraph of text, with a list of line ranges
that indicates how the text of each line corresponds to character offsets in the paragraph.
@@ -428,6 +468,7 @@ def make_text_region_text(lines: List[pdm.PageXMLTextLine],
prev_words = text_helper.get_line_words(prev_line.text, word_break_chars=word_break_chars) \
if prev_line.text else []
if len(lines) > 1:
remove_prefix_word_break = False
for curr_line in lines[1:]:
if curr_line.text is None or curr_line.text == '':
do_merge = False
@@ -440,10 +481,17 @@ def make_text_region_text(lines: List[pdm.PageXMLTextLine],
if prev_line.text is not None:
do_merge, merge_word = text_stats.determine_word_break(curr_words, prev_words,
wbd=wbd,
-                                                                           word_break_chars=word_break_chars)
+                                                                           word_break_chars=word_break_chars,
+                                                                           debug=False)
# print(do_merge, merge_word)
prev_line_text = make_line_text(prev_line, do_merge, prev_words[-1], merge_word,
word_break_chars=word_break_chars)
if remove_prefix_word_break and prev_line_text.startswith('„'):
prev_line_text = prev_line_text[1:]
if '„' in word_break_chars and prev_words[-1].endswith('„') and curr_line.text.startswith('„'):
remove_prefix_word_break = True
else:
remove_prefix_word_break = False
# print(prev_line_text)
else:
prev_line_text = ''
22 changes: 15 additions & 7 deletions pagexml/helper/text_helper.py
@@ -8,11 +8,14 @@
import pagexml.parser as parser


-def read_lines_from_line_files(pagexml_line_files: Union[str, List[str]]) -> Generator[str, None, None]:
+def read_lines_from_line_files(pagexml_line_files: Union[str, List[str]],
+                               has_headers: bool = True) -> Generator[str, None, None]:
if isinstance(pagexml_line_files, str):
pagexml_line_files = [pagexml_line_files]
-    for line_file in pagexml_line_files:
+    for li, line_file in enumerate(pagexml_line_files):
with gzip.open(line_file, 'rt') as fh:
if has_headers is True and li > 0:
_headers = next(fh)
for line in fh:
yield line
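
A self-contained check of the new has_headers behaviour, assuming this branch is installed; the two gzipped TSV files and their column names are invented, and only the header row of the first file should come through:

import gzip

from pagexml.helper.text_helper import read_lines_from_line_files

for name, rows in [('part1.tsv.gz', ['doc_id\ttext', 'scan1\tfirst line']),
                   ('part2.tsv.gz', ['doc_id\ttext', 'scan2\tsecond line'])]:
    with gzip.open(name, 'wt') as fh:
        fh.write('\n'.join(rows) + '\n')

for line in read_lines_from_line_files(['part1.tsv.gz', 'part2.tsv.gz'], has_headers=True):
    print(line.rstrip())
# doc_id  text
# scan1   first line
# scan2   second line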

@@ -108,11 +111,11 @@ def __init__(self, pagexml_files: Union[str, List[str]] = None,
raise TypeError(f"MUST use one of the following optional arguments: "
f"'pagexml_files', 'pagexml_docs' or 'pagexml_line_file'.")
if pagexml_line_files:
-            self.pagexml_line_files = make_list(pagexml_line_files)
+            self.pagexml_line_files = sorted(make_list(pagexml_line_files))
        if pagexml_files:
-            self.pagexml_files = make_list(pagexml_files)
+            self.pagexml_files = sorted(make_list(pagexml_files))
        if pagexml_docs:
-            self.pagexml_docs = make_list(pagexml_docs)
+            self.pagexml_docs = sorted(make_list(pagexml_docs))

def __iter__(self) -> Generator[Dict[str, str], None, None]:
if self.groupby is None:
@@ -149,7 +152,7 @@ def _iter_from_pagexml_docs(self, pagexml_doc_iterator) -> Generator[Dict[str, a
yield line

def _iter_from_line_file(self) -> Generator[Dict[str, any], None, None]:
-        line_iterator = read_lines_from_line_files(self.pagexml_line_files)
+        line_iterator = read_lines_from_line_files(self.pagexml_line_files, has_headers=self.has_headers)
if self.has_headers is True:
header_line = next(line_iterator)
self.line_file_headers = header_line.strip().split('\t')
@@ -190,7 +193,12 @@ def read_pagexml_docs_from_line_file(line_files: Union[str, List[str]], has_head
# print(line_dict)
doc_coords, tr_coords, line_coords = None, None, None
if add_bounding_box is True:
-            doc_coords = transform_box_to_coords(line_dict['doc_box'])
+            try:
+                doc_coords = transform_box_to_coords(line_dict['doc_box'])
+            except ValueError:
+                print(line_dict['doc_box'])
+                print(line_dict)
+                raise
tr_coords = transform_box_to_coords(line_dict['textregion_box'])
# print('\t', tr_coords, line_dict['textregion_box'])
if line_dict['line_box'] is None: