diff --git a/pagexml/helper/file_helper.py b/pagexml/helper/file_helper.py index ca03a62..71a194f 100644 --- a/pagexml/helper/file_helper.py +++ b/pagexml/helper/file_helper.py @@ -1,4 +1,3 @@ -import glob import os import tarfile import zipfile diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py index 1bbeee7..64e5a13 100644 --- a/pagexml/helper/pagexml_helper.py +++ b/pagexml/helper/pagexml_helper.py @@ -350,7 +350,7 @@ def read_line_format_file(line_format_files: Union[str, List[str]], else: if len(row) > len(headers): raise IndexError( - f"Missing columns. Header has {len(headers)} columns while line {li+1} in row " + f"Missing columns. Header has {len(headers)} columns while line {li + 1} in row " f"has {len(row)} columns") yield {header: row[hi] if len(row) > hi else None for hi, header in enumerate(headers)} @@ -380,19 +380,20 @@ def get_custom_tags(doc: pdm.PageXMLDoc) -> List[Dict[str, any]]: offset = tag_el["offset"] length = tag_el["length"] - value = line.text[offset:offset+length] + value = line.text[offset:offset + length] custom_tags.append({ - "type": tag, - "value": value, - "region_id": region.id, - "line_id": line.id, - "offset": offset, + "type": tag, + "value": value, + "region_id": region.id, + "line_id": line.id, + "offset": offset, "length": length, }) return custom_tags + class LineIterable: def __init__(self, line_format_files: Union[str, List[str]], headers: List[str] = None): @@ -532,5 +533,3 @@ def merge_lines(lines: List[pdm.PageXMLTextLine], remove_word_break: bool = Fals text += curr_line.text return pdm.PageXMLTextLine(metadata=copy.deepcopy(lines[0].metadata), coords=coords, text=text) - - diff --git a/tests/physical_document_model_test.py b/tests/physical_document_model_test.py index 3d38d7e..75fb037 100644 --- a/tests/physical_document_model_test.py +++ b/tests/physical_document_model_test.py @@ -2,7 +2,6 @@ from unittest.mock import Mock import pagexml.model.physical_document_model as pdm -# from pagexml.model.physical_document_model import pdm.Coords, pdm.StructureDoc, PhysicalStructureDoc, pdm.LogicalStructureDoc class TestCoords(unittest.TestCase): @@ -52,14 +51,14 @@ def test_list_of_line_point_coords_to_hull_of_coords(self): def test_valid_points_from_str(self): coords = pdm.Coords('1216,1119 1205,1109 1202,1109 1198,1112 1195,1112 1191,1116 1164,1116 1160,1119 1147,1119' - ' 1143,1123 1126,1123 1123,1126 1102,1126 1098,1130 1074,1130 1071,1133 1016,1133 1012,1136' - ' 964,1136 961,1140 957,1140 954,1143 940,1143 937,1147 930,1147 926,1150 916,1150 912,1154' - ' 899,1154 895,1157 888,1157 885,1160 882,1160 878,1164 875,1164 857,1181 847,1181 840,1188' - ' 837,1188 833,1191 830,1191 826,1195 823,1195 820,1198 816,1198 813,1202 809,1202 795,1216' - ' 795,1229 799,1229 802,1233 813,1233 816,1236 875,1236 878,1240 895,1240 899,1243 923,1243' - ' 926,1247 1036,1247 1040,1243 1147,1243 1150,1240 1181,1240 1185,1236 1209,1236 1212,1233' - ' 1216,1233 1219,1229 1219,1226 1222,1222 1222,1216 1219,1212 1219,1209 1216,1205 1216,1150' - ' 1219,1147 1219,1143 1216,1140') + ' 1143,1123 1126,1123 1123,1126 1102,1126 1098,1130 1074,1130 1071,1133 1016,1133 1012,1136' + ' 964,1136 961,1140 957,1140 954,1143 940,1143 937,1147 930,1147 926,1150 916,1150 912,1154' + ' 899,1154 895,1157 888,1157 885,1160 882,1160 878,1164 875,1164 857,1181 847,1181 840,1188' + ' 837,1188 833,1191 830,1191 826,1195 823,1195 820,1198 816,1198 813,1202 809,1202 795,1216' + ' 795,1229 799,1229 802,1233 813,1233 816,1236 875,1236 878,1240 895,1240 899,1243 923,1243' + ' 926,1247 1036,1247 1040,1243 1147,1243 1150,1240 1181,1240 1185,1236 1209,1236 1212,1233' + ' 1216,1233 1219,1229 1219,1226 1222,1222 1222,1216 1219,1212 1219,1209 1216,1205 1216,1150' + ' 1219,1147 1219,1143 1216,1140') x = [p[0] for p in coords.points] print(min(x), max(x)) y = [p[1] for p in coords.points]