|
| 1 | +# -*- encoding: utf-8 -*- |
| 2 | +import numpy as np |
| 3 | + |
| 4 | + |
class ToMarkdown:
    """Rebuild an approximate page layout, as Markdown text, from OCR boxes."""

    # Two boxes are treated as being on the same line when their vertical
    # centers differ by less than this fraction of the shorter box's height.
    SAME_LINE_CENTER_RATIO = 0.5
    # A blank line (new paragraph) is emitted when the vertical gap to the
    # previous box exceeds this fraction of the previous box's height.
    PARAGRAPH_GAP_RATIO = 0.7

    @classmethod
    def to(cls, boxes, txts) -> str:
        """
        Convert OCR boxes and their texts into layout-preserving Markdown.

        Boxes are visited top-to-bottom. Consecutive boxes that overlap
        vertically (or whose vertical centers are close) are merged into one
        output line, ordered left-to-right and separated by a space. A blank
        line is inserted when the vertical gap between two lines is large
        enough to suggest a new paragraph.

        Args:
            boxes: Iterable of quadrilaterals, each convertible to an array of
                shape (4, 2) holding ``[x, y]`` corner points.
            txts: Iterable of recognized strings, paired with ``boxes`` by
                position. Extra items in the longer iterable are ignored.

        Returns:
            str: The reconstructed text. A fixed "nothing detected" message is
            returned when either argument is ``None``; an empty string is
            returned when there are no box/text pairs.
        """
        if boxes is None or txts is None:
            return "没有检测到任何文本。"

        # Compute each box's geometry exactly once instead of re-deriving it
        # for every sort-key evaluation and again inside the loop.
        items = [(cls.get_box_properties(box), text) for box, text in zip(boxes, txts)]
        if not items:
            return ""

        # Primary key: top edge; secondary key: left edge. The sort is stable
        # and keyed only on geometry, so texts are never compared.
        items.sort(key=lambda item: (item[0]["top"], item[0]["left"]))

        output_lines = []
        prev_props, first_text = items[0]
        # Each entry is (left edge, text) so the line can be put in reading
        # order once it is complete.
        current_line = [(prev_props["left"], first_text)]

        for current_props, text in items[1:]:
            # Heuristic 1: vertical centers are close to each other.
            min_height = min(current_props["height"], prev_props["height"])
            centers_are_close = abs(
                current_props["center_y"] - prev_props["center_y"]
            ) < (min_height * cls.SAME_LINE_CENTER_RATIO)

            # Heuristic 2: the two boxes share some vertical extent.
            overlap_top = max(prev_props["top"], current_props["top"])
            overlap_bottom = min(prev_props["bottom"], current_props["bottom"])
            has_vertical_overlap = overlap_bottom > overlap_top

            if centers_are_close or has_vertical_overlap:
                current_line.append((current_props["left"], text))
            else:
                # The previous line is complete: flush it.
                output_lines.append(cls._join_line(current_line))

                # A large vertical gap is rendered as a paragraph break.
                vertical_gap = current_props["top"] - prev_props["bottom"]
                if vertical_gap > prev_props["height"] * cls.PARAGRAPH_GAP_RATIO:
                    output_lines.append("")

                current_line = [(current_props["left"], text)]

            prev_props = current_props

        # Flush the trailing line.
        output_lines.append(cls._join_line(current_line))

        return "\n".join(output_lines)

    @staticmethod
    def _join_line(parts) -> str:
        """Join one line's ``(left, text)`` parts in left-to-right order."""
        # The global sort is by exact top edge, so boxes on the same visual
        # line that are skewed by a pixel may arrive right-to-left. Re-order
        # by left edge (stable, so ties keep their arrival order).
        ordered = sorted(parts, key=lambda part: part[0])
        return " ".join(text for _, text in ordered)

    @staticmethod
    def get_box_properties(box: np.ndarray) -> dict:
        """
        Compute the geometric properties of a box from its corner points.

        Args:
            box: Array-like of ``[x, y]`` points (typically shape (4, 2)).
                Nested lists are accepted and converted with ``np.asarray``.

        Returns:
            dict: ``top``, ``bottom``, ``left``, ``height`` and ``center_y`` of
            the axis-aligned extent of the points.
        """
        # Accept plain nested lists as well as arrays.
        points = np.asarray(box)
        ys = points[:, 1]
        xs = points[:, 0]

        top = np.min(ys)
        bottom = np.max(ys)
        left = np.min(xs)
        height = bottom - top

        return {
            "top": top,
            "bottom": bottom,
            "left": left,
            "height": height,
            "center_y": top + height / 2,
        }
0 commit comments