# json2md.py - convert the OmniDocBench demo JSON annotations into per-page Markdown files.
from PIL import Image
from tqdm import tqdm
import json
import os
from collections import defaultdict
import langid
import re
def poly2bbox(poly):
    """Turn a polygon coordinate list into a ``[left, upper, right, lower]`` box.

    Only four entries of ``poly`` are read: indices 0 and 2 supply the two
    horizontal extremes and indices 1 and 5 the two vertical extremes. Each
    pair is ordered so the smaller value comes first in the result.

    NOTE(review): presumably ``poly`` is a flat ``[x1, y1, x2, y2, x3, y3, x4, y4]``
    list of an axis-aligned quadrilateral - confirm against the annotation schema.
    """
    x_first, y_first, x_second, y_second = poly[0], poly[1], poly[2], poly[5]
    return [
        min(x_first, x_second),
        min(y_first, y_second),
        max(x_first, x_second),
        max(y_first, y_second),
    ]
# Script configuration and input loading. All paths are relative, so they are
# resolved against the current working directory when the script runs.
table_format = 'html' # table format in markdown. optional: latex
# Directory that receives one .md file per page; cropped figures go into its 'imgs' subfolder.
save_path = r'../demo_data/omnidocbench_demo/mds'
save_path_imgs = os.path.join(save_path, 'imgs')
# Create both output directories up front; exist_ok=True keeps re-runs from failing.
os.makedirs(save_path, exist_ok=True)
os.makedirs(save_path_imgs, exist_ok=True)
# Load the annotation file; the conversion loop below iterates over `samples`.
with open(r'../demo_data/omnidocbench_demo/OmniDocBench_demo.json', 'r', encoding='utf-8') as f:
    samples = json.load(f)
def text_norm(text):
    """Normalize annotation text before it is written to markdown.

    Long runs of repeated characters are first collapsed by
    ``replace_repeated_chars``; afterwards every two-character sequence
    ``/t`` becomes a tab and every ``/n`` becomes a newline.
    """
    collapsed = replace_repeated_chars(text)
    # NOTE(review): the markers are forward-slash sequences ('/t', '/n'), not
    # backslash escapes - confirm this is what the annotation data contains.
    with_tabs = collapsed.replace('/t', '\t')
    return with_tabs.replace('/n', '\n')
# Normalize all runs of consecutive characters
def replace_repeated_chars(input_str):
    """Collapse long runs of a repeated character to exactly four of it.

    Three rules are applied in order:

    1. four or more consecutive underscores become four underscores;
    2. four or more consecutive spaces become four spaces;
    3. any other non-alphanumeric character repeated eleven or more times
       in a row becomes four copies of that character.

    Letters and digits are never collapsed.

    Args:
        input_str: Text to normalize.

    Returns:
        The normalized text.
    """
    input_str = re.sub(r'_{4,}', '____', input_str)  # cap underscore runs (e.g. fill-in blanks) at 4
    # Cap space runs at 4, matching the underscore rule; collapsing to a single
    # space would not match the documented "4 spaces" contract.
    input_str = re.sub(r' {4,}', '    ', input_str)
    # `\1{10,}` needs ten *further* repeats, so the rule fires from 11 occurrences up.
    return re.sub(r'([^a-zA-Z0-9])\1{10,}', r'\1\1\1\1', input_str)
def remove_unencodable_characters(s, encoding):
    """Return ``s`` without the characters that ``encoding`` cannot represent.

    The string is encoded with ``errors='ignore'``, which silently drops
    anything the codec cannot encode, and then decoded back with the same
    codec.
    """
    encoded = s.encode(encoding, errors='ignore')
    return encoded.decode(encoding)
def _group_truncated_ids(relations):
    """Group the anno_ids linked by 'truncated' relations into disjoint sets.

    Relations that share an id are chained, so A->B plus B->C yields the single
    group {A, B, C}. Every id ends up in exactly one group and is listed once,
    so no block is merged (or its text emitted) twice.
    """
    groups = []
    for relation in relations:
        if relation["relation_type"] != 'truncated':
            continue
        linked = {relation["source_anno_id"], relation["target_anno_id"]}
        untouched = []
        for group in groups:
            if group & linked:
                # This relation extends (or bridges) an existing chain: absorb it.
                linked |= group
            else:
                untouched.append(group)
        untouched.append(linked)
        groups = untouched
    return groups


def _join_truncated_text(blocks):
    """Concatenate the text of truncated blocks, given in reading order.

    A fragment that langid classifies as English is joined to the preceding
    text with a space, unless the preceding text ends with '-': then the
    hyphen is dropped and the fragment is attached directly, re-joining a
    word that was split across blocks. Other fragments are appended with no
    separator. Blocks with missing or empty text are skipped.
    """
    text = ""
    for block in blocks:
        line_content = block.get('text') or ""
        if not line_content:
            continue
        if not text:
            text = line_content
            continue
        # Classify once per fragment; the call is comparatively expensive.
        if langid.classify(line_content)[0] != 'en':
            text += line_content
        elif text.endswith('-'):
            text = text[:-1] + line_content
        else:
            text += f" {line_content}"
    return text


def _write_element(f, anno, img, img_stem):
    """Write one annotation to the open markdown file ``f``.

    Figures are cropped out of the page image ``img``, saved as a JPEG under
    ``save_path_imgs`` and referenced with an image link. After that, tables
    write the field selected by ``table_format``; any other element writes the
    first non-empty field among 'text', 'html' and 'latex'. Titles are emitted
    as a level-1 heading. Every written piece is followed by a blank line.
    """
    sep = '\n\n'
    category = anno["category_type"]
    if category == 'figure':
        crop_name = f"{img_stem}_{anno['anno_id']}.jpg"
        crop = img.crop(poly2bbox(anno['poly'])).convert('RGB')
        crop.save(os.path.join(save_path_imgs, crop_name))
        f.write(f'![](./imgs/{crop_name})')
        f.write(sep)
    # Not an elif: a figure that also carries text/html/latex still has it written.
    if category == 'table':
        f.write(anno[table_format])
        f.write(sep)
    elif anno.get('text'):
        if category == 'title':
            # Strip any leading/trailing '#' so the heading marker is not doubled.
            f.write('# ' + text_norm(anno['text'].strip('#').strip()))
        else:
            f.write(text_norm(anno['text']))
        f.write(sep)
    elif anno.get('html'):
        f.write(anno['html'])
        f.write(sep)
    elif anno.get('latex'):
        f.write(anno['latex'])
        f.write(sep)


# Convert every page sample into one markdown file (plus cropped figure images).
for sample in samples:
    # Keep only elements that take part in the reading order.
    # NOTE(review): the truthiness test also drops elements whose order is 0 -
    # confirm that reading orders in the data never use 0.
    annos = [x for x in sample['layout_dets'] if x.get('order')]

    # Text blocks split by layout ("truncated" relations) are merged back together.
    relations = (sample.get("extra") or {}).get("relation") or []
    truncated_groups = _group_truncated_ids(relations)
    truncated_ids = set().union(*truncated_groups)

    merged_annos = [item for item in annos if item['anno_id'] not in truncated_ids]
    truncated_blocks = [item for item in annos if item['anno_id'] in truncated_ids]

    for group in truncated_groups:
        # Ids whose annotation was filtered out above are simply absent here.
        members = [item for item in truncated_blocks if item['anno_id'] in group]
        if not members:
            continue
        sorted_block = sorted(members, key=lambda x: x['order'])
        merged_annos.append({
            "category_type": sorted_block[0]["category_type"],  # use the info of the first block
            "order": sorted_block[0]["order"],
            "anno_id": sorted_block[0]["anno_id"],
            "text": _join_truncated_text(sorted_block),
            "merge_list": sorted_block,
        })
        print('Merged truncated')

    annos = sorted(merged_annos, key=lambda x: x['order'])

    img_name = os.path.basename(sample['page_info']['image_path'])
    # splitext instead of [:-4] so extensions of any length (e.g. '.jpeg') work.
    img_stem = os.path.splitext(img_name)[0]
    img_path = os.path.join(r'../demo_data/omnidocbench_demo/images', img_name)
    md_path = os.path.join(save_path, img_stem + '.md')

    # Both handles are closed when the block exits, even if an element fails.
    with Image.open(img_path) as img, open(md_path, 'w', encoding='utf-8') as f:
        for anno in annos:
            _write_element(f, anno, img, img_stem)