# mdc.py
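"""Classify a markdown document into structural elements (headings, lists,
tables, code, links, and so on), aggregate plain text into paragraphs, and
return the result as a pandas DataFrame, with optional LLM-ready chunking
of paragraph content via LangChain."""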
import re

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter


def aggregate_and_chunk_text(content):
    """Use the LangChain splitter to chunk content for LLMs."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_text(content)
def classify_and_aggregate(content):
    """Classify markdown content and aggregate normal text into paragraphs."""
    # Define regex patterns. The longer page-break alternative comes first so
    # '----' is not consumed as '---' plus a stray '-'.
    page_break_pattern = r'(?:----|---)'
    heading_pattern = r'(#{1,6})\s*(.*)'  # markdown headings go up to level 6
    bold_italic_pattern = r'(\*\*\*|___)(.*?)\1'
    bold_pattern = r'(\*\*|__)(.*?)\1'
    italic_pattern = r'(\*|_)(.*?)\1'
    inline_code_pattern = r'`([^`]*)`'
    code_block_pattern = r'```(.*?)```'
    ordered_list_pattern = r'^\d+\.\s+(.+)'
    unordered_list_pattern = r'^[-\*\+]\s+(.+)'
    table_pattern = r'^\|(.+)\|'
    link_pattern = r'(:[a-zA-Z0-9_+-]+:)' if False else r'\[(.*?)\]\((.*?)\)'
    emoji_pattern = r'(:[a-zA-Z0-9_+-]+:)'
    highlight_pattern = r'(==)(.*?)(==)'
    subscript_pattern = r'~(.*?)~'
    superscript_pattern = r'\^(.*?)\^'

    classifications = {
        'Page Number': [],
        'Index': [],
        'Type': [],
        'Content': []
    }
    # Split content by page breaks
    pages = re.split(page_break_pattern, content)
    global_index = 0  # To keep track of the original order across all pages
    for page_number, page_content in enumerate(pages, start=1):
        lines = page_content.split('\n')
        normal_text_accum = []  # Accumulate normal text to aggregate into paragraphs
        for line in lines:
            # If the line is a page break, classify it as "Page Break"
            if re.match(page_break_pattern, line.strip()):
                classifications['Type'].append('Page Break')
                classifications['Content'].append(line.strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                continue
            # Handle headings
            heading_match = re.match(heading_pattern, line)
            if heading_match:
                if normal_text_accum:
                    # Flush accumulated normal text as a paragraph before the heading
                    classifications['Type'].append('Paragraph')
                    classifications['Content'].append(' '.join(normal_text_accum).strip())
                    classifications['Page Number'].append(page_number)
                    classifications['Index'].append(global_index)
                    global_index += 1
                    normal_text_accum = []
                # Add the heading itself
                level = len(heading_match.group(1))
                classifications['Type'].append(f'Heading Level {level}')
                classifications['Content'].append(heading_match.group(2).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                continue
            # Classify other inline types. These checks are independent ifs,
            # so a single line can receive more than one classification.
            match_found = False

            # Bold-italic
            bold_italic_match = re.search(bold_italic_pattern, line)
            if bold_italic_match:
                classifications['Type'].append('Bold Italic Text')
                classifications['Content'].append(bold_italic_match.group(2).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Bold
            bold_match = re.search(bold_pattern, line)
            if bold_match:
                classifications['Type'].append('Bold Text')
                classifications['Content'].append(bold_match.group(2).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Italic
            italic_match = re.search(italic_pattern, line)
            if italic_match:
                classifications['Type'].append('Italic Text')
                classifications['Content'].append(italic_match.group(2).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Inline code
            inline_code_match = re.search(inline_code_pattern, line)
            if inline_code_match:
                classifications['Type'].append('Inline Code')
                classifications['Content'].append(inline_code_match.group(1).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True
            # Code block (line-by-line scanning means only fences that open
            # and close on the same line will match here)
            code_block_match = re.search(code_block_pattern, line, re.DOTALL)
            if code_block_match:
                if normal_text_accum:
                    # Flush aggregated normal text as a paragraph
                    classifications['Type'].append('Paragraph')
                    classifications['Content'].append(' '.join(normal_text_accum).strip())
                    classifications['Page Number'].append(page_number)
                    classifications['Index'].append(global_index)
                    global_index += 1
                    normal_text_accum = []
                classifications['Type'].append('Code Block')
                classifications['Content'].append(code_block_match.group(1).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Ordered and unordered lists
            ordered_list_match = re.match(ordered_list_pattern, line)
            unordered_list_match = re.match(unordered_list_pattern, line)
            if ordered_list_match or unordered_list_match:
                if normal_text_accum:
                    # Flush aggregated normal text as a paragraph
                    classifications['Type'].append('Paragraph')
                    classifications['Content'].append(' '.join(normal_text_accum).strip())
                    classifications['Page Number'].append(page_number)
                    classifications['Index'].append(global_index)
                    global_index += 1
                    normal_text_accum = []
                list_type = 'Ordered List' if ordered_list_match else 'Unordered List'
                list_content = (ordered_list_match or unordered_list_match).group(1).strip()
                classifications['Type'].append(list_type)
                classifications['Content'].append(list_content)
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True
            # Table
            table_match = re.match(table_pattern, line)
            if table_match:
                classifications['Type'].append('Table')
                classifications['Content'].append(table_match.group(1).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Link
            link_match = re.search(link_pattern, line)
            if link_match:
                classifications['Type'].append('Link')
                classifications['Content'].append(f"{link_match.group(1)}|{link_match.group(2)}")
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Emoji
            emoji_match = re.search(emoji_pattern, line)
            if emoji_match:
                classifications['Type'].append('Emoji')
                classifications['Content'].append(emoji_match.group(1))
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Highlight
            highlight_match = re.search(highlight_pattern, line)
            if highlight_match:
                classifications['Type'].append('Highlighted Text')
                classifications['Content'].append(highlight_match.group(2).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Subscript
            subscript_match = re.search(subscript_pattern, line)
            if subscript_match:
                classifications['Type'].append('Subscript')
                classifications['Content'].append(subscript_match.group(1).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True

            # Superscript
            superscript_match = re.search(superscript_pattern, line)
            if superscript_match:
                classifications['Type'].append('Superscript')
                classifications['Content'].append(superscript_match.group(1).strip())
                classifications['Page Number'].append(page_number)
                classifications['Index'].append(global_index)
                global_index += 1
                match_found = True
            # Classify normal text
            if not match_found and line.strip():
                normal_text_accum.append(line.strip())

        # If any normal text remains at the end of the page, add it as a paragraph
        if normal_text_accum:
            classifications['Type'].append('Paragraph')
            classifications['Content'].append(' '.join(normal_text_accum).strip())
            classifications['Page Number'].append(page_number)
            classifications['Index'].append(global_index)
            global_index += 1

    # Create the DataFrame
    markdown_df = pd.DataFrame(classifications)
    # Drop rows with empty content
    markdown_df = markdown_df[markdown_df['Content'].str.strip() != '']
    # Sort by page number and then by index
    markdown_df = markdown_df.sort_values(by=['Page Number', 'Index'])
    # Set 'Index' as the DataFrame index
    markdown_df.set_index('Index', inplace=True)
    return markdown_df
def classify_markdown_file(file_path):
    """Classify markdown content from a file, aggregate normal text, and chunk if necessary."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        classified_df = classify_and_aggregate(content)
        # Optionally, chunk paragraph content for LLMs. iterrows() yields
        # copies, so write back with .at instead of mutating the row.
        for idx, row in classified_df.iterrows():
            if row['Type'] == 'Paragraph':
                classified_df.at[idx, 'Content'] = aggregate_and_chunk_text(row['Content'])
        return classified_df
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
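

if __name__ == '__main__':
    # Minimal usage sketch; 'sample.md' is a hypothetical path, so point it
    # at a real markdown file before running.
    df = classify_markdown_file('sample.md')
    if df is not None:
        print(df.head(10))
        # After chunking, the 'Content' of Paragraph rows holds a list of
        # chunks produced by the LangChain splitter.
        print(df[df['Type'] == 'Paragraph'].head())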