"""
PDF to JSON Data Conversion Script
This script processes PDF files, extracts structured data from them, and saves the data in a JSON format.
The PDF files should be downloaded in the 'pdf' folder from any of Turkish 'Probel' hospital system
Requirements:
- chardet: Character encoding detection library
- pdf2htmlEX: PDF to HTML conversion tool in 'exe' folder
Usage:
1. Place your PDF files in the 'pdf' folder.
2. Run the script to process the PDF files and generate JSON data.
The script performs the following steps:
1. Converts PDF files to HTML format using pdf2htmlEX.
2. Extracts data from the generated HTML files.
3. Cleans and processes the extracted data.
4. Saves the cleaned data as a JSON file named 'data.json'.
Author: Eray Ozturk | erayozturk1@gmail.com
URL: github.com/diffstorm
Date: 01/10/2023
"""
import subprocess
import chardet
import re
import os
import html
import json
import sys
import shutil
from datetime import datetime

# Directory containing the pdf2htmlEX executable
exe_dir = "exe"
# Directory containing the input PDF files
pdf_dir = "pdf"
# Temporary directory where pdf2htmlEX writes its output
out_dir = "out"
# JSON database file where all the parsed data is recorded
data_file = "data.json"
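# Each entry written to data_file has the keys produced by
# extract_lines_with_specified_format() below. The values shown here are
# hypothetical placeholders, only to illustrate the shape of one record:
#   {
#       "date": "01/10/2023 14:30",
#       "oldvalue": "7.2",
#       "range": "4.5-11.0",
#       "unit": "10^3/uL",
#       "value": "7.5",
#       "name": "WBC"
#   }
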
def check_directory(directory_name):
    """
    Check if the specified directory exists next to this script.

    Args:
        directory_name (str): The name of the directory to check.

    Returns:
        bool: True if the directory exists, False otherwise.
    """
    current_directory = os.path.dirname(os.path.abspath(__file__))
    target_directory = os.path.join(current_directory, directory_name)
    if not os.path.exists(target_directory) or not os.path.isdir(target_directory):
        return False
    return True

def remove_directory(directory_name):
    """
    Remove the specified directory if it exists.

    Args:
        directory_name (str): The name of the directory to remove.
    """
    current_directory = os.path.dirname(os.path.abspath(__file__))
    target_directory = os.path.join(current_directory, directory_name)
    if os.path.exists(target_directory) and os.path.isdir(target_directory):
        shutil.rmtree(target_directory)
        print(f"'{directory_name}' directory removed.")

def run_process(command):
    """
    Run a shell command and capture its output.

    Args:
        command (str): The shell command to run.
    """
    print(command)
    current_directory = os.path.dirname(os.path.abspath(__file__))
    try:
        result = subprocess.run(os.path.join(current_directory, command), shell=True, capture_output=True, text=True, check=True)
        print("Output:", result.stdout)
        print("Errors:", result.stderr)
    except subprocess.CalledProcessError as e:
        print("Error occurred with return code:", e.returncode)
        print("Error message:", e.stderr)

def save_as_json(data, file_path):
    """
    Save a list of data entries as a JSON file, merging with any existing data.

    Args:
        data (list): List of data entries (dictionaries).
        file_path (str): The path to the JSON file where data will be saved.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)
    except (json.JSONDecodeError, FileNotFoundError):
        existing_data = []
    # Create a dictionary to hold data with unique identifiers as keys
    data_dict = {(entry["date"], entry["name"]): entry for entry in existing_data}
    # Update the dictionary with new data
    for entry in data:
        identifier = (entry["date"], entry["name"])
        data_dict[identifier] = entry
    # Convert the dictionary back to a list of data entries
    existing_data = list(data_dict.values())
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(existing_data, file, indent=4, ensure_ascii=False)

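# save_as_json() merging behaviour, with hypothetical entries (illustrative):
# if data.json already holds an entry keyed by ("01/10/2023 14:30", "WBC"),
# saving a new entry with the same date and name replaces it instead of
# appending a duplicate; entries with a new (date, name) pair are added.
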
def remove_attribute(text, attribute_name, quote_character='"'):
    """
    Remove a specified HTML attribute from the text.

    Args:
        text (str): The input text containing HTML attributes.
        attribute_name (str): The name of the attribute to remove.
        quote_character (str): The character used for attribute quoting.

    Returns:
        str: The text with the specified attribute removed.
    """
    pattern = r'{}={}[^{}]*{}'.format(attribute_name, quote_character, quote_character, quote_character)
    return re.sub(pattern, '', text)

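# remove_attribute() example (illustrative input):
#   remove_attribute('<div class="t m0">x</div>', 'class') -> '<div >x</div>'
# The leftover space before '>' is collapsed later in process_html_data().
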
def insert_newline_before_datetime(text):
    """
    Insert a newline character before datetime patterns in the text.

    Args:
        text (str): The input text.

    Returns:
        str: The text with newlines inserted before datetime patterns.
    """
    pattern = r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}'
    updated_text = re.sub(pattern, r'\n\g<0>', text)
    return updated_text

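# insert_newline_before_datetime() example (illustrative input):
#   'Hemogram 01/10/2023 14:30|7.2' -> 'Hemogram \n01/10/2023 14:30|7.2'
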
def extract_datetime_lines_with_text(text):
    """
    Extract lines with datetime patterns from the text.

    Args:
        text (str): The input text.

    Returns:
        str: Extracted lines with datetime patterns.
    """
    pattern = r'^(\d{2}/\d{2}/\d{4} \d{2}:\d{2}.*)'
    datetime_lines = re.findall(pattern, text, re.MULTILINE)
    return "\n".join(datetime_lines)

def fix_floating_point_numbers(text):
    """
    Replace comma separators with dots in floating-point numbers.

    Args:
        text (str): The input text.

    Returns:
        str: The text with fixed floating-point numbers.
    """
    def replace_comma_with_dot(match):
        number = match.group(0)
        return number.replace(',', '.')
    pattern = r'\d+(?:,\d+)+|\d+,\d+'
    return re.sub(pattern, replace_comma_with_dot, text)

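# fix_floating_point_numbers() example (illustrative input):
#   '4,5 - 11,0' -> '4.5 - 11.0'
# Only digits around commas are touched; surrounding text is left unchanged.
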
def extract_lines_with_specified_format(text):
    """
    Extract lines with a specified format from the text and record them in the JSON database.

    Args:
        text (str): The input text.

    Returns:
        str: Extracted lines with the specified format.
    """
    pattern = r'(?m)^(\d{2}/\d{2}/\d{4} \d{2}:\d{2})\|(\d+(?:[.,]\d+)?)\|([^|]+)\|([^|]+)\|(\d+(?:[.,]\d+)?)\|([^|]*)\|(.*)$'
    datetime_lines = re.findall(pattern, text)
    extracted_data = []
    for date, oldvalue, value_range, unit, value, name, rest in datetime_lines:
        data_dict = {
            'date': date,
            'oldvalue': oldvalue,
            'range': value_range,
            'unit': unit,
            'value': value,
            'name': name
        }
        extracted_data.append(data_dict)
    save_as_json(extracted_data, data_file)
    return "\n".join(f"{date}|{oldvalue}|{value_range}|{unit}|{value}|{name}" for date, oldvalue, value_range, unit, value, name, rest in datetime_lines)

def process_html_data(read_path, write_path):
    """
    Process data from an HTML file and save it as a text file.

    Args:
        read_path (str): The path to the input HTML file.
        write_path (str): The path to the output text file.
    """
    # Detect encoding
    with open(read_path, 'rb') as file:
        raw_data = file.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']
    # Read
    with open(read_path, 'r', encoding=encoding) as file:
        content = file.read()
    # Modify
    content = remove_attribute(content, "class")
    content = remove_attribute(content, "id")
    content = remove_attribute(content, "data-data", "'")
    content = remove_attribute(content, "data-page-no")
    while " >" in content:
        content = content.replace(" >", ">")
    content = content.replace("</div></div><div><div>", "|")
    content = content.replace("</div><div>", " ")
    content = content.replace("<div>", "")
    content = content.replace("</div>", "")
    content = content.replace("<span>", "")
    content = content.replace("</span>", "")
    content = html.unescape(content)  # Decode HTML entities
    #content = content.replace(">", "").replace("<", "")  # Remove any remaining < and > characters
    content = content.replace("*", "")
    content = content.replace("| ", "|")
    content = content.replace(" |", "|")
    content = insert_newline_before_datetime(content)
    content = extract_datetime_lines_with_text(content)
    content = fix_floating_point_numbers(content)
    content = extract_lines_with_specified_format(content)
    # Write
    with open(write_path, 'w', encoding=encoding) as file:
        file.write(content)

if __name__ == "__main__":
    sys.stdout.reconfigure(encoding='utf-8')
    # Check if the required directories exist
    if not check_directory(exe_dir):
        print(f"ERROR: '{exe_dir}' folder does not exist.")
        sys.exit(1)
    if not check_directory(pdf_dir):
        print(f"ERROR: '{pdf_dir}' folder does not exist.")
        sys.exit(1)
    # Remove the out directory if it exists
    remove_directory(out_dir)
    pdf_files = [file for file in os.listdir(pdf_dir) if file.lower().endswith('.pdf')]
    if not pdf_files:
        print(f"No PDF files found in the {pdf_dir} directory")
        sys.exit(1)
    print("PDF files in the directory: {}".format(len(pdf_files)))
    pdf_file_count = 0
    for pdf_file in pdf_files:
        pdf_file_count += 1
        print(f"----- processing {pdf_file} {pdf_file_count}/{len(pdf_files)}")
        # Run pdf2htmlEX and capture its output
        command = "{}/pdf2htmlEX.exe --embed cfijo --dest-dir \"{}\" --optimize-text 1 --process-nontext 0 \"{}/{}\"".format(exe_dir, out_dir, pdf_dir, pdf_file)
        run_process(command)
        fname = os.path.splitext(os.path.basename(pdf_file))[0]
        read_path = "{}/{}.html".format(out_dir, fname)
        write_path = "{}/{}.txt".format(out_dir, fname)
        process_html_data(read_path, write_path)
        # Remove the out directory before processing the next file
        remove_directory(out_dir)
        print(f"----- {pdf_file} done {pdf_file_count}/{len(pdf_files)}", flush=True)
    print(f"All done {pdf_file_count}/{len(pdf_files)}")