-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
106 lines (78 loc) · 2.74 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
from bs4 import BeautifulSoup as bs
import sys
import re
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool
import random
# Number of CPUs reported by the OS; used to size the worker pool.
NUM_CORES = multiprocessing.cpu_count()

# Directory layout, relative to the current working directory.
data_in = os.path.join("..", "data", "sgml")  # SGML input files
data_out = os.path.join("..", "data", "txt")  # plain-text output files
data_ref = os.path.join("..", "ref")  # reference text (king_james.txt)
passim_output_path = os.path.join("..", "passim_in.json")  # passim input file

# Make sure the output directory exists. Unlike an exists()/mkdir() pair,
# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail if the directory appears between the check and the creation
# (this module may be imported by several processes at once).
os.makedirs(data_out, exist_ok=True)
def SGML_to_TXT(path_in):
    '''Extract the text of one SGML file into a TXT file.

    The output path is derived from ``path_in`` by replacing the ``data_in``
    prefix with ``data_out`` and ``.sgm`` with ``.txt``. If that output file
    already exists, the input is skipped so it is not converted again.

    Args:
        path_in: path to an SGML file located under ``data_in``.

    Returns:
        Always None; the result is the TXT file written to disk.
    '''
    path_out = path_in.replace(data_in, data_out).replace(".sgm", ".txt")
    if os.path.exists(path_out):
        return None

    with open(path_in) as f:
        soup = bs(f.read(), "lxml-xml")

    # Look up the element *before* creating the output file. Otherwise a
    # document without a TEXT element would crash on None.get_text() and leave
    # an empty TXT file behind, which later runs would treat as "already done".
    tag = soup.find("TEXT")
    if tag is None:
        sys.stderr.write("No TEXT element found in " + path_in + ", skipping\n")
        return None

    with open(path_out, "w") as f:
        f.write(tag.get_text())
    return None
def TXT_to_passimJSON(path_in, final_json):
    '''Append the content of one TXT file to a passim input file.

    The record is written as a single JSON line with three keys:
    ``id`` (file name without ``.txt``), ``series`` (always ``"not_bible"``)
    and ``text`` (file content with whitespace runs collapsed to single
    spaces and double quotes replaced by single quotes).

    Args:
        path_in: path to the TXT file to read.
        final_json: path to the JSON-lines file to append to; it is created
            if it does not exist yet.
    '''
    # Imported locally so the function does not depend on a module-level
    # name called ``json``.
    import json

    # Read the input first, so a missing or unreadable TXT file does not
    # leave an open handle on (or needlessly create) the output file.
    with open(path_in) as f_in:
        raw = f_in.read()

    doc_id = os.path.basename(path_in).replace(".txt", "")
    # split() with no argument already treats newlines as whitespace.
    # The quote replacement is kept so the emitted text stays the same as
    # before for documents that used to serialise correctly.
    text = " ".join(raw.replace('"', "'").split())
    record = {"id": doc_id, "series": "not_bible", "text": text}

    # json.dumps escapes backslashes and control characters, which the
    # previous hand-built string did not. ensure_ascii=False keeps non-ASCII
    # characters as they are instead of turning them into \uXXXX escapes.
    with open(final_json, "a") as f_out:
        f_out.write(json.dumps(record, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    # Script entry point:
    #   1. convert every file in data_in from SGML to TXT (in parallel),
    #   2. write up to N randomly chosen TXT files to the passim input file,
    #   3. append the King James reference text as the "bible" series.
    # Optional CLI argument: N, the maximum number of TXT files to include
    # (defaults to all of them).
    import json  # local import; used to serialise the reference record

    files = [os.path.join(data_in, name) for name in os.listdir(data_in)]
    print("Transforming SGML to TXT")
    # Leave one core free, but never request fewer than one worker:
    # Pool(0) raises ValueError on a single-core machine.
    with Pool(max(1, NUM_CORES - 1)) as pool:
        # list() drains the lazy imap iterator so tqdm can show progress.
        list(tqdm(pool.imap(SGML_to_TXT, files), total=len(files)))

    # The JSON step is a second, sequential pass: every record is appended
    # to the same output file, and the original author suspected that
    # parallel writers to that one file would fail.
    files = [os.path.join(data_out, name) for name in os.listdir(data_out)]

    if len(sys.argv) > 1:
        try:
            limit_files_in_json = int(sys.argv[1])
        except ValueError:
            # sys.exit() accepts a single argument, so the offending value
            # has to be formatted into the message.
            sys.exit("sys.argv[1] must be an int, you put {}".format(sys.argv[1]))
    else:
        limit_files_in_json = len(files)

    # Start from a clean output file; it is fine if there is none yet.
    try:
        os.remove(passim_output_path)
    except FileNotFoundError:
        pass

    # Shuffle so that a smaller subset is not always the first N files.
    random.shuffle(files)
    print("Formatting TXT into passim format")
    for file in tqdm(files[:limit_files_in_json]):
        TXT_to_passimJSON(file, passim_output_path)

    # Append the reference text as one record in its own series.
    with open(os.path.join(data_ref, "king_james.txt")) as f:
        text = " ".join(f.read().split())
    record = {"id": "king_james", "series": "bible", "text": text}
    # json.dumps escapes any quotes or backslashes in the reference text,
    # which a hand-concatenated string would pass through as invalid JSON.
    with open(passim_output_path, "a") as f_out:
        f_out.write(json.dumps(record, ensure_ascii=False) + "\n")
    print("Passim input file written to disk.")