-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathffts_ps_cleanup.py
62 lines (55 loc) · 1.6 KB
/
ffts_ps_cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Document parsing script
# For cleanup of ffts_ps.html
#
# Reads filein, rewrites every occurrence of start_tag as end_tag, and saves
# the result to fileout. With the values below this strips the space that
# sits directly in front of a closing </p>.
#
# The work runs only when this file is executed as a script; importing it
# just defines the settings and helpers below.

# Input and output name the same file, so the document is rewritten in place.
filein = 'ffts_ps.html'
#filein = "test.txt"
fileout = 'ffts_ps.html'
#fileout = 'testproduct.txt'

# Text to search for and the text that takes its place. The commented-out
# values are alternates kept from earlier cleanup passes.
#start_tag = '<span class="GramE">'
#start_tag = '<span style="mso-spacerun:yes">'
start_tag = ' </p>'
#end_tag = '</span>'
end_tag = '</p>'


def replace_tag(data, old_tag, new_tag):
    """Return data with old_tag replaced by new_tag until none remain.

    The leftmost occurrence is replaced and the search then restarts from
    the beginning of the text, so an occurrence that only appears as a
    result of an earlier replacement (e.g. the second of two spaces in
    front of '</p>') is replaced as well.

    Args:
        data: Text to clean.
        old_tag: Substring to search for; must be non-empty.
        new_tag: Substring written in place of each occurrence.

    Returns:
        The cleaned text; data itself when old_tag never occurs.

    Raises:
        ValueError: If old_tag is empty or is contained in new_tag. In
            both cases every replacement leaves a fresh match behind, so
            the loop below could never finish.
    """
    if not old_tag:
        raise ValueError("old_tag must not be empty")
    if old_tag in new_tag:
        raise ValueError("new_tag must not contain old_tag")

    while True:
        # Always search from the start: a replacement may create a new
        # match that begins before the position just edited.
        start_pos = data.find(old_tag)
        # Loop until no occurrence is left.
        if start_pos == -1:
            break
        # Remove just the matched text and put new_tag in its place.
        cut_end = start_pos + len(old_tag)
        data = data[:start_pos] + new_tag + data[cut_end:]
    return data


def main():
    """Clean the file named by filein and save the result to fileout."""
    # Read everything before opening the output: filein and fileout may be
    # the same path, and opening with 'w' truncates the file.
    with open(filein, 'r') as f:
        data = f.read()

    data = replace_tag(data, start_tag, end_tag)

    # Save the file out.
    with open(fileout, 'w') as f:
        f.write(data)


if __name__ == '__main__':
    main()