-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text_cleaner.py
37 lines (35 loc) · 1.03 KB
/
Text_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
# This script fixes the problems that were occurring before, so we now have better cleaned and parsed data
# Whitespace-separated list of the characters stripped from every article.
replace = ", ‚ ' ; : / ™ [ ] { } ( ) * - + & ! @ # $ % ^ _ = ` ‘ “ ~"

# Translation table built once at import time: every character listed in
# `replace` is deleted and each newline becomes a single space.  One
# str.translate pass replaces the former chain of str.replace calls.
_CLEAN_TABLE = str.maketrans("\n", " ", "".join(replace.split()))


def clean_text(text):
    """Return *text* with the unwanted punctuation removed.

    Every character listed in the module-level ``replace`` string is
    dropped and every newline is turned into a space.  All other
    characters (including ``.``, ``?`` and carriage returns) are kept.

    >>> clean_text("well-known (test)\\nnext")
    'wellknown test next'
    """
    return text.translate(_CLEAN_TABLE)


def clean_file(name, src_dir="./Newpaper_Cleaned", dst_dir="./Better_cleaned"):
    """Clean ``<src_dir>/<name>.txt`` into ``<dst_dir>/<name>.txt``.

    Returns ``False`` without touching anything when the destination file
    already exists, ``True`` after a file has been written.

    Raises:
        OSError: the source cannot be read or the destination cannot be
            written.
        UnicodeDecodeError: the source file is not valid UTF-8.
    """
    dst_path = os.path.join(dst_dir, name + ".txt")
    if os.path.isfile(dst_path):
        return False

    # Binary mode on both sides so that no newline translation happens and
    # the bytes written are exactly the UTF-8 encoding of the cleaned text.
    with open(os.path.join(src_dir, name + ".txt"), "rb") as src:
        text = src.read().decode("utf-8")

    with open(dst_path, "wb") as dst:
        dst.write(clean_text(text).encode("utf-8"))
    return True


def main(max_files=80, links_path="./To_download_links.txt"):
    """Clean the articles belonging to the first *max_files* links.

    *links_path* is a whitespace-separated list of URLs; the last path
    component of each URL is used as the article's file name.  Files that
    cannot be read, decoded or written are reported and skipped so that one
    bad article does not stop the whole run.
    """
    with open(links_path, "r") as links_file:
        links = links_file.read().split()

    # Slice instead of counting: stop as soon as the limit is reached.
    for link in links[:max_files]:
        name = link.split("/")[-1]
        print(name)
        try:
            if not clean_file(name):
                print("The file has already been parsed")
                print()
        except (OSError, UnicodeDecodeError) as err:
            # Best-effort processing, but never silently: say what was skipped.
            print("Skipping {}: {}".format(name, err))


if __name__ == "__main__":
    # change the value of this variable with the number of files that have been parsed
    max1 = 80
    main(max1)