-
Notifications
You must be signed in to change notification settings - Fork 15
/
convert_corpfile.py
71 lines (53 loc) · 2.61 KB
/
convert_corpfile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import os
import csv
import glob
def makeCorpusFile(inDir : str, outFileName : str) -> None:
    """Converts a folder of text documents to a single text file with one document per line

    Every "*.txt" file directly inside inDir becomes one line of the corpus
    file; all runs of whitespace (including line breaks) are collapsed to
    single spaces so each document fits on exactly one line. The final line of
    the corpus file holds the comma-separated base names (without extension)
    of the source files, in the same order as the document lines. If no
    "*.txt" files are found, the corpus file is created empty.

    Keyword arguments:
    inDir -- directory containing text documents (joined onto the current working directory)
    outFileName -- preferred name of the output corpusfile (joined onto the current working directory)
    """
    print("\n writing corpusfile ... \n")
    mastercorpus = os.path.join(os.getcwd(), outFileName)
    inPath = os.path.join(os.getcwd(), inDir)

    # glob yields paths in arbitrary order; sorting makes the corpus file
    # reproducible across runs and platforms.
    # The output file itself is skipped so that a corpus written into inDir
    # is never read back in as one of its own documents.
    outputPath = os.path.abspath(mastercorpus)
    folder = sorted(
        path for path in glob.glob(os.path.join(inPath, "*.txt"))
        if os.path.abspath(path) != outputPath
    )

    filenames = []
    with open(mastercorpus, 'w', encoding = "utf-8") as data:
        for text in folder:
            with open(text, 'r', encoding = "utf-8") as content:
                # split() + join collapses every whitespace run (newlines,
                # carriage returns, tabs) into a single space.
                textline = ' '.join(content.read().split())
            data.write(textline + "\n")
            filenames.append(os.path.splitext(os.path.basename(text))[0])
        # The name index goes on the last line, without a trailing newline.
        if filenames:
            data.write(",".join(filenames))
    print("\n corpusfile written successfully \n")
def readFromCorpusFile(corpusfile : str, outfolder : str) -> None:
    """Reads from a corpus file with one document per line and writes a folder with one text per document

    The last line of the corpus file is read as a comma-separated list of
    document names. The n-th name is paired with the n-th line of the corpus
    file, and that line (including its line break) is written to
    "<outfolder>/<name>.txt". The output folder is created if it does not
    exist. An empty corpus file yields no output files.

    Keyword arguments:
    corpusfile -- name of the input corpusfile (joined onto the current working directory)
    outfolder -- preferred name of the output folder (joined onto the current working directory)
    """
    print("\n reading from corpus file ... \n")
    mastercorpus = os.path.join(os.getcwd(), corpusfile)
    outfolderpath = os.path.join(os.getcwd(), outfolder)

    # Read the corpus once, inside a context manager, so the handle is
    # always closed.
    with open(mastercorpus, 'r', encoding = "utf-8") as f:
        lines = f.readlines()

    # Strip a trailing line break from the name line; otherwise the last
    # document would be written to a file called "<name>\n.txt".
    filenames = lines[-1].rstrip("\n").split(",") if lines else []

    if not os.path.exists(outfolderpath):
        print("\n creating output folder ... \n")
        os.makedirs(outfolderpath, exist_ok = True)

    # zip stops after the last name, so the trailing name line itself is
    # never written out as a document when names and documents match up.
    for name, line in zip(filenames, lines):
        contentPath = os.path.join(outfolderpath, name + ".txt")
        print("\n writing", os.path.basename(contentPath) ,"from corpusfile to ", outfolder, "\n")
        with open(contentPath, 'w', encoding = "utf-8") as file:
            file.write(line)
    print("\n reading successfull \n")
    print("\n files from corpus written successfully \n")
'''
#
#Uncomment one of the following to use or call from console or other script
#
'''
#makeCorpusFile("swcorp_off", "smallcorpusfile.txt")
#readFromCorpusFile("smallcorpusfile.txt", "tesoutfolder")