# getSentences.py
# Required folder structure: a folder of plain-text documents
# Running: python getSentences.py <folder of documents> "<outputFile>"
# Reads every .txt file in the given folder and writes a master corpus file
# in which each line is one sentence.
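# Example invocation (folder and file names here are placeholders):
#     python getSentences.py myDocs "corpus.txt"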
import os
import string
import sys

from nltk.tokenize import sent_tokenize
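# NOTE: sent_tokenize depends on the NLTK "punkt" tokenizer models; if they
# are not yet installed, download them once with:
#     python -c "import nltk; nltk.download('punkt')"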
# get the filepath of the folder with documents and the output corpus path
docsFolder = os.path.join(os.getcwd(), sys.argv[1])
corpusFilePath = sys.argv[2]
tokenizedDocs = os.path.join(os.getcwd(), "docInTokens")  # currently unused
def main():
    masterString = ""
    for filename in os.listdir(docsFolder):
        if filename.endswith(".txt"):
            filePath = os.path.join(docsFolder, filename)
            with open(filePath, 'r', errors="ignore") as myfile:
                # collapse each file into one long string without newline
                # characters; the trailing space keeps the last sentence of
                # one file from fusing with the first sentence of the next
                masterString += myfile.read().replace('\n', ' ') + " "
    # split this string into sentences using the NLTK tokenizer
    sentences = sent_tokenize(masterString)
    # write each sentence onto its own line of the corpus file
    with open(corpusFilePath, "w") as masterCorpus:
        for sentence in sentences:
            # delete punctuation: the three-argument maketrans maps every
            # character in string.punctuation to None, deleting it
            sentence = sentence.translate(str.maketrans('', '', string.punctuation))
            # make lowercase
            sentence = sentence.lower()
            masterCorpus.write(sentence + "\n")
if __name__ == "__main__":
    main()
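# Illustrative behavior (hypothetical input): if a document contains
#     "Hello, World! How are you?"
# the corpus file would receive two lines:
#     hello world
#     how are you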