"""Turn a plain-text King James Bible into a tab-separated verse table.

Stages, each reading the file the previous one wrote:

    kingjames.txt    --clean()-->      cleanedBible.txt
    cleanedBible.txt --parseJames()--> parsed.txt
    parsed.txt       --toTSV()-->      kingjames.tsv
"""

# NOTE(review): this script was reconstructed from a patch series whose line
# breaks had been lost, so block nesting was inferred from statement order.
# Confirm the nesting in clean() and getWords() against the original file.

import re

# Words collected by getWords(), in reading order, duplicates included.
allWords = []
# Chapter:verse labels collected by getWords(), in reading order.
verses = []

_ASCII_DIGITS = "0123456789"
# A verse label is "<chapter>:<verse>", e.g. "1:1".
_VERSE_LABEL = re.compile(r"^[0-9]+:[0-9]+$")
_END_MARKER = "End of the Project Gutenberg EBook of The King James Bible"
# clean() only looks for embedded labels from this index on; a label that
# opens the line sits before it and is therefore not split off again.
_EMBEDDED_SCAN_START = 6


def _embedded_label_index(line):
    """Return the index where a verse label starts inside *line*, or -1.

    A label start is a digit followed by ":" or another ASCII digit. Only
    the first hit at or after _EMBEDDED_SCAN_START is reported.
    """
    for i in range(_EMBEDDED_SCAN_START, len(line) - 1):
        if line[i].isdigit() and line[i + 1] in ":" + _ASCII_DIGITS:
            return i
    return -1


def clean():
    """Rewrite kingjames.txt as cleanedBible.txt with one verse per line.

    Wrapped source lines are joined with a single space. A new output line
    is started before every source line that begins with a digit and at
    every empty source line. When a verse label appears in the middle of a
    source line, the line is split there and a blank line is inserted
    before the label. At most one such split is made per source line.
    """
    with open("kingjames.txt", "r") as bible, \
            open("cleanedBible.txt", "w") as cleaned:
        for raw in bible:
            first = raw[:1]
            # Empty lines must end the current output line too: that is what
            # keeps headings apart from the text that follows them.
            if first and first in "\n" + _ASCII_DIGITS:
                cleaned.write("\n")
            line = raw.strip("\n")
            split_at = _embedded_label_index(line)
            if split_at >= 0:
                cleaned.write(line[:split_at] + "\n\n" + line[split_at:] + " ")
            elif len(line) > 1:
                cleaned.write(line + " ")


def parseJames():
    """Copy cleanedBible.txt to parsed.txt, feeding every line to getWords()."""
    with open("cleanedBible.txt", "r") as bible, \
            open("parsed.txt", "w") as parsed:
        for line in bible:
            getWords(line)
            parsed.write(line)


def getWords(line):
    """Record the verse label and the words of one verse line.

    If the first whitespace-separated token of *line* is a chapter:verse
    label, the label is appended to ``verses`` and every following token,
    with leading/trailing ``,:;.?!`` removed, is appended to ``allWords``.
    Any other line (including an empty one) is ignored.
    """
    words = line.split()
    if not words or not _VERSE_LABEL.match(words[0]):
        return
    verses.append(words[0])
    # NOTE(review): assumes words are only wanted from verse lines; the
    # original nesting of this step could not be recovered - confirm.
    # Apostrophes are deliberately left in place (handling still undecided).
    allWords.extend(word.strip(",:;.?!") for word in words[1:])


def uniqueWords():
    """Print how many distinct entries ``allWords`` holds."""
    print(len(set(allWords)))


def numWords():
    """Print how many entries ``allWords`` holds."""
    print(len(allWords))


def toTSV():
    """Write kingjames.tsv (book, chapter, verse, text) from parsed.txt.

    Reading stops at the Project Gutenberg end-of-book marker. A line that
    starts with "The " followed by a non-lowercase character is taken as a
    book title and applies to the verses after it; titles that do not start
    with "The " are not recognised. Only lines whose first token is a
    chapter:verse label produce a row, so header and licence lines that
    merely contain a colon are skipped instead of raising IndexError.
    """
    with open("parsed.txt", "r") as bible, open("kingjames.tsv", "w") as tsv:
        # Header text is kept exactly as before (including the spaces after
        # the tabs) so existing consumers of the file are not affected.
        tsv.write("Book\t Chapter\t Verse Number\t Verse Text\n")
        book = ""
        for line in bible:
            if _END_MARKER in line:
                break
            # Slicing (not indexing) keeps a bare "The " line from raising.
            if line.startswith("The ") and not line[4:5].islower():
                book = line.strip()
                continue
            parts = line.split(None, 1)
            if not parts or not _VERSE_LABEL.match(parts[0]):
                continue
            chapter, verse = parts[0].split(":")
            text = parts[1] if len(parts) > 1 else ""
            # Every row must be newline-terminated, even for a label-only
            # line or a final line that lacks its own newline.
            if not text.endswith("\n"):
                text += "\n"
            tsv.write("\t".join((book, chapter, verse, text)))


def main():
    """Run the three stages in order."""
    clean()
    parseJames()
    toTSV()


if __name__ == "__main__":
    main()