From 97aa0d37d2578172a5985e6523b446724c2473f5 Mon Sep 17 00:00:00 2001
From: peterbhase <peter.hase@duke.edu>
Date: Mon, 3 Apr 2017 17:42:46 -0400
Subject: [PATCH 1/3] Add files via upload

---
 chapter-parse.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 chapter-parse.py

diff --git a/chapter-parse.py b/chapter-parse.py
new file mode 100644
index 0000000..1d7d9bf
--- /dev/null
+++ b/chapter-parse.py
@@ -0,0 +1,76 @@
+import re
+
+allWords = list()  # every word appended by getWords(), duplicates kept; read by uniqueWords()/numWords()
+verses = list()  # first token of each line when it consists only of digits/colons; appended by getWords()
+
+def clean():
+    # Rewrites kingjames.txt into cleanedBible.txt so that each verse begins on its own line; a reference found mid-line is pushed onto a new line after a blank line.
+    with open("kingjames.txt", 'r') as bible, open('cleanedBible.txt', 'w') as cleaned:
+        for line in bible:
+                lineBreaks = False
+                if re.match("^[0-9]*$",line[0]):  # first character is a digit; NOTE: this pattern also matches a bare "\n", so blank lines take this branch too
+                    cleaned.write("\n")
+                line = line.strip("\n")  # drop the newline; output pieces are re-joined with spaces below
+                for i in range(6, len(line) - 1):  # scan starts at index 6, presumably to skip a reference at the start of the line -- TODO confirm
+                    if (line[i].isdigit() or re.match("^[0-9]*$",line[i]) or line[i] == "6") and(line[i+1] == ":" or re.match("^[0-9]*$",line[i+1])): # digit followed by ":" or another digit. HACK: the == "6" test was hardcoded after one "6" seemed not to satisfy this condition; "6".isdigit() is already True, so it looks redundant
+                        cleaned.write(line[:i] + "\n" + "\n" + line[i:] + " ")  # text before the match, a blank line, then the rest plus a space
+                        lineBreaks = True
+                        break  # only the first match on a line is split
+                if not lineBreaks and len(line) > 1:  # unsplit lines longer than one character are appended with a space and no newline; shorter ones are dropped
+                    cleaned.write(line + " ")
+
+
+def parseJames():  # Copies cleanedBible.txt to parsed.txt line by line while passing each line to getWords().
+    with open("cleanedBible.txt", 'r') as bible, open('parsed.txt', 'w') as parsed:
+        for line in bible:
+            getWords(line)  # side effect: appends to the module-level allWords / verses lists
+            parsed.write(line)  # the line is written exactly as it was read
+
+
+def getWords(line):  # Records one line's leading verse reference (if any) and its words in the module-level lists.
+    words = line.split()
+    if len(words) > 0:
+        if re.match("^[0-9:]*$",words[0]):  # first token consists only of digits and colons, e.g. "1:1"
+            verse = words[0]
+            verses.append(verse)
+        for i in range(1, len(words)):  # NOTE(review): the first token is skipped even when it was not a verse reference, so that word is never counted
+            allWords.append(words[i].strip(",:;.?!"))
+
+            # strip() above removes leading/trailing , : ; . ? ! from each word -- TODO: how to handle apostrophes is still undecided
+
+def uniqueWords():  # Prints the number of distinct strings in allWords (no case folding is applied).
+    print(len(set(allWords)))
+
+def numWords():  # Prints the total number of entries in allWords, duplicates included.
+    print(len(allWords))
+
+def toTSV():  # Writes kingjames.tsv from parsed.txt: a header row, then a Book/Chapter/Verse Number/Verse Text row for each processed line containing ":".
+    with open("parsed.txt", 'r') as bible, open('kingjames.tsv', 'w') as tsv:  # parsed.txt is the file written by parseJames()
+        tsv.write("Book\t Chapter\t Verse Number\t Verse Text\n") #header for csv
+        book = ""
+        for lno, line in enumerate(bible):
+            if lno < 29:
+                continue
+            if 'End of the Project Gutenberg EBook of The King James Bible' in line:
+                break
+            if line[:4] == "The " and not line[4].islower():
+                book = line[:len(line) -1].strip()
+                continue
+            if line.find(":") > -1:
+                print line
+                words = line.split()
+                print words
+                nums = words[0].split(":")
+                print nums
+                tsv.write(book + "\t" + nums[0] + "\t" + nums[1] + "\t" + line[len(words[0]) + 1:])
+
+#updates: yay!! csv file looks damn good
+
+
+clean()
+parseJames()  # must run after clean(): it reads cleanedBible.txt, which clean() writes
+toTSV()  # must run after parseJames(): it reads parsed.txt, which parseJames() writes
+#print(sorted(set(allWords)))
+
+#fixed the newline problem!
+#fixing this I think will help with the bible csv column issue too

From 9aeb2bb97b72a29df6c9a4ef8bb9bcf93142a9e8 Mon Sep 17 00:00:00 2001
From: sbcogan <sarahcogan1@gmail.com>
Date: Tue, 4 Apr 2017 12:36:56 -0400
Subject: [PATCH 2/3] commented out the continue block

fixed the problem peter texted me about
---
 chapter-parse.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/chapter-parse.py b/chapter-parse.py
index 1d7d9bf..163cc73 100644
--- a/chapter-parse.py
+++ b/chapter-parse.py
@@ -49,22 +49,18 @@ def toTSV():
         tsv.write("Book\t Chapter\t Verse Number\t Verse Text\n") #header for csv
         book = ""
         for lno, line in enumerate(bible):
-            if lno < 29:
-                continue
+            #if lno < 29:
+                #continue
             if 'End of the Project Gutenberg EBook of The King James Bible' in line:
                 break
             if line[:4] == "The " and not line[4].islower():
                 book = line[:len(line) -1].strip()
                 continue
             if line.find(":") > -1:
-                print line
                 words = line.split()
-                print words
                 nums = words[0].split(":")
-                print nums
                 tsv.write(book + "\t" + nums[0] + "\t" + nums[1] + "\t" + line[len(words[0]) + 1:])
 
-#updates: yay!! csv file looks damn good
 
 
 clean()

From 1c6bba59fe6e466feced0f81b5f5b2ef73ffce90 Mon Sep 17 00:00:00 2001
From: sbcogan <sarahcogan1@gmail.com>
Date: Sun, 9 Apr 2017 21:15:42 -0400
Subject: [PATCH 3/3] idk why u got out of bounds errors