From 2ea5f00eb6da33df119a006920dd96cf4a31dbe2 Mon Sep 17 00:00:00 2001
From: Davious1
Date: Fri, 9 Feb 2018 10:27:19 -0700
Subject: [PATCH 1/2] Update proj1.py

---
 proj1.py | 51 +++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/proj1.py b/proj1.py
index 83ba722..e27fdc0 100644
--- a/proj1.py
+++ b/proj1.py
@@ -43,17 +43,26 @@ #There should be two zones: title and body. The body is the content of the file, while the title is encoded in the file name. IMPORTANT: when tokenizing the titles, make sure to separate words connected by _ (e.g., 101_Dalmatians_1996 is tokenized into 101, Dalmatians, and 1996).
-
 import sys
 import re
 import os
 import argparse
 from collections import defaultdict
-import nltk
-nltk.download('punkt')
+#import nltk
+#nltk.download('punkt')
 takein = sys.stdin
 #ZONE SCORING
-
+'''
+How do we compute the score of a query-document pair?
+• If no query term occurs in the document, the score should be 0.
+• The more frequent a query term is in the document, the higher the score.
+• The more query terms occur in the document, the higher the score.
+We will look at a number of alternatives for doing this.
+
+weight(t,d) = (1 + log TF_{t,d}) · log(N / DF_t)
+'''
 #for arg in sys.argv:
     #print(arg)
@@ -70,37 +79,23 @@ def create_zone_index(doc_dir, ind_dir):
     print(title)
     file = open(file_path, 'r')
     index = defaultdict(list)
-
-    tokenized = nltk.data.load('tokenizers/punkt/english.pickle')
     for words in file:
+        #word = line.strip()
+        word = words.split(" ")
+        #print(word)
-
-        tokenized = nltk.data.load('tokenizers/punkt/english.pickle')
-        #word = tokenized.split(" ")
-        print(''.join(tokenized.tokenize(file.strip())))
+        #poop = create_index(word, docnum)
+        #index = defaultdict(list)
+        for i, tokens in enumerate(word):
-
-            #nltk part
-
-
-            tokenized = nltk.data.load('tokenizers/punkt/english.pickle')
-            print(''.join(tokenized.tokenize(text.strip())))
+            #print(tokens) #good one
+            #for token in tokens:
+                #print(token)
+            #tokens.strip("'")
             doc_pos = "{}:{},".format(docnum, i)
             index[tokens].append(doc_pos)
     print(index)
-
-    line1 = "{}\n".format(index)
-    index_file.write(line1)
-
-    file.close()
-    index_file.close()
-
-
-
-
-def create_index(data, docnum):
-    index = defaultdict(list)
     for i, tokens in enumerate(data):
         #print(tokens) #good one
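The docstring added in PATCH 1/2 quotes the standard tf-idf weight, weight(t,d) = (1 + log TF_{t,d}) · log(N / DF_t). Below is a minimal sketch of how that weight could be computed, assuming base-10 logarithms; the helper name tf_idf_weight and its parameters are illustrative assumptions, not code from proj1.py.

import math

def tf_idf_weight(tf, df, n_docs):
    # Sketch of weight(t,d) = (1 + log TF_{t,d}) * log(N / DF_t).
    # A term that is absent from the document (tf == 0) or from the
    # collection (df == 0) contributes a score of 0.
    if tf == 0 or df == 0:
        return 0.0
    return (1 + math.log10(tf)) * math.log10(n_docs / df)

# Example: a term occurring 3 times in a document, in 10 of 1000 documents.
print(tf_idf_weight(3, 10, 1000))  # (1 + log10(3)) * log10(100) = 2.95...

A query-document score would then sum this weight over the query terms that occur in the document, which satisfies all three bullet points in the docstring.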
From 3f9b5fd528c19097a381ac3383186a47703f37e9 Mon Sep 17 00:00:00 2001
From: Davious1
Date: Fri, 9 Feb 2018 11:10:09 -0700
Subject: [PATCH 2/2] Update proj1.py

Tried to merge the dictionary entries so that each term (e.g. 'there')
keeps a single posting list.

---
 proj1.py | 62 +++++++++++++++++++------------------------------------
 1 file changed, 21 insertions(+), 41 deletions(-)

diff --git a/proj1.py b/proj1.py
index e27fdc0..fa781da 100644
--- a/proj1.py
+++ b/proj1.py
@@ -52,17 +52,6 @@ #nltk.download('punkt')
 takein = sys.stdin
 #ZONE SCORING
-'''
-How do we compute the score of a query-document pair?
-• If no query term occurs in the document, the score should be 0.
-• The more frequent a query term is in the document, the higher the score.
-• The more query terms occur in the document, the higher the score.
-We will look at a number of alternatives for doing this.
-
-weight(t,d) = (1 + log TF_{t,d}) · log(N / DF_t)
-'''
 #for arg in sys.argv:
     #print(arg)
@@ -78,44 +67,35 @@ def create_zone_index(doc_dir, ind_dir):
     title = doc_id[2:]
     print(title)
     file = open(file_path, 'r')
     index = defaultdict(list)
     for words in file:
-        #word = line.strip()
         word = words.split(" ")
-        #print(word)
-        #poop = create_index(word, docnum)
-        #index = defaultdict(list)
         for i, tokens in enumerate(word):
-            #print(tokens) #good one
-            #for token in tokens:
-                #print(token)
-            #tokens.strip("'")
+            index2 = defaultdict(list)
             doc_pos = "{}:{},".format(docnum, i)
-            index[tokens].append(doc_pos)
-
-    print(index)
-
-    for i, tokens in enumerate(data):
-        #print(tokens) #good one
-        #for token in tokens:
-            #print(token)
-        tokens.strip("'")
-        index[tokens].append(i)
-        #poop = (tokens, docnum\n)
-        #print('\n')
-        ##line1 = "{}\t{}:{}{}".format(data, docnum, tokens, '\n') #tokens = word; index[tokens] = position of word in document
-        ##print(line1)
-        #print(index)
-        #index.write(line1)
-        #print(index)
+            index2[tokens].append(doc_pos)
+            index.update(index2)
+    print(index)
+    line1 = "{}".format(index)
+    index_file.write(line1)
+
+    file.close()
+    #index_string = "{}".format(index_file)
+    #for words in index_file:
+    index_file.close()
+
-    return index
+def print_index(data):
+    index_file = open(data + '/index.txt', 'r')
+    print(index_file.read())
+    index_file.close()
+
@@ -129,7 +109,8 @@ def main():
     what_to_do = sys.argv[1]
     if what_to_do == './create_zone_index':
         create_zone_index(sys.argv[2], sys.argv[3])
-        #print_index(sys.argv[2])
+    elif what_to_do == './print_index':
+        print_index(sys.argv[2])
     elif what_to_do == './zone_scorer':
         zone_scorer(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
     else:
@@ -139,4 +120,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
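PATCH 2/2 says it tried to merge the dictionary entries, but index.update(index2) replaces each key's whole posting list with the newest one rather than combining them, so a repeated term such as 'there' keeps only its latest posting. Below is a sketch of one way to actually merge postings, using the same 'docnum:position,' posting format as create_zone_index; merge_postings is a hypothetical helper, not part of proj1.py.

from collections import defaultdict

def merge_postings(index, new_postings):
    # Extend each term's posting list instead of replacing it, so every
    # term ends up with exactly one merged posting list.
    for term, postings in new_postings.items():
        index[term].extend(postings)
    return index

index = defaultdict(list)
merge_postings(index, {'there': ['1:0,']})  # 'there' at position 0 of doc 1
merge_postings(index, {'there': ['2:5,']})  # 'there' at position 5 of doc 2
print(index['there'])  # ['1:0,', '2:5,'] -- one posting list for 'there'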