From 2ea5f00eb6da33df119a006920dd96cf4a31dbe2 Mon Sep 17 00:00:00 2001
From: Davious1
Date: Fri, 9 Feb 2018 10:27:19 -0700
Subject: [PATCH 1/2] Update proj1.py

---
 proj1.py | 51 +++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/proj1.py b/proj1.py
index 83ba722..e27fdc0 100644
--- a/proj1.py
+++ b/proj1.py
@@ -43,17 +43,26 @@ #There should be two zones: title and body. The body is the content of the file, while the title is encoded in the file name. IMPORTANT: when tokenizing the titles, make sure to separate words connected by _ (e.g., 101_Dalmatians_1996 is tokenized into 101, Dalmatians, and 1996).
-
 import sys
 import re
 import os
 import argparse
 from collections import defaultdict
-import nltk
-nltk.download('punkt')
+#import nltk
+#nltk.download('punkt')
 takein = sys.stdin
 #ZONE SCORING
-
+'''
+How do we compute the score of a query-document pair?
+• If no query term occurs in the document, the score should be 0.
+• The more frequent a query term is in the document, the higher the score.
+• The more query terms occur in the document, the higher the score.
+We will look at a number of alternatives for doing this.
+
+weight(t,d) = (1 + log TF_{t,d}) · log(N / DF_t)
+'''
 #for arg in sys.argv:
     #print(arg)
@@ -70,37 +79,23 @@ def create_zone_index(doc_dir, ind_dir):
     print(title)
     file = open(file_path, 'r')
     index = defaultdict(list)
-
-    tokenized = nltk.data.load('tokenizers/punkt/english.pickle')
     for words in file:
+        #word = line.strip()
+        word = words.split(" ")
+        #print(word)
-
-        tokenized = nltk.data.load('tokenizers/punkt/english.pickle')
-        #word = tokenized.split(" ")
-        print(''.join(tokenized.tokenize(file.strip())))
+        #poop = create_index(word, docnum)
+        #index = defaultdict(list)
+        for i, tokens in enumerate(word):
-
-            #nltk part
-
-
-            tokenized = nltk.data.load('tokenizers/punkt/english.pickle')
-            print(''.join(tokenized.tokenize(text.strip())))
+            #print(tokens) #good one
+            #for token in tokens:
+                #print(token)
+            #tokens.strip("'")
             doc_pos = "{}:{},".format(docnum, i)
             index[tokens].append(doc_pos)
     print(index)
-
-    line1 = "{}\n".format(index)
-    index_file.write(line1)
-
-    file.close()
-    index_file.close()
-
-
-
-
-def create_index(data, docnum):
-    index = defaultdict(list)
     for i, tokens in enumerate(data):
         #print(tokens) #good one
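The docstring added in PATCH 1/2 quotes the standard tf-idf weight, weight(t,d) = (1 + log TF_{t,d}) · log(N / DF_t). Below is a minimal sketch of how that weight could be computed, assuming base-10 logarithms; the helper name tf_idf_weight and its parameters are illustrative assumptions, not code from proj1.py.

import math

def tf_idf_weight(tf, df, n_docs):
    # Sketch of weight(t,d) = (1 + log TF_{t,d}) * log(N / DF_t).
    # A term that is absent from the document (tf == 0) or from the
    # collection (df == 0) contributes a score of 0.
    if tf == 0 or df == 0:
        return 0.0
    return (1 + math.log10(tf)) * math.log10(n_docs / df)

# Example: a term occurring 3 times in a document, in 10 of 1000 documents.
print(tf_idf_weight(3, 10, 1000))  # (1 + log10(3)) * log10(100) = 2.95...

A query-document score would then sum this weight over the query terms that occur in the document, which satisfies all three bullet points in the docstring.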
From 3f9b5fd528c19097a381ac3383186a47703f37e9 Mon Sep 17 00:00:00 2001
From: Davious1
Date: Fri, 9 Feb 2018 11:10:09 -0700
Subject: [PATCH 2/2] Update proj1.py

Tried to merge the dictionary entries so that each term (e.g. 'there')
keeps a single posting list.

---
 proj1.py | 62 +++++++++++++++++++------------------------------------
 1 file changed, 21 insertions(+), 41 deletions(-)

diff --git a/proj1.py b/proj1.py
index e27fdc0..fa781da 100644
--- a/proj1.py
+++ b/proj1.py
@@ -52,17 +52,6 @@ #nltk.download('punkt')
 takein = sys.stdin
 #ZONE SCORING
-'''
-How do we compute the score of a query-document pair?
-• If no query term occurs in the document, the score should be 0.
-• The more frequent a query term is in the document, the higher the score.
-• The more query terms occur in the document, the higher the score.
-We will look at a number of alternatives for doing this.
-
-weight(t,d) = (1 + log TF_{t,d}) · log(N / DF_t)
-'''
 #for arg in sys.argv:
     #print(arg)
@@ -78,44 +67,35 @@ def create_zone_index(doc_dir, ind_dir):
     title = doc_id[2:]
     print(title)
     file = open(file_path, 'r')
     index = defaultdict(list)
     for words in file:
-        #word = line.strip()
         word = words.split(" ")
-        #print(word)
-        #poop = create_index(word, docnum)
-        #index = defaultdict(list)
         for i, tokens in enumerate(word):
-            #print(tokens) #good one
-            #for token in tokens:
-                #print(token)
-            #tokens.strip("'")
+            index2 = defaultdict(list)
             doc_pos = "{}:{},".format(docnum, i)
-            index[tokens].append(doc_pos)
-
-    print(index)
-
-    for i, tokens in enumerate(data):
-        #print(tokens) #good one
-        #for token in tokens:
-            #print(token)
-        tokens.strip("'")
-        index[tokens].append(i)
-        #poop = (tokens, docnum\n)
-        #print('\n')
-        ##line1 = "{}\t{}:{}{}".format(data, docnum, tokens, '\n') #tokens = word; index[tokens] = position of word in document
-        ##print(line1)
-        #print(index)
-        #index.write(line1)
-        #print(index)
+            index2[tokens].append(doc_pos)
+            index.update(index2)
+    print(index)
+    line1 = "{}".format(index)
+    index_file.write(line1)
+
+    file.close()
+    #index_string = "{}".format(index_file)
+    #for words in index_file:
+    index_file.close()
+
-    return index
+def print_index(data):
+    index_file = open(data + '/index.txt', 'r')
+    print(index_file.read())
+    index_file.close()
+
@@ -129,7 +109,8 @@ def main():
     what_to_do = sys.argv[1]
     if what_to_do == './create_zone_index':
         create_zone_index(sys.argv[2], sys.argv[3])
-        #print_index(sys.argv[2])
+    elif what_to_do == './print_index':
+        print_index(sys.argv[2])
     elif what_to_do == './zone_scorer':
         zone_scorer(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
     else:
@@ -139,4 +120,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
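PATCH 2/2 says it tried to merge the dictionary entries, but index.update(index2) replaces each key's whole posting list with the newest one rather than combining them, so a repeated term such as 'there' keeps only its latest posting. Below is a sketch of one way to actually merge postings, using the same 'docnum:position,' posting format as create_zone_index; merge_postings is a hypothetical helper, not part of proj1.py.

from collections import defaultdict

def merge_postings(index, new_postings):
    # Extend each term's posting list instead of replacing it, so every
    # term ends up with exactly one merged posting list.
    for term, postings in new_postings.items():
        index[term].extend(postings)
    return index

index = defaultdict(list)
merge_postings(index, {'there': ['1:0,']})  # 'there' at position 0 of doc 1
merge_postings(index, {'there': ['2:5,']})  # 'there' at position 5 of doc 2
print(index['there'])  # ['1:0,', '2:5,'] -- one posting list for 'there'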