67 changes: 21 additions & 46 deletions proj1.py
@@ -43,18 +43,16 @@

#There should be two zones: title and body. The body is the content of the file, while the title is encoded in the file name. IMPORTANT: when tokenizing the titles, make sure to separate words connected by _ (e.g., 101_Dalmatians_1996 is tokenized into 101 Dalmatians and 1996).
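# A minimal sketch of the title tokenization described above; the variable names
# below are illustrative and not part of the original file. Splitting the file
# name on '_' turns "101_Dalmatians_1996" into ['101', 'Dalmatians', '1996'].
example_doc_id = "101_Dalmatians_1996"
example_title_tokens = [tok for tok in example_doc_id.split('_') if tok]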


import sys
import re
import os
import argparse
from collections import defaultdict
#import nltk
#nltk.download('punkt')
takein = sys.stdin
#ZONE SCORING


#for arg in sys.argv:
#print(arg)

@@ -69,57 +67,34 @@ def create_zone_index(doc_dir, ind_dir):
    # doc_id, file_path, docnum and index_file are presumably defined in the
    # collapsed portion of this function above.
    title = doc_id[2:]
    print(title)
    file = open(file_path, 'r')
    index = defaultdict(list)

    for words in file:
        # split on whitespace; this also drops the trailing newline
        word = words.split()
        for i, tokens in enumerate(word):
            # record each body token's position as "docnum:position,"
            doc_pos = "{}:{},".format(docnum, i)
            index[tokens].append(doc_pos)

    # write the whole index once, after all lines have been processed
    print(index)
    line1 = "{}\n".format(index)
    index_file.write(line1)

    file.close()
    index_file.close()

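# Illustrative example (document number and tokens are hypothetical): after
# indexing document 3 whose body contains "101 Dalmatians", the in-memory
# index holds entries such as
#   index['101']        -> ['3:0,']
#   index['Dalmatians'] -> ['3:1,']
# and the whole dict is written to index.txt as a single line.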

def create_index(data, docnum):
    index = defaultdict(list)

    for i, tokens in enumerate(data):
        # str.strip returns a new string, so the result must be reassigned
        tokens = tokens.strip("'")
        index[tokens].append(i)

    return index

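# Hypothetical usage of create_index (the token list below is illustrative):
#   create_index(['101', 'Dalmatians', '1996'], 7)
# returns a mapping from each token to the positions where it occurs, e.g.
#   {'101': [0], 'Dalmatians': [1], '1996': [2]}
# (docnum is accepted for symmetry with create_zone_index but is not used here).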

def print_index(ind_dir):
    index_file = open(ind_dir + '/index.txt', 'r')
    print(index_file.read())
    index_file.close()



Expand All @@ -134,7 +109,8 @@ def main():
    what_to_do = sys.argv[1]
    if what_to_do == './create_zone_index':
        create_zone_index(sys.argv[2], sys.argv[3])
    elif what_to_do == './print_index':
        print_index(sys.argv[2])
    elif what_to_do == './zone_scorer':
        zone_scorer(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    else:
@@ -144,4 +120,3 @@

if __name__ == "__main__":
    main()
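# Example invocations of this script, based on the dispatch in main()
# (directory paths are illustrative):
#   python proj1.py ./create_zone_index ./documents ./index_dir
#   python proj1.py ./print_index ./index_dir
#   python proj1.py ./zone_scorer ...   # takes four further arguments, per main()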