# tf-idf.py
# Builds a TF-IDF matrix over a set of numbered text files with scikit-learn
# and prints the IDF weight of each vocabulary term.
import nltk
import math
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# Common filename prefix shared by every corpus document. The trailing space
# is intentional: the script appends "(<n>).txt" to this prefix, producing
# names such as ".../corpus/c (1).txt".
corpus_path='C:/Users/Punyajoy Saha/Desktop/corpus/c '
def filtered(filename):
    """Return the cleaned, space-separated words of one text file.

    The file is read as bytes and decoded as ASCII (bytes that cannot be
    decoded are dropped), tokenized with ``nltk.word_tokenize``, reduced to
    purely alphabetic tokens, lower-cased, and stripped of English stopwords.

    Args:
        filename: Path of the text file to read.

    Returns:
        A single string in which every kept word is preceded by one space
        (so a non-empty result starts with a space), or the empty string
        when no word survives the filtering.

    Raises:
        IOError: If the file cannot be opened (``OSError`` on Python 3).
    """
    # Binary mode plus an explicit decode behaves the same on Python 2 and 3,
    # and the ``with`` block closes the handle even if reading fails.
    with open(filename, 'rb') as doc:
        text = doc.read().decode('ascii', 'ignore')

    # Build the stopword set once: calling stopwords.words() inside the
    # per-token test rebuilds the list and searches it linearly every time.
    stop_words = set(stopwords.words('english'))

    # Lower-case BEFORE the stopword test. The stopword list is lower-case,
    # so testing the raw token lets capitalised stopwords such as "The"
    # slip through and end up in the output as "the".
    words = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            continue
        word = token.lower()
        if word not in stop_words:
            words.append(word)

    # Keep the historical output shape: each word is prefixed by one space.
    return "".join(" " + word for word in words)
# Number of documents to load; they are read from the files
# corpus_path + "(1).txt" through corpus_path + "(2364).txt".
NUM_DOCS = 2364

# One cleaned string per document, in file-number order.
corpus = []
for i in range(NUM_DOCS):
    print(i)  # progress indicator: zero-based index of the current document
    doc_path = corpus_path + '(' + str(i + 1) + ')' + '.txt'
    corpus.append(filtered(doc_path))

# Fit the vocabulary and IDF weights; X holds one TF-IDF row per document.
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)

# Use the public ``idf_`` attribute rather than the private ``_tfidf`` member.
idf = vectorizer.idf_

# Sorted copy kept for inspection only. It is NOT aligned with the feature
# names, so it must not be zipped with them.
idf_1 = sorted(idf)

# ``idf`` is index-aligned with get_feature_names(); pairing the names with
# the sorted values (as before) attached the wrong weight to each term.
print(dict(zip(vectorizer.get_feature_names(), idf)))
#dist=[(0,[(0,0)])]
#count=0
#for v in range(1698,2366):
# dist.append((v,[]))
# for u in range(0,1698):
# dista=0;
# for w in range(0,27528):
# dista=dista+pow((X[v,w]-X[u,w]),2)
# dista=math.sqrt(dista)
# dist[count][1].append((u ,dista))
# print u
# count =count+1
#
#
#
#