# TF-IDF_Matrix.py
# CALCULATION OF TF-IDF VALUES
# TF-IDF = TF * IDF
import math
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Raw strings keep the Windows path backslashes and the regex escape intact.
corpus_root = r'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, r'resort.*\.txt')
print('\nThe following file ids are in this corpus:\n')
print(wordlists.fileids())
print("\nNumber of sentences in the file:")
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)
print('\nSentences are:\n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)
# Split the raw text on full stops; this assumes every '.' ends a sentence,
# so the count lines up with wordlists.sents().
sample = wordlists.raw('resort.txt')
s = sample.split('.')
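# A more robust alternative (not part of the original script; shown as a
# hedged sketch) is NLTK's trained sentence splitter, which handles
# abbreviations and decimal points better than a plain split on '.':
# from nltk.tokenize import sent_tokenize
# s = sent_tokenize(sample)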
# NUMBER OF TIMES A TERM APPEARS IN EACH SENTENCE
# NUMBER OF TERMS IN EACH SENTENCE
stop_words = set(stopwords.words('english'))
term_freq = []        # per sentence: frequency of each token, in token order
terms_count_doc = []  # per sentence: number of unique terms
for i in range(sencount):
    print("\nSentence " + str(i + 1))
    print(s[i])
    # Tokenize, keep alphabetic tokens only (drops punctuation and numbers),
    # and lowercase.
    word_tokens = word_tokenize(s[i])
    word_tokens = [word.lower() for word in word_tokens if word.isalpha()]
    # Remove stopwords.
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    # Frequency of each token within this sentence; rebuilt per sentence
    # so counts from earlier sentences do not leak in.
    wordfreq = [filtered_sentence.count(w) for w in filtered_sentence]
    print("\nPairwise (word, frequency) for sentence " + str(i + 1) + "\n")
    print(list(zip(filtered_sentence, wordfreq)))
    print("\nNumber of times a term appears in sentence " + str(i + 1) + "\n")
    print(tuple(wordfreq))
    term_freq.append(wordfreq)
    # Unique tokens of this sentence, preserving first-seen order.
    unique_tokens = []
    for x in filtered_sentence:
        if x not in unique_tokens:
            unique_tokens.append(x)
    print('\nNumber of unique tokens: ' + str(len(unique_tokens)))
    print(unique_tokens)
    terms_count_doc.append(len(unique_tokens))
print("\nNumber of terms in each sentence:")
print(terms_count_doc)
print("\nNumber of times a term appears in each sentence:\n")
print(term_freq)
# TF for each term: term count divided by the number of unique terms
# in the sentence.
for i in range(sencount):
    TF = []
    for j in range(len(term_freq[i])):
        x = term_freq[i][j] / terms_count_doc[i]
        TF.append(x)
    print("\nTF values for sentence " + str(i + 1))
    print(TF)
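# A minimal equivalent sketch (not part of the original script) using
# collections.Counter on a toy sentence; the denominator mirrors the
# script's choice of unique-terms-per-sentence rather than total tokens.
from collections import Counter

def tf_sketch(tokens):
    counts = Counter(tokens)            # raw term counts
    n_unique = len(counts)              # unique terms in the sentence
    return {w: c / n_unique for w, c in counts.items()}

print(tf_sketch(['resort', 'beach', 'resort']))  # {'resort': 1.0, 'beach': 0.5}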
# CALCULATION OF IDF VALUES
# IDF = log10(N / n)
#   N : number of documents (here, sentences)
#   n : number of sentences a term has appeared in
N = sencount
unique_tokens = []
sentence_tokens = []  # filtered tokens per sentence, reused for the n counts below
for i in range(sencount):
    # Same tokenization pipeline as above.
    word_tokens = word_tokenize(s[i])
    word_tokens = [word.lower() for word in word_tokens if word.isalpha()]
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    sentence_tokens.append(filtered_sentence)
    for x in filtered_sentence:
        if x not in unique_tokens:
            unique_tokens.append(x)
print("\nTotal unique tokens: " + str(len(unique_tokens)) + "\n")
print(unique_tokens)
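# Equivalent one-liner (a sketch, same result): dict.fromkeys gives an
# insertion-ordered de-duplication in Python 3.7+.
# unique_tokens = list(dict.fromkeys(tok for toks in sentence_tokens for tok in toks))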
# n : number of sentences a term has appeared in
count = []
for i in range(len(unique_tokens)):
    x = 0
    for j in range(N):
        # Check token membership, not raw substring containment in s[j],
        # so that e.g. 'art' does not match inside 'apartment'.
        if unique_tokens[i] in sentence_tokens[j]:
            x = x + 1
    count.append(x)
    print("\nNumber of sentences " + unique_tokens[i] + " has appeared in: " + str(x))
print("\nCombined number of sentences each term has appeared in:\n")
print(count)
# IDF values
IDF = []
for i in range(len(unique_tokens)):
    x = math.log10(N / count[i])
    print("\nIDF value of " + unique_tokens[i] + " is: " + str(x))
    IDF.append(x)
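# Worked example (illustrative numbers, not from the corpus): with
# N = 10 sentences and a term appearing in n = 2 of them,
# IDF = log10(10 / 2) = log10(5) ~= 0.699; a term that appears in
# every sentence gets IDF = log10(10 / 10) = 0.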
# COMBINED TF MATRIX (all TF values, sentence by sentence)
TFM = []
for i in range(sencount):
    for j in range(len(term_freq[i])):
        x = term_freq[i][j] / terms_count_doc[i]
        TFM.append(x)
print("\nCOMBINED REPRESENTATION of TF values:\n")
print(TFM)
# COMBINED IDF MATRIX (one IDF value per unique token)
IDFM = []
for i in range(len(unique_tokens)):
    x = math.log10(N / count[i])
    IDFM.append(x)
print("\nCOMBINED REPRESENTATION of IDF values:\n")
print(IDFM)
# COMBINED TF-IDF MATRIX
# TF_IDF = TF * IDF
# Each TF value must be multiplied by the IDF of the same term; pairing
# TFM and IDF by position would mismatch terms, since TFM is ordered by
# sentence while IDF is ordered by first appearance in the corpus.
idf_of = dict(zip(unique_tokens, IDF))
TF_IDF = []
for i in range(sencount):
    for j in range(len(term_freq[i])):
        tf = term_freq[i][j] / terms_count_doc[i]
        TF_IDF.append(tf * idf_of[sentence_tokens[i][j]])
print("\nCOMBINED REPRESENTATION of TF-IDF values:\n")
print(TF_IDF)
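# Optional cross-check (an assumption: scikit-learn is installed; it is
# not used anywhere above). TfidfVectorizer applies a smoothed IDF with
# natural log and L2-normalises each row, so its numbers will differ
# from the raw log10 values here, but the relative ordering of terms
# within a sentence should broadly agree.
# from sklearn.feature_extraction.text import TfidfVectorizer
# vec = TfidfVectorizer(stop_words='english')
# matrix = vec.fit_transform([sent for sent in s if sent.strip()])
# print(vec.get_feature_names_out())
# print(matrix.toarray())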