-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathtfidf.py
80 lines (68 loc) · 2.36 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
import math
from document import Document
from textprocessor import TokenProcessor
class TFIDF:
    """Compute term-frequency and inverse-document-frequency statistics.

    Documents passed to these methods must expose a ``frequency_map``
    mapping (term -> occurrence count) and a ``get_term_count(term)``
    method; those are the only document members used here.
    """

    def __init__(self):
        # Not read by any method of this class; retained because external
        # code may rely on the attribute existing.
        self.documents = {}

    def calculate_term_frequency(self, document, term):
        """Return the normalized frequency of ``term`` in ``document``.

        The result is the term's count divided by the sum of all counts in
        ``document.frequency_map``. A term missing from the map counts as 0.
        When the counts sum to zero (for example an empty map) the result is
        0.0 rather than a ``ZeroDivisionError``.
        """
        frequency_map = document.frequency_map
        total_terms = sum(frequency_map.values())
        if not total_terms:
            # Nothing to normalize against; avoid dividing by zero.
            return 0.0
        return frequency_map.get(term, 0) / total_terms

    def calculate_inverse_document_frequency(self, documents, term):
        """Return the rareness of ``term`` across ``documents``.

        Computes ``log(N / n)`` where ``N`` is ``len(documents)`` and ``n`` is
        the number of documents whose ``get_term_count(term)`` is positive.
        Returns 0 when there are no documents or none of them contain the
        term, so the logarithm is never taken of an undefined ratio.
        """
        total_documents = len(documents)
        documents_with_term = sum(
            1 for document in documents if document.get_term_count(term) > 0
        )
        if total_documents > 0 and documents_with_term > 0:
            return math.log(total_documents / documents_with_term)
        return 0

    def calculate_tfidf_document(self, documents, document):
        """Return per-term TF and IDF values for ``document``.

        Produces one dict per key of ``document.frequency_map``, in the map's
        iteration order, with the keys ``"term"``, ``"tf"`` and ``"idf"``.
        The TF and IDF values are reported separately; their product is not
        computed here.
        """
        frequency_map = document.frequency_map
        # Sum the counts once instead of once per term.
        total_terms = sum(frequency_map.values())
        tfidf_list = []
        for term, count in frequency_map.items():
            tf = count / total_terms if total_terms else 0.0
            idf = self.calculate_inverse_document_frequency(documents, term)
            tfidf_list.append({"term": term, "tf": tf, "idf": idf})
        return tfidf_list
def main():
    """Load two sample documents, print them, then print TF and IDF for "i"."""
    tokenizer = TokenProcessor()

    def build(doc_id, path):
        # Shared preparation steps: load the file, extract terms, build the
        # frequency map, then print the document.
        loaded = Document(doc_id)
        loaded.load_from_file(path)
        loaded.extract_terms(tokenizer)
        loaded.generate_frequency_map()
        print(loaded)
        return loaded

    doc = build(1, "data/test")
    doc1 = build(2, "data/test2.txt")

    tfidf = TFIDF()
    print(tfidf.calculate_term_frequency(doc, "i"))
    print(tfidf.calculate_inverse_document_frequency([doc, doc1], "i"))


if __name__ == "__main__":
    main()