-
Notifications
You must be signed in to change notification settings - Fork 0
/
TFIDF.py
76 lines (52 loc) · 2.12 KB
/
TFIDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
def bag_of_words(texts: list[str]) -> pd.DataFrame:
    """Build a word-count (bag-of-words) matrix for a list of sentences.

    Sentences are tokenised with ``str.split()``: whitespace only, case
    sensitive, punctuation stays attached to the word.

    Args:
        texts: The sentences to analyse.

    Returns:
        A DataFrame with one row per sentence (indexed ``0..len(texts) - 1``)
        and one column per *distinct* word, in order of first appearance.
        Each cell holds how many times that word occurs in that sentence.
    """
    tokenized = [sentence.split() for sentence in texts]
    # dict.fromkeys drops repeats while keeping first-seen order, so a word
    # used several times gets exactly one column instead of one per occurrence
    # (duplicate labels make ``matrix[word]`` ambiguous).
    vocabulary = list(
        dict.fromkeys(word for tokens in tokenized for word in tokens)
    )
    # Build column-wise in one go; this also yields an integer dtype instead
    # of the object dtype produced by filling an empty frame cell by cell.
    counts = {
        word: [tokens.count(word) for tokens in tokenized]
        for word in vocabulary
    }
    return pd.DataFrame(counts, index=range(len(texts)))
def _tokenize(texts):
    """Split every sentence on whitespace; returns one token list per sentence."""
    return [sentence.split() for sentence in texts]


def _vocabulary(tokenized):
    """Return the distinct tokens of all sentences, in order of first appearance."""
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(word for tokens in tokenized for word in tokens))


def term_frequency(texts: list[str]) -> pd.DataFrame:
    """Compute the per-sentence density of every word.

    Sentences are tokenised with ``str.split()`` (whitespace only, case
    sensitive).

    Args:
        texts: The sentences to analyse.

    Returns:
        A DataFrame with one row per sentence (indexed ``0..len(texts) - 1``)
        and one column per distinct word, in order of first appearance.
        Each cell is ``occurrences of the word in the sentence / number of
        words in the sentence``. A sentence without any words gets ``0.0``
        in every column.
    """
    tokenized = _tokenize(texts)
    densities = {
        word: [
            # Guard the division: an empty/whitespace-only sentence has no
            # words, so its density is 0 rather than a ZeroDivisionError.
            tokens.count(word) / len(tokens) if tokens else 0.0
            for tokens in tokenized
        ]
        for word in _vocabulary(tokenized)
    }
    return pd.DataFrame(densities, index=range(len(texts)))


def invert_document_frequency(texts: list[str]) -> pd.DataFrame:
    """Compute the inverse document frequency of every word.

    The value used here is the plain ratio ``number of sentences / number of
    sentences containing the word`` (no logarithm is applied).

    Args:
        texts: The sentences to analyse.

    Returns:
        A DataFrame with one row per sentence (indexed ``0..len(texts) - 1``)
        and one column per distinct word, in order of first appearance.
        The ratio depends only on the word, so every row of a column holds
        the same value; the row layout exists so the frame lines up with
        ``term_frequency``.
    """
    tokenized = _tokenize(texts)
    # Sets give O(1) membership tests and are built once, instead of
    # re-splitting every sentence for every cell of the matrix.
    token_sets = [set(tokens) for tokens in tokenized]
    total = len(texts)
    ratios = {}
    for word in _vocabulary(tokenized):
        # Every vocabulary word comes from some sentence, so this is >= 1.
        containing = sum(word in tokens for tokens in token_sets)
        ratios[word] = [total / containing] * total
    return pd.DataFrame(ratios, index=range(total))


def TF_IDF(texts: list[str]) -> pd.DataFrame:
    """Compute the TF-IDF matrix for a list of sentences.

    Args:
        texts: The sentences to analyse.

    Returns:
        The element-wise product of ``term_frequency(texts)`` and
        ``invert_document_frequency(texts)``: one row per sentence and one
        column per distinct word.
    """
    return term_frequency(texts) * invert_document_frequency(texts)
if __name__ == "__main__":
    # Demo: print the TF-IDF matrix of a small corpus of English sentences.
    corpus = [
        'learning is my best entertainment',
        'she has some beautiful antique furniture',
        'I am not really depressed',
        'I would like to play tennis with you some day',
        'may I have another one, please',
        'why you did not come to the class yesterday',
        'did I play in your new tennis gym',
        'would you like to give me some suace',
        'how beautiful is your new furniture',
    ]

    print("The TF-IDF calculated for sentences:")
    tfidf_matrix = TF_IDF(corpus)
    print(tfidf_matrix)