-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcoherence.py
80 lines (65 loc) · 2.81 KB
/
coherence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from preprocessing import Pulizia
from gensim.models import CoherenceModel
import gensim.corpora as corpora
import matplotlib.pyplot as plt
from preprocessing_es import *
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a c_v ``CoherenceModel``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(start, limit, step)
    if not topic_counts:
        # Empty range: nothing to train or score.
        return [], []

    # Import the model class explicitly. The file only binds `corpora` and
    # `CoherenceModel` from gensim, so the bare name `gensim` would resolve
    # only if a star import happened to leak it.
    from gensim.models.ldamodel import LdaModel

    model_list = []
    coherence_values = []
    for num_topics in topic_counts:
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=100,  # fixed seed so repeated runs are comparable
            update_every=1,
            chunksize=10,
            passes=10,
            alpha='symmetric',
            iterations=100,
            per_word_topics=True,
        )
        model_list.append(model)

        coherence_model = CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values
def graph_coherence(coherence_values, limit, start=2, step=1):
    """
    Plot coherence score against number of topics and print each score.

    Parameters:
    ----------
    coherence_values : Coherence scores, one per topic count in
        ``range(start, limit, step)``, in the same order
    limit : Upper bound on the number of topics (exclusive)
    start : First number of topics on the x axis
    step : Increment between successive numbers of topics
    """
    topic_counts = range(start, limit, step)

    # Show graph
    plt.plot(topic_counts, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # The trailing comma makes this a one-element tuple. A bare parenthesised
    # string is still a string, which legend() would iterate character by
    # character, labelling the line with only its first letter.
    plt.legend(("coherence_values",), loc='best')
    plt.show()

    # Print the score for each topic count.
    for num_topics, score in zip(topic_counts, coherence_values):
        print("Num Topics =", num_topics, " has Coherence Value of", round(score, 4))
if __name__ == '__main__':
    # Imported here so the script does not depend on `pd` leaking in through
    # the star import at the top of the file.
    import pandas as pd

    # Topic-count range shared by the coherence sweep and the plot, so the
    # two can never drift apart.
    start, limit, step = 2, 8, 1

    # Read the dataset, dropping malformed rows with a warning.
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`; older pandas rejects the new keyword with a
    # TypeError, in which case the legacy spelling is used.
    try:
        df = pd.read_csv("dataset/Dataset.csv", on_bad_lines='warn', sep=',')
    except TypeError:
        df = pd.read_csv("dataset/Dataset.csv", error_bad_lines=False, sep=',')

    # Coherence on the whole dataset: English rows only, first 100 of them.
    df = df.loc[(df['Lingua'] == 'en')]
    df = df[0:100]

    # Alternative run: coherence for the Latin genre in Spanish.
    # df = df.loc[(df['Lingua'] == 'es') & (df['Genere'] == 'Latin')]
    # data_classes = ['Latin']

    testo = df['Testo']
    # NOTE(review): Pulizia is presumably the text-cleaning step that returns
    # tokenised documents; confirm against the preprocessing module.
    data_ready = Pulizia(testo)

    # Create Dictionary
    id2word = corpora.Dictionary(data_ready)

    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_ready]

    model_list, coherence_values = compute_coherence_values(
        dictionary=id2word,
        corpus=corpus,
        texts=data_ready,
        start=start,
        limit=limit,
        step=step,
    )

    # Show graph
    graph_coherence(coherence_values, limit, start=start, step=step)