-
Notifications
You must be signed in to change notification settings - Fork 0
/
gensim_helpers.py
129 lines (101 loc) · 4.76 KB
/
gensim_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
pyLDAvis Gensim
===============
Helper functions to visualize LDA models trained by Gensim
"""
from __future__ import absolute_import
import funcy as fp
import numpy as np
from scipy.sparse import issparse
import _prepare as vis_prepare
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    """Extract the data structures pyLDAvis needs from a trained Gensim model.

    Parameters
    ----------
    topic_model : gensim LdaModel, or an HDP-style model exposing
        ``lda_alpha``/``lda_beta`` attributes.
    corpus : bag-of-words corpus (iterable of ``(term_id, count)`` docs) or a
        sparse term-document matrix already in CSC form.
    dictionary : gensim.corpora.Dictionary used to build the corpus.
    doc_topic_dists : optional pre-computed document-topic distribution
        (list-of-tuples corpus, sparse matrix, or dense array). Inferred from
        the model when ``None``.

    Returns
    -------
    dict with keys ``topic_term_dists``, ``doc_topic_dists``, ``doc_lengths``,
    ``vocab`` and ``term_frequency``.
    """
    import gensim

    # Normalise the corpus into both forms needed below: a CSC matrix for the
    # term/document statistics, and a streaming gensim corpus for len() and
    # the inference calls.
    if not gensim.matutils.ismatrix(corpus):
        corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
    corpus = gensim.matutils.Sparse2Corpus(corpus_csc)

    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    # Permutation that maps CSC row order onto dictionary-id order, so the
    # term frequencies and topic-term columns line up with `vocab`.
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta  # avoid exact zeros (see note above)
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary),\
        'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus),\
        'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    # HDP-style models expose lda_alpha/lda_beta instead of num_topics/state.
    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        # BUG FIX: the original divided by the 1-D row sum
        # (`... / doc_topic_dists.sum(axis=1)`), which only broadcasts
        # correctly for np.matrix (the todense() branch) or by accident when
        # n_docs == n_topics. Normalise as a plain ndarray with an explicit
        # keep-dims axis, matching the inference branch above.
        doc_topic_dists = np.asarray(doc_topic_dists)
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)[:, None]

    assert doc_topic_dists.shape[1] == num_topics,\
        'Document topics and number of topics do not match {} != {}'.format(
        doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    # Reorder columns so they follow the same dictionary-id order as `vocab`.
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
def prepare(topic_model, corpus, dictionary, doc_topic_dist=None, **kwargs):
    """Transforms the Gensim TopicModel and related corpus and dictionary into
    the data structures needed for the visualization.

    Parameters
    ----------
    topic_model : gensim.models.ldamodel.LdaModel
        An already trained Gensim LdaModel. The other gensim model types are
        not supported (PRs welcome).
    corpus : array-like list of bag of word docs in tuple form or scipy CSC matrix
        The corpus in bag of word form, the same docs used to train the model.
        The corpus is transformed into a csc matrix internally, if you intend to
        call prepare multiple times it is a good idea to first call
        `gensim.matutils.corpus2csc(corpus)` and pass in the csc matrix instead.
        For example: [(50, 3), (63, 5), ....]
    dictionary: gensim.corpora.Dictionary
        The dictionary object used to create the corpus. Needed to extract the
        actual terms (not ids).
    doc_topic_dist (optional): Document topic distribution from LDA (default=None)
        The document topic distribution that is eventually visualised, if you will
        be calling `prepare` multiple times it's a good idea to explicitly pass in
        `doc_topic_dist` as inferring this for large corpora can be quite
        expensive.
    **kwargs :
        additional keyword arguments are passed through to :func:`pyldavis.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization

    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb

    See
    ------
    See `pyLDAvis.prepare` for **kwargs.
    """
    opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
    # BUG FIX: the extracted data must be run through pyLDAvis' prepare to
    # produce the PreparedData promised by the docstring; returning the raw
    # `opts` dict ignored the `vis_prepare` import and broke the documented
    # contract (including the advertised **kwargs pass-through).
    # NOTE(review): `_prepare` is imported as a module at the top of this
    # file, hence the attribute access; upstream pyLDAvis imports the
    # function directly — confirm against the actual import.
    return vis_prepare.prepare(**opts)