From 3bb054869c6a81fc946740429fbc235e0daddaad Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Thu, 4 Jan 2024 22:36:20 +0300 Subject: [PATCH 01/13] Reflect renamed module for LdaModel in gensim4. `LdaModel`, which was in `gensim.models.lda` (gensim3), is now implemented by `gensim.models.ldamodel` (gensim4). The proposed solution tries both and exits with an error if neither module can be imported. --- litstudy/nlp.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index b0486fa..fd85949 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -311,9 +311,16 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: :param num_topics: The number of topics to train. :param seed: The seed used for random number generation. - :param kwargs: Arguments passed to `gensim.models.lda.LdaModel`. + :param kwargs: Arguments passed to `gensim.models.lda.LdaModel` (gensim3) + or `gensim.models.ldamodel.LdaModel` (gensim4). """ - from gensim.models.lda import LdaModel + try: + from gensim.models.lda import LdaModel + except: + try: + from gensim.models.ldamodel import LdaModel + except: + sys.exit('LdaModel could not be imported from gensim 3 or 4.') dic = corpus.dictionary freqs = corpus.frequencies From ed4c14d3503825ef15700e71fefe6629383f2932 Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:10:58 +0300 Subject: [PATCH 02/13] Support gensim4 ldamodel Check whether version 3 or 4 of gensim is loaded and train the lda model accordingly. 
--- litstudy/nlp.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index fd85949..aa61cd4 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -314,18 +314,21 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: :param kwargs: Arguments passed to `gensim.models.lda.LdaModel` (gensim3) or `gensim.models.ldamodel.LdaModel` (gensim4). """ - try: - from gensim.models.lda import LdaModel - except: - try: - from gensim.models.ldamodel import LdaModel - except: - sys.exit('LdaModel could not be imported from gensim 3 or 4.') dic = corpus.dictionary freqs = corpus.frequencies - model = LdaModel(list(corpus), **kwargs) + from importlib.metadata import version + gensim_mayor=version('gensim').split('.')[0] + + if gensim_mayor == 3: + from gensim.models.lda import LdaModel + model = LdaModel(list(corpus), **kwargs) + elif gensim_mayor == 4: + from gensim.models.ldamodel import LdaModel + model = LdaModel(freqs,id2word=dic, **kwargs) + else: + sys.exit('LdaModel could not be imported from gensim 3 or 4.') doc2topic = corpus2dense(model[freqs], num_topics) topic2token = model.get_topics() From 98b7eb56aba51eb0ea87204c92b3246350fc2cb8 Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:12:46 +0300 Subject: [PATCH 03/13] Minor clean-up Just added a missing space. 
--- litstudy/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index aa61cd4..7cb46d4 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -326,7 +326,7 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: model = LdaModel(list(corpus), **kwargs) elif gensim_mayor == 4: from gensim.models.ldamodel import LdaModel - model = LdaModel(freqs,id2word=dic, **kwargs) + model = LdaModel(freqs, id2word=dic, **kwargs) else: sys.exit('LdaModel could not be imported from gensim 3 or 4.') From bcec9c03c285d73ef0499e326e84402400892517 Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:15:11 +0300 Subject: [PATCH 04/13] Import sys.exit() to raise error if gensim not available This error-handling may not comply with the general litstudy convention. --- litstudy/nlp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index 7cb46d4..f964b51 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -328,7 +328,8 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: from gensim.models.ldamodel import LdaModel model = LdaModel(freqs, id2word=dic, **kwargs) else: - sys.exit('LdaModel could not be imported from gensim 3 or 4.') + from sys import exit + exit('LdaModel could not be imported from gensim 3 or 4.') doc2topic = corpus2dense(model[freqs], num_topics) topic2token = model.get_topics() From b73962655c9d1a2ead93590857878ab139a74f2c Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:20:58 +0300 Subject: [PATCH 05/13] Fix gensim version check Convert the major version number to int as expected in the conditional. 
--- litstudy/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index f964b51..a4fc0c5 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -319,7 +319,7 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: freqs = corpus.frequencies from importlib.metadata import version - gensim_mayor=version('gensim').split('.')[0] + gensim_mayor=int(version('gensim').split('.')[0]) if gensim_mayor == 3: from gensim.models.lda import LdaModel From db995c146a311888ac2c8dee8cc865b177455828 Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:24:36 +0300 Subject: [PATCH 06/13] Pass num_topics to LdaModel Pass the num_topics parameter to LdaModel with gensim4. --- litstudy/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index a4fc0c5..48f8a30 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -326,7 +326,7 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: model = LdaModel(list(corpus), **kwargs) elif gensim_mayor == 4: from gensim.models.ldamodel import LdaModel - model = LdaModel(freqs, id2word=dic, **kwargs) + model = LdaModel(freqs, id2word=dic, num_topics=num_topics, **kwargs) else: from sys import exit exit('LdaModel could not be imported from gensim 3 or 4.') From 6f43d34781d63bb954509c2bb9b0565a62441daf Mon Sep 17 00:00:00 2001 From: Lars O Grobe <38878584+larsgrobe@users.noreply.github.com> Date: Fri, 5 Jan 2024 15:43:58 +0300 Subject: [PATCH 07/13] Fix train_lda_model() Added the missing '.T'. 
--- litstudy/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index 48f8a30..12945e6 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -331,7 +331,7 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: from sys import exit exit('LdaModel could not be imported from gensim 3 or 4.') - doc2topic = corpus2dense(model[freqs], num_topics) + doc2topic = corpus2dense(model[freqs], num_topics).T topic2token = model.get_topics() return TopicModel(dic, doc2topic, topic2token) From 53903a60db66036be43c68e23738e5491495b788 Mon Sep 17 00:00:00 2001 From: "Lars O. Grobe" Date: Wed, 31 Jan 2024 18:29:34 +0100 Subject: [PATCH 08/13] Cleaned formatting using black. --- litstudy/nlp.py | 8 ++++++-- litstudy/sources/scopus_csv.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index 12945e6..12fe1d2 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -319,17 +319,21 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: freqs = corpus.frequencies from importlib.metadata import version - gensim_mayor=int(version('gensim').split('.')[0]) + + gensim_mayor = int(version("gensim").split(".")[0]) if gensim_mayor == 3: from gensim.models.lda import LdaModel + model = LdaModel(list(corpus), **kwargs) elif gensim_mayor == 4: from gensim.models.ldamodel import LdaModel + model = LdaModel(freqs, id2word=dic, num_topics=num_topics, **kwargs) else: from sys import exit - exit('LdaModel could not be imported from gensim 3 or 4.') + + exit("LdaModel could not be imported from gensim 3 or 4.") doc2topic = corpus2dense(model[freqs], num_topics).T topic2token = model.get_topics() diff --git a/litstudy/sources/scopus_csv.py b/litstudy/sources/scopus_csv.py index 1ee3fad..2679f15 100644 --- a/litstudy/sources/scopus_csv.py +++ b/litstudy/sources/scopus_csv.py @@ -1,6 +1,7 @@ """ support loading Scopus CSV export. 
""" + from typing import List, Optional from ..types import Document, Author, DocumentSet, DocumentIdentifier, Affiliation from ..common import robust_open From 1cefc827c627bde601bdd1072a0e4041b0cdc956 Mon Sep 17 00:00:00 2001 From: "Lars O. Grobe" Date: Wed, 31 Jan 2024 19:14:01 +0100 Subject: [PATCH 09/13] Added experimental support for ensemble LDA implemented in gensim4. --- litstudy/nlp.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index 12fe1d2..dc05268 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -341,6 +341,29 @@ def train_lda_model(corpus: Corpus, num_topics, seed=0, **kwargs) -> TopicModel: return TopicModel(dic, doc2topic, topic2token) +def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) -> TopicModel: + """Train a topic model using ensemble LDA. + + :param num_topics: The number of topics to train. + :param num_models: The number of models to train. + :param seed: The seed used for random number generation. + :param kwargs: Arguments passed to `gensim.models.ensembelda.EnsembleLda` (gensim4). + """ + + if gensim_mayor <= 3: + from sys import exit + + exit("EnsembleLda requires at least gensim 4.") + + from gensim.models.ensembelda import EnsembleLda + + model = EnsembleLda(corpus=freqs, id2word=dic, num_topics=num_topics, num_models=num_models, **kwargs) + + doc2topic = corpus2dense(model[freqs], num_topics).T + topic2token = model.get_topics() + + return TopicModel(dic, doc2topic, topic2token) + def compute_word_distribution(corpus: Corpus, *, limit=None) -> pd.DataFrame: """Returns dataframe that indicates, for each word, the number of documents that mention that word. From 8bc42b21778e257f6fc19c1748f0fb0ecc048835 Mon Sep 17 00:00:00 2001 From: "Lars O. Grobe" Date: Wed, 31 Jan 2024 19:32:36 +0100 Subject: [PATCH 10/13] Enabled experimental ensemble LDA support. 
--- litstudy/nlp.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index dc05268..483c552 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -350,6 +350,13 @@ def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) :param kwargs: Arguments passed to `gensim.models.ensembelda.EnsembleLda` (gensim4). """ + dic = corpus.dictionary + freqs = corpus.frequencies + + from importlib.metadata import version + + gensim_mayor = int(version("gensim").split(".")[0]) + if gensim_mayor <= 3: from sys import exit @@ -357,13 +364,16 @@ def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) from gensim.models.ensembelda import EnsembleLda - model = EnsembleLda(corpus=freqs, id2word=dic, num_topics=num_topics, num_models=num_models, **kwargs) + model = EnsembleLda( + corpus=freqs, id2word=dic, num_topics=num_topics, num_models=num_models, **kwargs + ) doc2topic = corpus2dense(model[freqs], num_topics).T topic2token = model.get_topics() return TopicModel(dic, doc2topic, topic2token) + def compute_word_distribution(corpus: Corpus, *, limit=None) -> pd.DataFrame: """Returns dataframe that indicates, for each word, the number of documents that mention that word. From 28b332481b316ffdc61bbeb8a99f0ce45c619874 Mon Sep 17 00:00:00 2001 From: "Lars O. Grobe" Date: Wed, 31 Jan 2024 19:35:55 +0100 Subject: [PATCH 11/13] Enabled experimental ensemble LDA support. --- litstudy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litstudy/__init__.py b/litstudy/__init__.py index fcb125e..d96ce64 100644 --- a/litstudy/__init__.py +++ b/litstudy/__init__.py @@ -61,6 +61,7 @@ build_corpus, train_nmf_model, train_lda_model, + train_elda_model, compute_word_distribution, calculate_embedding, ) # noqa: F401 From ffbf7d425b333f278bc0593bc2173b91e973afda Mon Sep 17 00:00:00 2001 From: "Lars O. 
Grobe" Date: Wed, 31 Jan 2024 19:41:26 +0100 Subject: [PATCH 12/13] Corrected typo when importing ensemblelda. --- litstudy/nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index 483c552..81f3580 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -347,7 +347,7 @@ def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) :param num_topics: The number of topics to train. :param num_models: The number of models to train. :param seed: The seed used for random number generation. - :param kwargs: Arguments passed to `gensim.models.ensembelda.EnsembleLda` (gensim4). + :param kwargs: Arguments passed to `gensim.models.ensemblelda.EnsembleLda` (gensim4). """ dic = corpus.dictionary @@ -362,7 +362,7 @@ def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) exit("EnsembleLda requires at least gensim 4.") - from gensim.models.ensembelda import EnsembleLda + from gensim.models.ensemblelda import EnsembleLda model = EnsembleLda( corpus=freqs, id2word=dic, num_topics=num_topics, num_models=num_models, **kwargs From 48f42ea9a5df7d60fa1c645ce8eb7941845d7d76 Mon Sep 17 00:00:00 2001 From: "Lars O. Grobe" Date: Wed, 31 Jan 2024 20:59:10 +0100 Subject: [PATCH 13/13] Set default algorithm back to ldamulticore, which requires defining a main() function in the calling module. --- litstudy/nlp.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/litstudy/nlp.py b/litstudy/nlp.py index 81f3580..b939e6e 100644 --- a/litstudy/nlp.py +++ b/litstudy/nlp.py @@ -350,9 +350,6 @@ def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) :param kwargs: Arguments passed to `gensim.models.ensemblelda.EnsembleLda` (gensim4). 
""" - dic = corpus.dictionary - freqs = corpus.frequencies - from importlib.metadata import version gensim_mayor = int(version("gensim").split(".")[0]) @@ -362,10 +359,18 @@ def train_elda_model(corpus: Corpus, num_topics, num_models=4, seed=0, **kwargs) exit("EnsembleLda requires at least gensim 4.") + dic = corpus.dictionary + freqs = corpus.frequencies + from gensim.models.ensemblelda import EnsembleLda model = EnsembleLda( - corpus=freqs, id2word=dic, num_topics=num_topics, num_models=num_models, **kwargs + topic_model_class="ldamulticore", + corpus=freqs, + id2word=dic, + num_topics=num_topics, + num_models=num_models, + **kwargs ) doc2topic = corpus2dense(model[freqs], num_topics).T