From 8ec413a6cc931cacabb718e22ea79fb4b90cf24f Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 09:21:06 +0200 Subject: [PATCH 01/15] fix docs Signed-off-by: Tim Schopf --- keyphrase_vectorizers/keyphrase_count_vectorizer.py | 13 +++++++++---- keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py index d379bf2..e39fa29 100644 --- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py +++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py @@ -178,6 +178,7 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm' self.decay = decay self.delete_min_df = delete_min_df self.running_fit_transform = False + self.X_ = None def fit(self, raw_documents: List[str]) -> object: """ @@ -458,11 +459,15 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix: that do not exceed `self.delete_min_df` are removed from its vocabulary and bag-of-keywords matrix. - Arguments: - raw_documents: A list of documents + Parameters + ---------- + raw_documents : iterable + An iterable of strings. - Returns: - X_: Bag-of-keywords matrix + Returns + ------- + X_ : scipy.sparse.csr_matrix + Bag-of-keywords matrix """ if hasattr(self, "X_"): diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py index 596bc0b..ab553a4 100644 --- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py +++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py @@ -185,6 +185,7 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm' self.use_idf = use_idf self.smooth_idf = smooth_idf self.sublinear_tf = sublinear_tf + self.X_ = None self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, smooth_idf=self.smooth_idf, sublinear_tf=self.sublinear_tf) From 2173e4f6179508fabd162ab55d54ef6c34e29455 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 09:56:31 +0200 Subject: [PATCH 02/15] fix docs Signed-off-by: Tim Schopf --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00dbc9d..d8ad4b8 100644 --- a/README.md +++ b/README.md @@ -434,7 +434,7 @@ vectorizer.fit(docs) keyphrases = vectorizer.get_feature_names_out() print(keyphrases) ->>>['output value' 'information retrieval' 'algorithm' 'vector' 'groups' +>> > ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' 'main topics' 'task' 'precise summary' 'supervised learning' 'inductive bias' 'information retrieval environment' 'supervised learning algorithm' 'function' 'input' 'pair' @@ -790,7 +790,8 @@ vectorizer.X_.toarray() [Back to Table of Contents](#toc) When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry: -``` + +```plaintext @conference{schopf_etal_kdir22, author={Tim Schopf and Simon Klimek and Florian Matthes}, title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction}, From be1e841b25c27ef24b446061ff54f538e6cdd5c5 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 09:58:35 +0200 Subject: [PATCH 03/15] fix docs Signed-off-by: Tim Schopf --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d8ad4b8..21dd684 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,7 @@ vectorizer = 
KeyphraseTfidfVectorizer() # Print parameters print(vectorizer.get_params()) ->> > {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < +>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < class 'numpy.int64'>, 'lowercase': True, 'max_df': None @@ -434,7 +434,7 @@ vectorizer.fit(docs) keyphrases = vectorizer.get_feature_names_out() print(keyphrases) ->> > ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' +>>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' 'main topics' 'task' 'precise summary' 'supervised learning' 'inductive bias' 'information retrieval environment' 'supervised learning algorithm' 'function' 'input' 'pair' @@ -735,12 +735,12 @@ vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3) # intitial vectorizer fit vectorizer.fit_transform([docs[0]]).toarray() ->> > array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, +>>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1]]) # check learned keyphrases print(vectorizer.get_feature_names_out()) ->> > ['output pairs', 'output value', 'function', 'optimal scenario', +>>> ['output pairs', 'output value', 'function', 'optimal scenario', 'pair', 'supervised learning', 'supervisory signal', 'algorithm', 'supervised learning algorithm', 'way', 'training examples', 'input object', 'example', 'machine', 'output', @@ -751,12 +751,12 @@ print(vectorizer.get_feature_names_out()) # learn additional keyphrases from new documents with partial fit vectorizer.partial_fit([docs[1]]) vectorizer.transform([docs[1]]).toarray() ->> > array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +>>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]]) # check learned keyphrases, including newly learned ones print(vectorizer.get_feature_names_out()) ->> > ['output pairs', 'output value', 'function', 'optimal scenario', +>>> ['output pairs', 'output value', 'function', 'optimal scenario', 'pair', 'supervised learning', 'supervisory signal', 'algorithm', 'supervised learning algorithm', 'way', 'training examples', 'input object', 'example', 'machine', 'output', @@ -771,16 +771,16 @@ print(vectorizer.get_feature_names_out()) # update list of learned keyphrases according to 'delete_min_df' vectorizer.update_bow([docs[1]]) vectorizer.transform([docs[1]]).toarray() ->> > array([[5, 5]]) +>>> array([[5, 5]]) # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain) print(vectorizer.get_feature_names_out()) ->> > ['keywords', 'document'] +>>> ['keywords', 'document'] # update again and check the impact of 'decay' on the learned document-keyphrase matrix vectorizer.update_bow([docs[1]]) vectorizer.X_.toarray() ->> > array([[7.5, 7.5]]) +>>> array([[7.5, 7.5]]) ``` From 5fc495bddf8e6bda8b1c0d1c15dc00d0e2a7024a Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 10:50:21 +0200 Subject: [PATCH 04/15] fix docs Signed-off-by: Tim Schopf --- .readthedocs.yaml | 3 +-- README.md | 14 +++++++------- docs/requirements.txt | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e5df02f..8b07701 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,7 +18,6 @@ formats: all python: install: - requirements: docs/requirements.txt - - requirements: requirements.txt - method: 
pip path: . extra_requirements: @@ -27,7 +26,7 @@ python: build: os: ubuntu-22.04 tools: - python: "3.7" + python: "3.8" submodules: include: all diff --git a/README.md b/README.md index 21dd684..3dd9aae 100644 --- a/README.md +++ b/README.md @@ -127,9 +127,12 @@ vectorizer = KeyphraseCountVectorizer() # Print parameters print(vectorizer.get_params()) +``` +```plaintext >>> {'binary': False, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} ``` + By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the English spaCy part-of-speech tags. In addition, the spaCy pipeline @@ -255,14 +258,11 @@ vectorizer = KeyphraseTfidfVectorizer() # Print parameters print(vectorizer.get_params()) ->>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < - - -class 'numpy.int64'>, 'lowercase': True, 'max_df': None - -, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner', - 'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} ``` +```plaintext +>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner','textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} +``` + To calculate tf values instead, set `use_idf=False`. diff --git a/docs/requirements.txt b/docs/requirements.txt index 8a3ea7e..0186de7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -14,6 +14,7 @@ docutils>=0.16 numpy>=1.18.5 spacy>=3.0.1 spacy-transformers>=1.1.6 +spacy-curated-transformers>=0.2.2 nltk>=3.6.1 scikit-learn>=1.0 scipy>=1.7.3 From d1257dcf0fbace28f9ff561f0fe2191422a0d6e3 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 10:55:53 +0200 Subject: [PATCH 05/15] fix docs Signed-off-by: Tim Schopf --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index df165fc..4667242 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,7 +35,7 @@ # ones. 
 extensions = [
     'sphinx_rtd_theme',
-    'recommonmark',
+    'myst_parser',
     'sphinx.ext.autodoc',
     'sphinx.ext.napoleon',
     'sphinx_markdown_tables',
@@ -64,7 +64,7 @@
 master_doc = 'index'

 source_parsers = {
-    '.md': 'recommonmark.parser.CommonMarkParser',
+    '.md': 'myst_parser.parser.MystParser',
 }

 source_suffix = ['.rst', '.md']

From a5e0f99dd9cb95be06ffa1f570f16f139cbd6409 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 12:04:26 +0200
Subject: [PATCH 06/15] fix docs

Signed-off-by: Tim Schopf
---
 README.md    | 14 +++++++-------
 docs/conf.py |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 3dd9aae..21dd684 100644
--- a/README.md
+++ b/README.md
@@ -127,12 +127,9 @@
 vectorizer = KeyphraseCountVectorizer()

 # Print parameters
 print(vectorizer.get_params())
-```
-```plaintext
 >>> {'binary': False, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
 ```
-
 By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is
 specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives,
 followed by 1 or more nouns using the English spaCy part-of-speech tags. In addition, the spaCy pipeline
@@ -258,11 +255,14 @@
 vectorizer = KeyphraseTfidfVectorizer()

 # Print parameters
 print(vectorizer.get_params())
-```
-```plaintext
->>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner','textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
-```
+>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
+
+class 'numpy.int64'>, 'lowercase': True, 'max_df': None
+
+, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner',
+ 'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
+```

 To calculate tf values instead, set `use_idf=False`.

diff --git a/docs/conf.py b/docs/conf.py
index 4667242..df165fc 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -35,7 +35,7 @@
 # ones.
extensions = [ 'sphinx_rtd_theme', - 'myst_parser', + 'recommonmark', 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx_markdown_tables', @@ -64,7 +64,7 @@ master_doc = 'index' source_parsers = { - '.md': 'myst_parser.parser.MystParser', + '.md': 'recommonmark.parser.CommonMarkParser', } source_suffix = ['.rst', '.md'] From e01123a11b9b68f2a784b5cf97c8d02b01a839e4 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 12:07:35 +0200 Subject: [PATCH 07/15] fix docs Signed-off-by: Tim Schopf --- .readthedocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8b07701..8aac750 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ version: 2 sphinx: configuration: docs/conf.py builder: html - fail_on_warning: true + fail_on_warning: false # Optionally build your docs in additional formats such as PDF formats: all From eede5fea3e10ccf010127d9b5036dcb695877189 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 13:00:52 +0200 Subject: [PATCH 08/15] add tests Signed-off-by: Tim Schopf --- .readthedocs.yaml | 2 +- tests/requirements.txt | 4 ++- tests/test_vectorizers.py | 47 ++++++++++++++++++++++++++++ tests/utils.py | 64 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8aac750..4e9a238 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -26,7 +26,7 @@ python: build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.7" submodules: include: all diff --git a/tests/requirements.txt b/tests/requirements.txt index a32ad4e..8c11f57 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,4 +1,6 @@ pytest>=7.0.1 keybert>=0.5.0 flair==0.11.3 -scipy==1.7.3 \ No newline at end of file +scipy==1.7.3 +bertopic>=0.16.1 +datasets==2.13.2 \ No newline at end of file diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py index d2372a5..2be060d 100644 --- a/tests/test_vectorizers.py +++ b/tests/test_vectorizers.py @@ -2,6 +2,8 @@ import flair import spacy +from bertopic import BERTopic +from datasets import load_dataset from flair.models import SequenceTagger from flair.tokenization import SegtokSentenceSplitter from keybert import KeyBERT @@ -132,3 +134,48 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag keyphrases = vectorizer.get_feature_names_out() assert sorted(keyphrases) == sorted_english_test_keyphrases + + +def test_online_vectorizer(): + first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix() + second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix() + first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases() + english_keyphrases = utils.get_english_test_keyphrases() + frequencies_after_min_df = utils.get_frequencies_after_min_df() + frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df() + frequencies_after_bow = utils.get_frequencies_after_bow() + + # intitial vectorizer fit + vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3) + + assert [sorted(count_list) for count_list in + vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix + assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases + + # learn additional keyphrases from new documents with partial fit + vectorizer.partial_fit([english_docs[1]]) + + assert [sorted(count_list) for count_list in + 
vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix + assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases + + # update list of learned keyphrases according to 'delete_min_df' + vectorizer.update_bow([english_docs[1]]) + assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all() + + # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain) + assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df + + # update again and check the impact of 'decay' on the learned document-keyphrase matrix + vectorizer.update_bow([english_docs[1]]) + assert (vectorizer.X_.toarray() == frequencies_after_bow).all() + + +def test_bertopic(): + data = load_dataset("ag_news") + texts = data['train']['text'] + texts = texts[:100] + topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer()) + topics, probs = topic_model.fit_transform(documents=texts) + new_topics = topic_model.reduce_outliers(texts, topics) + topic_model.update_topics(texts, topics=new_topics) diff --git a/tests/utils.py b/tests/utils.py index d4c275e..02b8fc3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +import numpy as np def get_english_test_docs(): english_docs = ["""Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a @@ -56,6 +57,36 @@ def get_english_test_keyphrases(): return sorted_english_test_keyphrases +def get_english_first_doc_test_keyphrases(): + sorted_english_first_doc_test_keyphrases = ['algorithm', + 'class labels', + 'example', + 'function', + 'inductive bias', + 'input', + 'input object', + 'machine', + 'new examples', + 'optimal scenario', + 'output', + 'output pairs', + 'output value', + 'pair', + 'set', + 'supervised learning', + 'supervised learning algorithm', + 'supervisory signal', + 'task', + 'training data', + 'training examples', + 'unseen instances', + 'unseen situations', + 'vector', + 'way'] + + return sorted_english_first_doc_test_keyphrases + + def get_sorted_english_keyphrases_custom_flair_tagger(): sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content', 'document relevance', @@ -102,6 +133,21 @@ def get_sorted_english_count_matrix(): return sorted_english_count_matrix +def get_sorted_english_first_doc_count_matrix(): + sorted_english_first_doc_count_matrix = [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]] + + return sorted_english_first_doc_count_matrix + + +def get_sorted_english_second_doc_count_matrix(): + sorted_english_second_doc_count_matrix = [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 5, 5]] + + return sorted_english_second_doc_count_matrix + + def get_sorted_french_count_matrix(): sorted_french_count_matrix = [[1, 1, 1, 1]] @@ -130,3 +176,21 @@ def get_english_keybert_keyphrases(): 'document content']] return english_keybert_keyphrases + + +def get_frequencies_after_min_df(): + frequency_array = np.array([[5, 5]]) + + return frequency_array + + +def get_frequencies_after_bow(): + frequency_array = np.array([[7.5, 7.5]]) + + return frequency_array + + +def get_frequent_keyphrases_after_min_df(): + keyphrases = ['document', 'keywords'] + + return keyphrases From 175caa68dcaeaae371ed022ba7db6bbea6c6896e Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 
13:07:02 +0200
Subject: [PATCH 09/15] update test requirements.txt

Signed-off-by: Tim Schopf
---
 tests/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 8c11f57..4f8397b 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -3,4 +3,5 @@ keybert>=0.5.0
 flair==0.11.3
 scipy==1.7.3
 bertopic>=0.16.1
-datasets==2.13.2
\ No newline at end of file
+datasets==2.13.2
+importlib-metadata==6.7.0
\ No newline at end of file

From 38fc6e2e3c4e312e29bb6251a7c351f0fcd6e2c0 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 13:33:03 +0200
Subject: [PATCH 10/15] update tests/requirements.txt

Signed-off-by: Tim Schopf
---
 tests/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 4f8397b..654e66c 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -4,4 +4,4 @@ flair==0.11.3
 scipy==1.7.3
 bertopic>=0.16.1
 datasets==2.13.2
-importlib-metadata==6.7.0
\ No newline at end of file
+umap-learn==0.5.4
\ No newline at end of file

From b1e1ff4019b5a2c1c28a80c974f73f92a4ddc4fa Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 14:12:59 +0200
Subject: [PATCH 11/15] fix online vectorizers bug

Signed-off-by: Tim Schopf
---
 keyphrase_vectorizers/keyphrase_count_vectorizer.py | 1 -
 keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index e39fa29..d570391 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -178,7 +178,6 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.decay = decay
         self.delete_min_df = delete_min_df
         self.running_fit_transform = False
-        self.X_ = None

     def fit(self, raw_documents: List[str]) -> object:
         """
diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
index ab553a4..596bc0b 100644
--- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -185,7 +185,6 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.use_idf = use_idf
         self.smooth_idf = smooth_idf
         self.sublinear_tf = sublinear_tf
-        self.X_ = None

         self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, smooth_idf=self.smooth_idf,
                                        sublinear_tf=self.sublinear_tf)

From c3bc1230e2673f2b4e3f2b4f10c0c4a6d5a07972 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 14:31:05 +0200
Subject: [PATCH 12/15] update BERTopic test

Signed-off-by: Tim Schopf
---
 tests/requirements.txt    | 2 +-
 tests/test_vectorizers.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 654e66c..86dd5f1 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -3,5 +3,5 @@ keybert>=0.5.0
 flair==0.11.3
 scipy==1.7.3
 bertopic>=0.16.1
-datasets==2.13.2
+scikit-learn>=1.0.1
 umap-learn==0.5.4
\ No newline at end of file
diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py
index 2be060d..721e073 100644
--- a/tests/test_vectorizers.py
+++ b/tests/test_vectorizers.py
@@ -3,10 +3,10 @@ import flair
 import spacy
 from bertopic import BERTopic
-from datasets import load_dataset
 from flair.models import SequenceTagger
 from flair.tokenization import SegtokSentenceSplitter
 from keybert import KeyBERT
+from sklearn.datasets import fetch_20newsgroups

 import tests.utils as utils
 from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer

@@ -172,9 +172,8 @@ def test_online_vectorizer():

 def test_bertopic():
-    data = load_dataset("ag_news")
-    texts = data['train']['text']
-    texts = texts[:100]
+    data = fetch_20newsgroups(subset='train')
+    texts = data.data[:100]
     topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
     topics, probs = topic_model.fit_transform(documents=texts)
     new_topics = topic_model.reduce_outliers(texts, topics)
     topic_model.update_topics(texts, topics=new_topics)

From 8ddff25e973dcc27060629f23303ac7b3043d440 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 14:40:03 +0200
Subject: [PATCH 13/15] v0.0.12 release

Signed-off-by: Tim Schopf
---
 keyphrase_vectorizers/keyphrase_count_vectorizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index d570391..01503ea 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -505,4 +505,4 @@ def _clean_bow(self) -> None:
         x = np.array(self.keyphrases)
         mask = np.full(len(self.keyphrases), True, dtype=bool)
         mask[indices] = False
-        self.keyphrases = list(x[~mask])
+        self.keyphrases = list(x[~mask])
\ No newline at end of file

From b44e9feccc54a0bbf5a1f710f44a682774f90ed3 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 15:14:54 +0200
Subject: [PATCH 14/15] update comments

Signed-off-by: Tim Schopf
---
 .../keyphrase_vectorizer_mixin.py |  2 +-
 tests/utils.py                    | 31 ++++---------------
 2 files changed, 7 insertions(+), 26 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
index 134aa7f..0c89805 100644
--- a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
+++ b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -428,7 +428,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
         else:
             pos_tuples = custom_pos_tagger(raw_documents=document_list)

-        # get the original documents after they were processed by spaCy
+        # get the original documents after they were processed by a tokenizer and a POS tagger
         processed_docs = []
         for tup in pos_tuples:
             token = tup[0]
diff --git a/tests/utils.py b/tests/utils.py
index 02b8fc3..7981c3f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -58,31 +58,12 @@ def get_english_test_keyphrases():


 def get_english_first_doc_test_keyphrases():
-    sorted_english_first_doc_test_keyphrases = ['algorithm',
-                                                'class labels',
-                                                'example',
-                                                'function',
-                                                'inductive bias',
-                                                'input',
-                                                'input object',
-                                                'machine',
-                                                'new examples',
-                                                'optimal scenario',
-                                                'output',
-                                                'output pairs',
-                                                'output value',
-                                                'pair',
-                                                'set',
-                                                'supervised learning',
-                                                'supervised learning algorithm',
-                                                'supervisory signal',
-                                                'task',
-                                                'training data',
-                                                'training examples',
-                                                'unseen instances',
-                                                'unseen situations',
-                                                'vector',
-                                                'way']
+    sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
+                                                'input', 'input object', 'machine', 'new examples', 'optimal scenario',
+                                                'output', 'output pairs', 'output value', 'pair', 'set',
+                                                'supervised learning', 'supervised learning algorithm',
+                                                'supervisory signal', 'task', 'training data', 'training examples',
+                                                'unseen instances', 'unseen situations', 'vector', 'way']

     return sorted_english_first_doc_test_keyphrases


From 0ca23676b32e566e16fa049a09376b6e56a75596 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 15:18:09 +0200
Subject: [PATCH 15/15] update comments

Signed-off-by: Tim Schopf
---
 keyphrase_vectorizers/keyphrase_count_vectorizer.py | 2 +-
 keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index 01503ea..e8444b2 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -39,7 +39,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     must be customized accordingly. Additionally, the ``pos_pattern`` parameter has to be customized as the
     `spaCy part-of-speech tags`_ differ between languages. Without customizing, the words will be tagged with
     wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.

     Parameters
     ----------
diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
index 596bc0b..14335ca 100644
--- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -37,7 +37,7 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
     must be customized accordingly. Additionally, the ``pos_pattern`` parameter has to be customized as the
     `spaCy part-of-speech tags`_ differ between languages. Without customizing, the words will be tagged with
     wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.

     Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common
     term weighting scheme in information retrieval,