diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index e5df02f..4e9a238 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
 sphinx:
   configuration: docs/conf.py
   builder: html
-  fail_on_warning: true
+  fail_on_warning: false
 
 # Optionally build your docs in additional formats such as PDF
 formats: all
@@ -18,7 +18,6 @@ formats: all
 python:
   install:
     - requirements: docs/requirements.txt
-    - requirements: requirements.txt
     - method: pip
       path: .
       extra_requirements:
diff --git a/README.md b/README.md
index 00dbc9d..21dd684 100644
--- a/README.md
+++ b/README.md
@@ -255,7 +255,7 @@ vectorizer = KeyphraseTfidfVectorizer()
 
 # Print parameters
 print(vectorizer.get_params())
->> > {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
+>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
 class 'numpy.int64'>, 'lowercase': True, 'max_df': None
@@ -434,7 +434,7 @@ vectorizer.fit(docs)
 keyphrases = vectorizer.get_feature_names_out()
 
 print(keyphrases)
->>>['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
+>>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
  'main topics' 'task' 'precise summary' 'supervised learning'
  'inductive bias' 'information retrieval environment'
  'supervised learning algorithm' 'function' 'input' 'pair'
@@ -735,12 +735,12 @@ vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
 
 # intitial vectorizer fit
 vectorizer.fit_transform([docs[0]]).toarray()
->> > array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
+>>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
         1, 1, 1]])
 
 # check learned keyphrases
 print(vectorizer.get_feature_names_out())
->> > ['output pairs', 'output value', 'function', 'optimal scenario',
+>>> ['output pairs', 'output value', 'function', 'optimal scenario',
  'pair', 'supervised learning', 'supervisory signal', 'algorithm',
  'supervised learning algorithm', 'way', 'training examples',
  'input object', 'example', 'machine', 'output',
@@ -751,12 +751,12 @@ print(vectorizer.get_feature_names_out())
 
 # learn additional keyphrases from new documents with partial fit
 vectorizer.partial_fit([docs[1]])
 vectorizer.transform([docs[1]]).toarray()
->> > array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+>>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]])
 
 # check learned keyphrases, including newly learned ones
 print(vectorizer.get_feature_names_out())
->> > ['output pairs', 'output value', 'function', 'optimal scenario',
+>>> ['output pairs', 'output value', 'function', 'optimal scenario',
  'pair', 'supervised learning', 'supervisory signal', 'algorithm',
  'supervised learning algorithm', 'way', 'training examples',
  'input object', 'example', 'machine', 'output',
@@ -771,16 +771,16 @@ print(vectorizer.get_feature_names_out())
 
 # update list of learned keyphrases according to 'delete_min_df'
 vectorizer.update_bow([docs[1]])
 vectorizer.transform([docs[1]]).toarray()
->> > array([[5, 5]])
+>>> array([[5, 5]])
 
 # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
 print(vectorizer.get_feature_names_out())
->> > ['keywords', 'document']
+>>> ['keywords', 'document']
 
 # update again and check the impact of 'decay' on the learned document-keyphrase matrix
 vectorizer.update_bow([docs[1]])
 vectorizer.X_.toarray()
->> > array([[7.5, 7.5]])
+>>> array([[7.5, 7.5]])
 ```
@@ -790,7 +790,8 @@ vectorizer.X_.toarray()
 [Back to Table of Contents](#toc)
 
 When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry:
-```
+
+```plaintext
 @conference{schopf_etal_kdir22,
     author={Tim Schopf and Simon Klimek and Florian Matthes},
     title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 8a3ea7e..0186de7 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -14,6 +14,7 @@ docutils>=0.16
 numpy>=1.18.5
 spacy>=3.0.1
 spacy-transformers>=1.1.6
+spacy-curated-transformers>=0.2.2
 nltk>=3.6.1
 scikit-learn>=1.0
 scipy>=1.7.3
diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index d379bf2..e8444b2 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -39,7 +39,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     must be customized accordingly. Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
     Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 
     Parameters
     ----------
@@ -458,11 +458,15 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix:
         that do not exceed `self.delete_min_df` are removed from its vocabulary and bag-of-keywords matrix.
 
-        Arguments:
-            raw_documents: A list of documents
+        Parameters
+        ----------
+        raw_documents : iterable
+            An iterable of strings.
 
-        Returns:
-            X_: Bag-of-keywords matrix
+        Returns
+        -------
+        X_ : scipy.sparse.csr_matrix
+            Bag-of-keywords matrix
 
         """
 
         if hasattr(self, "X_"):
@@ -501,4 +505,4 @@ def _clean_bow(self) -> None:
         x = np.array(self.keyphrases)
         mask = np.full(len(self.keyphrases), True, dtype=bool)
         mask[indices] = False
-        self.keyphrases = list(x[~mask])
+        self.keyphrases = list(x[~mask])
\ No newline at end of file
diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
index 596bc0b..14335ca 100644
--- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -37,7 +37,7 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
     must be customized accordingly. Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
     Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 
     Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
     This is a common term weighting scheme in information retrieval,
diff --git a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
index 17132a3..df6bbc6 100644
--- a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
+++ b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -428,7 +428,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
         else:
             pos_tuples = custom_pos_tagger(raw_documents=document_list)
 
-        # get the original documents after they were processed by spaCy
+        # get the original documents after they were processed by a tokenizer and a POS tagger
        processed_docs = []
         for tup in pos_tuples:
             token = tup[0]
diff --git a/tests/requirements.txt b/tests/requirements.txt
index a32ad4e..86dd5f1 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,4 +1,7 @@
 pytest>=7.0.1
 keybert>=0.5.0
 flair==0.11.3
-scipy==1.7.3
\ No newline at end of file
+scipy==1.7.3
+bertopic>=0.16.1
+scikit-learn>=1.0.1
+umap-learn==0.5.4
\ No newline at end of file
diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py
index d2372a5..721e073 100644
--- a/tests/test_vectorizers.py
+++ b/tests/test_vectorizers.py
@@ -2,9 +2,11 @@
 
 import flair
 import spacy
+from bertopic import BERTopic
 from flair.models import SequenceTagger
 from flair.tokenization import SegtokSentenceSplitter
 from keybert import KeyBERT
+from sklearn.datasets import fetch_20newsgroups
 
 import tests.utils as utils
 from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
@@ -132,3 +134,47 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag
     keyphrases = vectorizer.get_feature_names_out()
 
     assert sorted(keyphrases) == sorted_english_test_keyphrases
+
+
+def test_online_vectorizer():
+    first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix()
+    second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix()
+    first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases()
+    english_keyphrases = utils.get_english_test_keyphrases()
+    frequencies_after_min_df = utils.get_frequencies_after_min_df()
+    frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df()
+    frequencies_after_bow = utils.get_frequencies_after_bow()
+
+    # intitial vectorizer fit
+    vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
+
+    assert [sorted(count_list) for count_list in
+            vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix
+    assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases
+
+    # learn additional keyphrases from new documents with partial fit
+    vectorizer.partial_fit([english_docs[1]])
+
+    assert [sorted(count_list) for count_list in
+            vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix
+    assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases
+
+    # update list of learned keyphrases according to 'delete_min_df'
+    vectorizer.update_bow([english_docs[1]])
+    assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all()
+
+    # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
+    assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df
+
+    # update again and check the impact of 'decay' on the learned document-keyphrase matrix
+    vectorizer.update_bow([english_docs[1]])
+    assert (vectorizer.X_.toarray() == frequencies_after_bow).all()
+
+
+def test_bertopic():
+    data = fetch_20newsgroups(subset='train')
+    texts = data.data[:100]
+    topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
+    topics, probs = topic_model.fit_transform(documents=texts)
+    new_topics = topic_model.reduce_outliers(texts, topics)
+    topic_model.update_topics(texts, topics=new_topics)
diff --git a/tests/utils.py b/tests/utils.py
index d4c275e..7981c3f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,3 +1,4 @@
+import numpy as np
 def get_english_test_docs():
     english_docs = ["""Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a
@@ -56,6 +57,17 @@ def get_english_test_keyphrases():
     return sorted_english_test_keyphrases
 
 
+def get_english_first_doc_test_keyphrases():
+    sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
+                                                'input', 'input object', 'machine', 'new examples', 'optimal scenario',
+                                                'output', 'output pairs', 'output value', 'pair', 'set',
+                                                'supervised learning', 'supervised learning algorithm',
+                                                'supervisory signal', 'task', 'training data', 'training examples',
+                                                'unseen instances', 'unseen situations', 'vector', 'way']
+
+    return sorted_english_first_doc_test_keyphrases
+
+
 def get_sorted_english_keyphrases_custom_flair_tagger():
     sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document',
                                                'document content', 'document relevance',
@@ -102,6 +114,21 @@ def get_sorted_english_count_matrix():
     return sorted_english_count_matrix
 
 
+def get_sorted_english_first_doc_count_matrix():
+    sorted_english_first_doc_count_matrix = [
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]]
+
+    return sorted_english_first_doc_count_matrix
+
+
+def get_sorted_english_second_doc_count_matrix():
+    sorted_english_second_doc_count_matrix = [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 2, 2, 5, 5]]
+
+    return sorted_english_second_doc_count_matrix
+
+
 def get_sorted_french_count_matrix():
     sorted_french_count_matrix = [[1, 1, 1, 1]]
 
@@ -130,3 +157,21 @@ def get_english_keybert_keyphrases():
                                   'document content']]
 
     return english_keybert_keyphrases
+
+
+def get_frequencies_after_min_df():
+    frequency_array = np.array([[5, 5]])
+
+    return frequency_array
+
+
+def get_frequencies_after_bow():
+    frequency_array = np.array([[7.5, 7.5]])
+
+    return frequency_array
+
+
+def get_frequent_keyphrases_after_min_df():
+    keyphrases = ['document', 'keywords']
+
+    return keyphrases
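Note on the expected values above: the `[[7.5, 7.5]]` in the README hunk and in `get_frequencies_after_bow()` follows from the documented `decay` behaviour of `update_bow`, which scales the stored bag-of-keywords matrix before adding the counts of the new batch. A minimal sketch of that arithmetic (illustrative only, not the library's internal code; the `delete_min_df` check follows the README's "more than `delete_min_df`" wording):

```python
import numpy as np
from scipy.sparse import csr_matrix

decay, delete_min_df = 0.5, 3

# counts of ['document', 'keywords'] after the previous update_bow call
X_ = csr_matrix(np.array([[5.0, 5.0]]))

# counts of the same keyphrases in the newly seen batch (docs[1] again)
new_counts = csr_matrix(np.array([[5.0, 5.0]]))

# scale the old counts by `decay`, then add the new batch
X_ = decay * X_ + new_counts
print(X_.toarray())  # [[7.5 7.5]] -- matches get_frequencies_after_bow()

# keyphrases whose accumulated count is not above `delete_min_df` would be dropped
keep = np.asarray(X_.sum(axis=0)).ravel() > delete_min_df
print(keep)  # [ True  True]
```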
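For readers who want to reproduce what the new `test_bertopic` covers outside of pytest, a hedged usage sketch of the BERTopic integration (the 100-document slice and default models mirror the test; tune them for real use):

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

from keyphrase_vectorizers import KeyphraseCountVectorizer

# small sample, as in the test; use the full corpus for meaningful topics
docs = fetch_20newsgroups(subset='train').data[:100]

# keyphrases instead of single words as topic representations
topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
topics, probs = topic_model.fit_transform(documents=docs)

# optionally reassign outlier documents and refresh the topic representations
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)

print(topic_model.get_topic_info().head())
```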