From 8ec413a6cc931cacabb718e22ea79fb4b90cf24f Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 09:21:06 +0200 Subject: [PATCH 01/15] fix docs Signed-off-by: Tim Schopf --- keyphrase_vectorizers/keyphrase_count_vectorizer.py | 13 +++++++++---- keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py index d379bf2..e39fa29 100644 --- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py +++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py @@ -178,6 +178,7 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm' self.decay = decay self.delete_min_df = delete_min_df self.running_fit_transform = False + self.X_ = None def fit(self, raw_documents: List[str]) -> object: """ @@ -458,11 +459,15 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix: that do not exceed `self.delete_min_df` are removed from its vocabulary and bag-of-keywords matrix. - Arguments: - raw_documents: A list of documents + Parameters + ---------- + raw_documents : iterable + An iterable of strings. - Returns: - X_: Bag-of-keywords matrix + Returns + ------- + X_ : scipy.sparse.csr_matrix + Bag-of-keywords matrix """ if hasattr(self, "X_"): diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py index 596bc0b..ab553a4 100644 --- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py +++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py @@ -185,6 +185,7 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm' self.use_idf = use_idf self.smooth_idf = smooth_idf self.sublinear_tf = sublinear_tf + self.X_ = None self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, smooth_idf=self.smooth_idf, sublinear_tf=self.sublinear_tf) From 2173e4f6179508fabd162ab55d54ef6c34e29455 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 09:56:31 +0200 Subject: [PATCH 02/15] fix docs Signed-off-by: Tim Schopf --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00dbc9d..d8ad4b8 100644 --- a/README.md +++ b/README.md @@ -434,7 +434,7 @@ vectorizer.fit(docs) keyphrases = vectorizer.get_feature_names_out() print(keyphrases) ->>>['output value' 'information retrieval' 'algorithm' 'vector' 'groups' +>> > ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' 'main topics' 'task' 'precise summary' 'supervised learning' 'inductive bias' 'information retrieval environment' 'supervised learning algorithm' 'function' 'input' 'pair' @@ -790,7 +790,8 @@ vectorizer.X_.toarray() [Back to Table of Contents](#toc) When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry: -``` + +```plaintext @conference{schopf_etal_kdir22, author={Tim Schopf and Simon Klimek and Florian Matthes}, title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction}, From be1e841b25c27ef24b446061ff54f538e6cdd5c5 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 09:58:35 +0200 Subject: [PATCH 03/15] fix docs Signed-off-by: Tim Schopf --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d8ad4b8..21dd684 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,7 @@ vectorizer = 
KeyphraseTfidfVectorizer() # Print parameters print(vectorizer.get_params()) ->> > {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < +>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < class 'numpy.int64'>, 'lowercase': True, 'max_df': None @@ -434,7 +434,7 @@ vectorizer.fit(docs) keyphrases = vectorizer.get_feature_names_out() print(keyphrases) ->> > ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' +>>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' 'main topics' 'task' 'precise summary' 'supervised learning' 'inductive bias' 'information retrieval environment' 'supervised learning algorithm' 'function' 'input' 'pair' @@ -735,12 +735,12 @@ vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3) # intitial vectorizer fit vectorizer.fit_transform([docs[0]]).toarray() ->> > array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, +>>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1]]) # check learned keyphrases print(vectorizer.get_feature_names_out()) ->> > ['output pairs', 'output value', 'function', 'optimal scenario', +>>> ['output pairs', 'output value', 'function', 'optimal scenario', 'pair', 'supervised learning', 'supervisory signal', 'algorithm', 'supervised learning algorithm', 'way', 'training examples', 'input object', 'example', 'machine', 'output', @@ -751,12 +751,12 @@ print(vectorizer.get_feature_names_out()) # learn additional keyphrases from new documents with partial fit vectorizer.partial_fit([docs[1]]) vectorizer.transform([docs[1]]).toarray() ->> > array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +>>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]]) # check learned keyphrases, including newly learned ones print(vectorizer.get_feature_names_out()) ->> > ['output pairs', 'output value', 'function', 'optimal scenario', +>>> ['output pairs', 'output value', 'function', 'optimal scenario', 'pair', 'supervised learning', 'supervisory signal', 'algorithm', 'supervised learning algorithm', 'way', 'training examples', 'input object', 'example', 'machine', 'output', @@ -771,16 +771,16 @@ print(vectorizer.get_feature_names_out()) # update list of learned keyphrases according to 'delete_min_df' vectorizer.update_bow([docs[1]]) vectorizer.transform([docs[1]]).toarray() ->> > array([[5, 5]]) +>>> array([[5, 5]]) # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain) print(vectorizer.get_feature_names_out()) ->> > ['keywords', 'document'] +>>> ['keywords', 'document'] # update again and check the impact of 'decay' on the learned document-keyphrase matrix vectorizer.update_bow([docs[1]]) vectorizer.X_.toarray() ->> > array([[7.5, 7.5]]) +>>> array([[7.5, 7.5]]) ``` From 5fc495bddf8e6bda8b1c0d1c15dc00d0e2a7024a Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 10:50:21 +0200 Subject: [PATCH 04/15] fix docs Signed-off-by: Tim Schopf --- .readthedocs.yaml | 3 +-- README.md | 14 +++++++------- docs/requirements.txt | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e5df02f..8b07701 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,7 +18,6 @@ formats: all python: install: - requirements: docs/requirements.txt - - requirements: requirements.txt - method: 
pip path: . extra_requirements: @@ -27,7 +26,7 @@ python: build: os: ubuntu-22.04 tools: - python: "3.7" + python: "3.8" submodules: include: all diff --git a/README.md b/README.md index 21dd684..3dd9aae 100644 --- a/README.md +++ b/README.md @@ -127,9 +127,12 @@ vectorizer = KeyphraseCountVectorizer() # Print parameters print(vectorizer.get_params()) +``` +```plaintext >>> {'binary': False, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} ``` + By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the English spaCy part-of-speech tags. In addition, the spaCy pipeline @@ -255,14 +258,11 @@ vectorizer = KeyphraseTfidfVectorizer() # Print parameters print(vectorizer.get_params()) ->>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < - - -class 'numpy.int64'>, 'lowercase': True, 'max_df': None - -, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner', - 'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} ``` +```plaintext +>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner','textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} +``` + To calculate tf values instead, set `use_idf=False`. diff --git a/docs/requirements.txt b/docs/requirements.txt index 8a3ea7e..0186de7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -14,6 +14,7 @@ docutils>=0.16 numpy>=1.18.5 spacy>=3.0.1 spacy-transformers>=1.1.6 +spacy-curated-transformers>=0.2.2 nltk>=3.6.1 scikit-learn>=1.0 scipy>=1.7.3 From d1257dcf0fbace28f9ff561f0fe2191422a0d6e3 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 10:55:53 +0200 Subject: [PATCH 05/15] fix docs Signed-off-by: Tim Schopf --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index df165fc..4667242 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,7 +35,7 @@ # ones. 
 extensions = [
     'sphinx_rtd_theme',
-    'recommonmark',
+    'myst_parser',
     'sphinx.ext.autodoc',
     'sphinx.ext.napoleon',
     'sphinx_markdown_tables',
@@ -64,7 +64,7 @@
 master_doc = 'index'

 source_parsers = {
-    '.md': 'recommonmark.parser.CommonMarkParser',
+    '.md': 'myst_parser.parser.MystParser',
 }

 source_suffix = ['.rst', '.md']

From a5e0f99dd9cb95be06ffa1f570f16f139cbd6409 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 12:04:26 +0200
Subject: [PATCH 06/15] fix docs

Signed-off-by: Tim Schopf
---
 README.md    | 14 +++++++-------
 docs/conf.py |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 3dd9aae..21dd684 100644
--- a/README.md
+++ b/README.md
@@ -127,12 +127,9 @@
 vectorizer = KeyphraseCountVectorizer()

 # Print parameters
 print(vectorizer.get_params())
-```
-```plaintext
 >>> {'binary': False, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
 ```
-
 By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is
 specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives,
 followed by 1 or more nouns using the English spaCy part-of-speech tags. In addition, the spaCy pipeline
@@ -258,11 +255,14 @@
 vectorizer = KeyphraseTfidfVectorizer()

 # Print parameters
 print(vectorizer.get_params())
-```
-```plaintext
->>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner','textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
-```
+>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
+
+class 'numpy.int64'>, 'lowercase': True, 'max_df': None
+
+, 'min_df': None, 'pos_pattern': '*+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner',
+ 'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
+```

 To calculate tf values instead, set `use_idf=False`.

diff --git a/docs/conf.py b/docs/conf.py
index 4667242..df165fc 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -35,7 +35,7 @@
 # ones.
extensions = [ 'sphinx_rtd_theme', - 'myst_parser', + 'recommonmark', 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx_markdown_tables', @@ -64,7 +64,7 @@ master_doc = 'index' source_parsers = { - '.md': 'myst_parser.parser.MystParser', + '.md': 'recommonmark.parser.CommonMarkParser', } source_suffix = ['.rst', '.md'] From e01123a11b9b68f2a784b5cf97c8d02b01a839e4 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 12:07:35 +0200 Subject: [PATCH 07/15] fix docs Signed-off-by: Tim Schopf --- .readthedocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8b07701..8aac750 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ version: 2 sphinx: configuration: docs/conf.py builder: html - fail_on_warning: true + fail_on_warning: false # Optionally build your docs in additional formats such as PDF formats: all From eede5fea3e10ccf010127d9b5036dcb695877189 Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 13:00:52 +0200 Subject: [PATCH 08/15] add tests Signed-off-by: Tim Schopf --- .readthedocs.yaml | 2 +- tests/requirements.txt | 4 ++- tests/test_vectorizers.py | 47 ++++++++++++++++++++++++++++ tests/utils.py | 64 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8aac750..4e9a238 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -26,7 +26,7 @@ python: build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.7" submodules: include: all diff --git a/tests/requirements.txt b/tests/requirements.txt index a32ad4e..8c11f57 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,4 +1,6 @@ pytest>=7.0.1 keybert>=0.5.0 flair==0.11.3 -scipy==1.7.3 \ No newline at end of file +scipy==1.7.3 +bertopic>=0.16.1 +datasets==2.13.2 \ No newline at end of file diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py index d2372a5..2be060d 100644 --- a/tests/test_vectorizers.py +++ b/tests/test_vectorizers.py @@ -2,6 +2,8 @@ import flair import spacy +from bertopic import BERTopic +from datasets import load_dataset from flair.models import SequenceTagger from flair.tokenization import SegtokSentenceSplitter from keybert import KeyBERT @@ -132,3 +134,48 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag keyphrases = vectorizer.get_feature_names_out() assert sorted(keyphrases) == sorted_english_test_keyphrases + + +def test_online_vectorizer(): + first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix() + second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix() + first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases() + english_keyphrases = utils.get_english_test_keyphrases() + frequencies_after_min_df = utils.get_frequencies_after_min_df() + frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df() + frequencies_after_bow = utils.get_frequencies_after_bow() + + # intitial vectorizer fit + vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3) + + assert [sorted(count_list) for count_list in + vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix + assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases + + # learn additional keyphrases from new documents with partial fit + vectorizer.partial_fit([english_docs[1]]) + + assert [sorted(count_list) for count_list in + 
vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix + assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases + + # update list of learned keyphrases according to 'delete_min_df' + vectorizer.update_bow([english_docs[1]]) + assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all() + + # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain) + assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df + + # update again and check the impact of 'decay' on the learned document-keyphrase matrix + vectorizer.update_bow([english_docs[1]]) + assert (vectorizer.X_.toarray() == frequencies_after_bow).all() + + +def test_bertopic(): + data = load_dataset("ag_news") + texts = data['train']['text'] + texts = texts[:100] + topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer()) + topics, probs = topic_model.fit_transform(documents=texts) + new_topics = topic_model.reduce_outliers(texts, topics) + topic_model.update_topics(texts, topics=new_topics) diff --git a/tests/utils.py b/tests/utils.py index d4c275e..02b8fc3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +import numpy as np def get_english_test_docs(): english_docs = ["""Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a @@ -56,6 +57,36 @@ def get_english_test_keyphrases(): return sorted_english_test_keyphrases +def get_english_first_doc_test_keyphrases(): + sorted_english_first_doc_test_keyphrases = ['algorithm', + 'class labels', + 'example', + 'function', + 'inductive bias', + 'input', + 'input object', + 'machine', + 'new examples', + 'optimal scenario', + 'output', + 'output pairs', + 'output value', + 'pair', + 'set', + 'supervised learning', + 'supervised learning algorithm', + 'supervisory signal', + 'task', + 'training data', + 'training examples', + 'unseen instances', + 'unseen situations', + 'vector', + 'way'] + + return sorted_english_first_doc_test_keyphrases + + def get_sorted_english_keyphrases_custom_flair_tagger(): sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content', 'document relevance', @@ -102,6 +133,21 @@ def get_sorted_english_count_matrix(): return sorted_english_count_matrix +def get_sorted_english_first_doc_count_matrix(): + sorted_english_first_doc_count_matrix = [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]] + + return sorted_english_first_doc_count_matrix + + +def get_sorted_english_second_doc_count_matrix(): + sorted_english_second_doc_count_matrix = [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 5, 5]] + + return sorted_english_second_doc_count_matrix + + def get_sorted_french_count_matrix(): sorted_french_count_matrix = [[1, 1, 1, 1]] @@ -130,3 +176,21 @@ def get_english_keybert_keyphrases(): 'document content']] return english_keybert_keyphrases + + +def get_frequencies_after_min_df(): + frequency_array = np.array([[5, 5]]) + + return frequency_array + + +def get_frequencies_after_bow(): + frequency_array = np.array([[7.5, 7.5]]) + + return frequency_array + + +def get_frequent_keyphrases_after_min_df(): + keyphrases = ['document', 'keywords'] + + return keyphrases From 175caa68dcaeaae371ed022ba7db6bbea6c6896e Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 
13:07:02 +0200
Subject: [PATCH 09/15] update test requirements.txt

Signed-off-by: Tim Schopf
---
 tests/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 8c11f57..4f8397b 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -3,4 +3,5 @@ keybert>=0.5.0
 flair==0.11.3
 scipy==1.7.3
 bertopic>=0.16.1
-datasets==2.13.2
\ No newline at end of file
+datasets==2.13.2
+importlib-metadata==6.7.0
\ No newline at end of file

From 38fc6e2e3c4e312e29bb6251a7c351f0fcd6e2c0 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 13:33:03 +0200
Subject: [PATCH 10/15] update tests/requirements.txt

Signed-off-by: Tim Schopf
---
 tests/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 4f8397b..654e66c 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -4,4 +4,4 @@ flair==0.11.3
 scipy==1.7.3
 bertopic>=0.16.1
 datasets==2.13.2
-importlib-metadata==6.7.0
\ No newline at end of file
+umap-learn==0.5.4
\ No newline at end of file

From b1e1ff4019b5a2c1c28a80c974f73f92a4ddc4fa Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 14:12:59 +0200
Subject: [PATCH 11/15] fix online vectorizers bug

Signed-off-by: Tim Schopf
---
 keyphrase_vectorizers/keyphrase_count_vectorizer.py | 1 -
 keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index e39fa29..d570391 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -178,7 +178,6 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.decay = decay
         self.delete_min_df = delete_min_df
         self.running_fit_transform = False
-        self.X_ = None

     def fit(self, raw_documents: List[str]) -> object:
         """
diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
index ab553a4..596bc0b 100644
--- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -185,7 +185,6 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.use_idf = use_idf
         self.smooth_idf = smooth_idf
         self.sublinear_tf = sublinear_tf
-        self.X_ = None

         self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, smooth_idf=self.smooth_idf,
                                        sublinear_tf=self.sublinear_tf)

From c3bc1230e2673f2b4e3f2b4f10c0c4a6d5a07972 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 14:31:05 +0200
Subject: [PATCH 12/15] update BERTopic test

Signed-off-by: Tim Schopf
---
 tests/requirements.txt    | 2 +-
 tests/test_vectorizers.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 654e66c..86dd5f1 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -3,5 +3,5 @@ keybert>=0.5.0
 flair==0.11.3
 scipy==1.7.3
 bertopic>=0.16.1
-datasets==2.13.2
+scikit-learn>=1.0.1
 umap-learn==0.5.4
\ No newline at end of file
diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py
index 2be060d..721e073 100644
--- a/tests/test_vectorizers.py
+++ b/tests/test_vectorizers.py
@@ -3,10 +3,10 @@ import flair
 import spacy
 from bertopic import BERTopic
-from datasets import load_dataset
 from flair.models import SequenceTagger
 from flair.tokenization import SegtokSentenceSplitter
 from keybert import KeyBERT
+from sklearn.datasets import fetch_20newsgroups

 import tests.utils as utils
 from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer

@@ -172,9 +172,8 @@ def test_online_vectorizer():

 def test_bertopic():
-    data = load_dataset("ag_news")
-    texts = data['train']['text']
-    texts = texts[:100]
+    data = fetch_20newsgroups(subset='train')
+    texts = data.data[:100]
     topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
     topics, probs = topic_model.fit_transform(documents=texts)
     new_topics = topic_model.reduce_outliers(texts, topics)
     topic_model.update_topics(texts, topics=new_topics)

From 8ddff25e973dcc27060629f23303ac7b3043d440 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 14:40:03 +0200
Subject: [PATCH 13/15] v0.0.12 release

Signed-off-by: Tim Schopf
---
 keyphrase_vectorizers/keyphrase_count_vectorizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index d570391..01503ea 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -505,4 +505,4 @@ def _clean_bow(self) -> None:
         x = np.array(self.keyphrases)
         mask = np.full(len(self.keyphrases), True, dtype=bool)
         mask[indices] = False
-        self.keyphrases = list(x[~mask])
+        self.keyphrases = list(x[~mask])
\ No newline at end of file

From b44e9feccc54a0bbf5a1f710f44a682774f90ed3 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 15:14:54 +0200
Subject: [PATCH 14/15] update comments

Signed-off-by: Tim Schopf
---
 .../keyphrase_vectorizer_mixin.py |  2 +-
 tests/utils.py                    | 31 ++++---------------
 2 files changed, 7 insertions(+), 26 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
index 134aa7f..0c89805 100644
--- a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
+++ b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -428,7 +428,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
         else:
             pos_tuples = custom_pos_tagger(raw_documents=document_list)

-        # get the original documents after they were processed by spaCy
+        # get the original documents after they were processed by a tokenizer and a POS tagger
         processed_docs = []
         for tup in pos_tuples:
             token = tup[0]
diff --git a/tests/utils.py b/tests/utils.py
index 02b8fc3..7981c3f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -58,31 +58,12 @@ def get_english_test_keyphrases():


 def get_english_first_doc_test_keyphrases():
-    sorted_english_first_doc_test_keyphrases = ['algorithm',
-                                                'class labels',
-                                                'example',
-                                                'function',
-                                                'inductive bias',
-                                                'input',
-                                                'input object',
-                                                'machine',
-                                                'new examples',
-                                                'optimal scenario',
-                                                'output',
-                                                'output pairs',
-                                                'output value',
-                                                'pair',
-                                                'set',
-                                                'supervised learning',
-                                                'supervised learning algorithm',
-                                                'supervisory signal',
-                                                'task',
-                                                'training data',
-                                                'training examples',
-                                                'unseen instances',
-                                                'unseen situations',
-                                                'vector',
-                                                'way']
+    sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
+                                                'input', 'input object', 'machine', 'new examples', 'optimal scenario',
+                                                'output', 'output pairs', 'output value', 'pair', 'set',
+                                                'supervised learning', 'supervised learning algorithm',
+                                                'supervisory signal', 'task', 'training data', 'training examples',
+                                                'unseen instances', 'unseen situations', 'vector', 'way']

     return sorted_english_first_doc_test_keyphrases


From 0ca23676b32e566e16fa049a09376b6e56a75596 Mon Sep 17 00:00:00 2001
From: Tim Schopf
Date: Mon, 29 Apr 2024 15:18:09 +0200
Subject: [PATCH 15/15] update comments

Signed-off-by: Tim Schopf
---
 keyphrase_vectorizers/keyphrase_count_vectorizer.py | 2 +-
 keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index 01503ea..e8444b2 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -39,7 +39,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     must be customized accordingly. Additionally, the ``pos_pattern`` parameter has to be customized as the
     `spaCy part-of-speech tags`_ differ between languages. Without customizing, the words will be tagged with
     wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.

     Parameters
     ----------
diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
index 596bc0b..14335ca 100644
--- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -37,7 +37,7 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
     must be customized accordingly. Additionally, the ``pos_pattern`` parameter has to be customized as the
     `spaCy part-of-speech tags`_ differ between languages. Without customizing, the words will be tagged with
     wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.

     Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common
     term weighting scheme in information retrieval,