diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py index d093f13..46c7942 100644 --- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py +++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py @@ -67,9 +67,9 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator): A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging. Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed. - custom_pos_tagger: callable - A callable function that that gets a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. - If this parameter is not None, the custom tagger function is used to tag words with Parts-of-Speech, while the spaCy pipeline is ignored. + custom_pos_tagger: callable, default=None + A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. + If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. max_df : int, default=None During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold. diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py index 44677ec..b17efb7 100644 --- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py +++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py @@ -95,10 +95,9 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer): A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging. Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed. 
-custom_pos_tagger: callable - A callable function that that gets a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. - If this parameter is not None, the custom tagger function is used to tag words with Parts-of-Speech, while the spaCy pipeline is ignored. - + custom_pos_tagger: callable, default=None + A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. + If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. max_df : int, default=None During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold. diff --git a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py index e121483..563f836 100644 --- a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py +++ b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py @@ -206,9 +206,9 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging. Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed. - custom_pos_tagger: callable - A callable function that that gets a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. - If this parameter is not None, the custom tagger function is used to tag words with Parts-of-Speech, while the spaCy pipeline is ignored. + custom_pos_tagger: callable + A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. + If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. 
lowercase : bool, default=True Whether the returned keyphrases should be converted to lowercase. diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py index 75f4d3e..0011b91 100644 --- a/tests/test_vectorizers.py +++ b/tests/test_vectorizers.py @@ -83,10 +83,16 @@ def test_custom_tagger(): tagger = SequenceTagger.load('pos') splitter = SegtokSentenceSplitter() + # define custom pos tagger function using flair def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger = tagger, splitter: flair.tokenization.SegtokSentenceSplitter = splitter) -> List[tuple]: + """ + Important: - # split sentences in docs + The mandatory 'raw_documents' parameter can NOT be named differently and has to expect a list of strings. + Furthermore, the function has to return a list of (word token, POS-tag) tuples. + """ + # split texts into sentences sentences = [] for doc in raw_documents: sentences.extend(splitter.split(doc)) @@ -94,17 +100,14 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag # predict POS tags tagger.predict(sentences) + # iterate through sentences to get word tokens and predicted POS-tags pos_tags = [] words = [] - # iterate through sentences and print predicted labels for sentence in sentences: - tagger.predict(sentence) - pos_tags.extend([label.value for label in sentence.get_labels('pos')]) words.extend([word.text for word in sentence]) - flair_tags = list(zip(words, pos_tags)) - return flair_tags + return list(zip(words, pos_tags)) vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger) vectorizer.fit(english_docs)