minor documentation updates for 'custom_pos_tagger' feature

Signed-off-by: Tim Schopf <tim.schopf@t-online.de>
TimSchopf · Jun 19, 2022 · aed6d55 · aed6d55
1 parent 4dda3bf
commit aed6d55
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 16 deletions.
diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -67,9 +67,9 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
             A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging.
             Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed.
 
-    custom_pos_tagger: callable
-            A callable function that that gets a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
-            If this parameter is not None, the custom tagger function is used to tag words with Parts-of-Speech, while the spaCy pipeline is ignored.
+    custom_pos_tagger : callable, default=None
+            A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
+            If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
 
     max_df : int, default=None
         During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold.

diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -95,10 +95,9 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
             A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging.
             Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed.
 
-custom_pos_tagger: callable
-            A callable function that that gets a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
-            If this parameter is not None, the custom tagger function is used to tag words with Parts-of-Speech, while the spaCy pipeline is ignored.
-
+    custom_pos_tagger : callable, default=None
+            A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
+            If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
 
     max_df : int, default=None
         During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold.

diff --git a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -206,9 +206,9 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
             A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging.
             Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed.
 
-        custom_pos_tagger: callable
-            A callable function that that gets a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
-            If this parameter is not None, the custom tagger function is used to tag words with Parts-of-Speech, while the spaCy pipeline is ignored.
+        custom_pos_tagger : callable
+            A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
+            If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
 
         lowercase : bool, default=True
             Whether the returned keyphrases should be converted to lowercase.

diff --git a/tests/test_vectorizers.py b/tests/test_vectorizers.py
@@ -83,28 +83,31 @@ def test_custom_tagger():
     tagger = SequenceTagger.load('pos')
     splitter = SegtokSentenceSplitter()
 
+    # define custom pos tagger function using flair
     def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger = tagger,
                           splitter: flair.tokenization.SegtokSentenceSplitter = splitter) -> List[tuple]:
+        """
+        Important:
 
-        # split sentences in docs
+        The mandatory 'raw_documents' parameter can NOT be named differently and has to expect a list of strings.
+        Furthermore, the function has to return a list of (word token, POS-tag) tuples.
+        """
+        # split texts into sentences
         sentences = []
         for doc in raw_documents:
             sentences.extend(splitter.split(doc))
 
         # predict POS tags
         tagger.predict(sentences)
 
+        # iterate through sentences to get word tokens and predicted POS-tags
         pos_tags = []
         words = []
-        # iterate through sentences and print predicted labels
         for sentence in sentences:
-            tagger.predict(sentence)
-
             pos_tags.extend([label.value for label in sentence.get_labels('pos')])
             words.extend([word.text for word in sentence])
 
-        flair_tags = list(zip(words, pos_tags))
-        return flair_tags
+        return list(zip(words, pos_tags))
 
     vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger)
     vectorizer.fit(english_docs)