Add more docstrings to support upcoming release #3580

Status: Open
Wants to merge 20 commits into base: master
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -5,6 +5,7 @@

# -- Project information -----------------------------------------------------
from sphinx_github_style import get_linkcode_resolve
from torch.nn import Module

version = "0.14.0"
release = "0.14.0"
@@ -100,7 +101,7 @@ def linkcode_resolve(*args):
html_show_sphinx = False

# Napoleon settings
napoleon_include_init_with_doc = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False

autodoc_default_options = {
12 changes: 3 additions & 9 deletions docs/tutorial/tutorial-training/how-model-training-works.md
@@ -279,16 +279,10 @@ print(sentence.to_tagged_string())

If the model works well, it will correctly tag 'love' as a verb in this example.

## Summary
## Next

This tutorial gave you a general overview of the main steps to train a model:
Congrats, you now have a general overview of the main steps to train a model in Flair!

- load a corpus
- choose a label type
- create a label dictionary
- choose embeddings
- initialize model
- initialize trainer
- train
Next, learn about the [two main training approaches in Flair](train-vs-fine-tune.md).


3 changes: 3 additions & 0 deletions docs/tutorial/tutorial-training/how-to-load-custom-dataset.md
@@ -159,3 +159,6 @@ example we chose `label_type='topic'` to denote that we are loading a corpus with



## Next

Next, learn [how to train a sequence tagger](how-to-train-sequence-tagger.md).
docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md
@@ -193,3 +193,7 @@ The following datasets are supported:
| Universal Dependency Treebanks | [flair.datasets.treebanks](#flair.datasets.treebanks) |
| OCR-Layout-NER | [flair.datasets.ocr](#flair.datasets.ocr) |
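
Loading any of these prepared datasets is a one-liner; for example, the English Universal Dependencies treebank (downloaded automatically on first use):

```python
import flair.datasets

# downloads the data on first use and returns a Corpus with train, dev and test splits
corpus = flair.datasets.UD_ENGLISH()
print(corpus)
```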


## Next

Next, learn how to load a [custom dataset](how-to-load-custom-dataset.md).
docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md
@@ -223,3 +223,6 @@ trainer.train('resources/taggers/example-universal-pos',
This gives you a multilingual model. Try experimenting with more languages!
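
One way to experiment with more languages is to combine several treebanks into a single `MultiCorpus` (the treebanks below are just examples):

```python
from flair.data import MultiCorpus
from flair.datasets import UD_ENGLISH, UD_FRENCH, UD_GERMAN

# combine several treebanks into one corpus for multilingual training
multi_corpus = MultiCorpus([UD_ENGLISH(), UD_FRENCH(), UD_GERMAN()])
print(multi_corpus)
```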


## Next

Next, learn [how to train a text classifier](how-to-train-text-classifier.md).
docs/tutorial/tutorial-training/how-to-train-text-classifier.md
@@ -58,3 +58,7 @@ classifier.predict(sentence)
print(sentence.labels)
```


## Next

Next, learn [how to train an entity linker](how-to-train-span-classifier.md).
41 changes: 40 additions & 1 deletion docs/tutorial/tutorial-training/train-vs-fine-tune.md
@@ -1,11 +1,50 @@
# Training vs fine-tuning

There are two broad ways to train a model: the "classic" approach and the fine-tuning approach. This section
explains the differences, and the things you need to do.
explains the differences.


## Fine-Tuning

Fine-tuning is the current state-of-the-art approach. The main idea is that you take a pre-trained language model that
consists of (hundreds of) millions of trained parameters. To this language model you add a simple prediction head with
randomly initialized weights.

Since in this case the vast majority of parameters in the model are already trained, you only need to "fine-tune" this
model. This means: a very small learning rate (LR) and just a few epochs. You are essentially only minimally modifying
the model to adapt it to the task you want to solve.

Use this method by calling [`ModelTrainer.fine_tune()`](#flair.trainers.ModelTrainer.fine_tune).
Since most models in Flair were trained this way, this is likely the approach you'll want to use.
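
As an illustration, a minimal fine-tuning sketch could look like the following (corpus, transformer model and hyperparameters are examples, not prescriptions):

```python
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# load a corpus and build the label dictionary
corpus = TREC_6()
label_dict = corpus.make_label_dictionary(label_type="question_class")

# a pre-trained transformer with fine-tuning enabled
embeddings = TransformerDocumentEmbeddings("distilbert-base-uncased", fine_tune=True)

# a simple prediction head (randomly initialized) on top of the language model
classifier = TextClassifier(embeddings, label_dictionary=label_dict, label_type="question_class")

# fine-tune: very small learning rate, just a few epochs
trainer = ModelTrainer(classifier, corpus)
trainer.fine_tune(
    "resources/taggers/question-classifier",  # placeholder output path
    learning_rate=5.0e-5,
    mini_batch_size=16,
    max_epochs=3,
)
```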


## Training

On the other hand, you should use the classic training approach if the majority of the trainable parameters in your
model are randomly initialized. This can happen, for instance, if you freeze the model weights of the pre-trained language
model, leaving only the randomly initialized prediction head as trainable parameters. This training approach is also
referred to as "feature-based" or "probing" in some papers.

Since the majority of parameters are randomly initialized, you need to fully train the model. This means: a high
learning rate and many epochs.

Use this method by calling [`ModelTrainer.train()`](#flair.trainers.ModelTrainer.train).
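
For example, a classic training run with frozen, non-transformer word embeddings might look roughly like this sketch (corpus, embeddings and hyperparameters are illustrative):

```python
from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# load a corpus and build the label dictionary for the annotation layer we want
corpus = UD_ENGLISH()
label_dict = corpus.make_label_dictionary(label_type="upos")

# classic (static) word embeddings: most trainable parameters live in the tagger itself
embeddings = WordEmbeddings("glove")

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="upos",
)

# classic training: higher learning rate, many epochs
trainer = ModelTrainer(tagger, corpus)
trainer.train(
    "resources/taggers/example-upos",  # placeholder output path
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
)
```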

```{note}
Another application of classic training is for linear probing of pre-trained language models. In this scenario, you
"freeze" the weights of the language model (meaning that they cannot be changed) and add a prediction head that is
trained from scratch. So, even though a language model is involved, its parameters are not trainable. This means that
all trainable parameters in this scenario are randomly initialized, therefore necessitating the use of the classic
training approach.
```
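
A linear-probing setup could be sketched like this: freeze the transformer by passing `fine_tune=False` and then train the model with `ModelTrainer.train()` (the model name is just an example):

```python
from flair.embeddings import TransformerWordEmbeddings

# fine_tune=False freezes the language model weights; only the prediction head
# added on top of these embeddings is trained from scratch
frozen_embeddings = TransformerWordEmbeddings("bert-base-uncased", fine_tune=False)
```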


## Paper

If you are interested in an experimental comparison of the two above-mentioned approaches, check out [our paper](https://arxiv.org/pdf/2011.06993)
that compares fine-tuning to the feature-based approach.


## Next

Next, learn how to load a [training dataset](how-to-load-prepared-dataset.md).
81 changes: 73 additions & 8 deletions flair/data.py
@@ -1372,6 +1372,14 @@ def unlabeled_identifier(self) -> str:


class Corpus(typing.Generic[T_co]):
"""The main object in Flair for holding a dataset used for training and testing.

A corpus consists of three splits: a `train` split used for training, a `dev` split used for model selection
and/or early stopping, and a `test` split used for testing. All three splits are optional, so it is possible
to create a corpus using only one or two splits. If the option `sample_missing_splits` is set to True,
missing splits will be randomly sampled from the training split.
"""

def __init__(
self,
train: Optional[Dataset[T_co]] = None,
@@ -1381,6 +1389,26 @@ def __init__(
sample_missing_splits: Union[bool, str] = True,
random_seed: Optional[int] = None,
) -> None:
"""
Constructor method to initialize a :class:`Corpus`. You can define the train, dev and test split
by passing the corresponding Dataset object to the constructor. At least one split should be defined.
If the option `sample_missing_splits` is set to True, missing splits will be randomly sampled from the
train split.

In most cases, you will not use the constructor yourself. Rather, you will create a corpus using one of our
helper methods that read common NLP filetypes. For instance, you can use
:class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into
a :class:`Corpus`.

Args:
train: The split you use for model training.
dev: A holdout split typically used for model selection or early stopping.
test: The final test data to compute the score of the model.
name: A name that identifies the corpus.
sample_missing_splits: If set to True, missing splits are sampled from train. If set to False,
missing splits are not sampled and left empty. Default: True.
random_seed: Set a random seed to control the sampling of missing splits.
"""
# set name
self.name: str = name

@@ -1419,14 +1447,17 @@ def __init__(

@property
def train(self) -> Optional[Dataset[T_co]]:
"""The training split as a :class:`torch.utils.data.Dataset` object."""
return self._train

@property
def dev(self) -> Optional[Dataset[T_co]]:
"""The dev split as a :class:`torch.utils.data.Dataset` object."""
return self._dev

@property
def test(self) -> Optional[Dataset[T_co]]:
"""The test split as a :class:`torch.utils.data.Dataset` object."""
return self._test

def downsample(
@@ -1443,12 +1474,12 @@ def downsample(
data points. It additionally returns a pointer to itself for use in method chaining.

Args:
percentage (float): A float value between 0. and 1. that indicates to which percentage the corpus
percentage: A float value between 0. and 1. that indicates to which percentage the corpus
should be downsampled. Default value is 0.1, meaning it gets downsampled to 10%.
downsample_train (bool): Whether or not to include the training split in downsampling. Default is True.
downsample_dev (bool): Whether or not to include the dev split in downsampling. Default is True.
downsample_test (bool): Whether or not to include the test split in downsampling. Default is True.
random_seed (int): An optional random seed to make downsampling reproducible.
downsample_train: Whether or not to include the training split in downsampling. Default is True.
downsample_dev: Whether or not to include the dev split in downsampling. Default is True.
downsample_test: Whether or not to include the test split in downsampling. Default is True.
random_seed: An optional random seed to make downsampling reproducible.

Returns:
A pointer to itself for optional use in method chaining.
@@ -1580,9 +1611,17 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float, random_seed:
return splits[0]

def obtain_statistics(self, label_type: Optional[str] = None, pretty_print: bool = True) -> Union[dict, str]:
"""Print statistics about the class distribution and sentence sizes.
"""Print statistics about the corpus, including the length of the sentences and the labels in the corpus.

only labels of sentences are taken into account
Args:
label_type: Optionally set this value to obtain statistics only for one specific type of label (such
as "ner" or "pos"). If not set, statistics for all labels will be returned.
pretty_print: If set to True, returns pretty-printed JSON (indented for readability). If not, the JSON is
returned as a single line. Default: True.

Returns:
If pretty_print is True, returns the statistics as a pretty-printed string in JSON format. Otherwise, returns
them as a dictionary.
"""
json_data = {
"TRAIN": self._obtain_statistics_for(self.train, "TRAIN", label_type),
@@ -1654,7 +1693,21 @@ def make_label_dictionary(
) -> Dictionary:
"""Creates a dictionary of all labels assigned to the sentences in the corpus.

:return: dictionary of labels
Args:
label_type: The name of the label type for which the dictionary should be created. Some corpora have
multiple layers of annotation, such as "pos" and "ner". In this case, you should choose the label type
you are interested in.
min_count: Optionally set this to exclude rare labels from the dictionary (i.e., labels seen fewer
times than the provided integer value).
add_unk: Optionally set this to True to include a "UNK" value in the dictionary. In most cases, this
is not needed since the label dictionary is well-defined, but some use cases might have open classes
and require this.
add_dev_test: Optionally set this to True to construct the label dictionary not only from the train
split, but also from dev and test. This is only necessary if some labels never appear in train but do
appear in one of the other splits.

Returns:
A Dictionary of all unique labels in the corpus.
"""
if min_count > 0 and not add_unk:
add_unk = True
@@ -1833,13 +1886,25 @@ def add_label_noise(
)

def get_label_distribution(self):
"""Counts occurrences of each label in the corpus and returns them as a dictionary object.

This allows you to get an idea of which label appears how often in the Corpus.

Returns:
Dictionary with labels as keys and their occurrences as values.
"""
class_to_count = defaultdict(lambda: 0)
for sent in self.train:
for label in sent.labels:
class_to_count[label.value] += 1
return class_to_count

def get_all_sentences(self) -> ConcatDataset:
"""Returns all sentences (spanning all three splits) in the :class:`Corpus`.

Returns:
A :class:`torch.utils.data.Dataset` object that includes all sentences of this corpus.
"""
parts = []
if self.train:
parts.append(self.train)
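
The `Corpus` API documented above can be illustrated with a short usage sketch (the dataset and label type are examples; any corpus behaves the same way):

```python
from flair.datasets import TREC_6

# load a prepared corpus with train, dev and test splits
corpus = TREC_6()

# downsample to 10% for a quick experiment (returns the corpus itself, so calls can be chained)
corpus.downsample(0.1)

# build the label dictionary for the annotation layer of interest
label_dict = corpus.make_label_dictionary(label_type="question_class")

# print statistics about sentence lengths and label distribution
print(corpus.obtain_statistics(label_type="question_class"))
```
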
33 changes: 28 additions & 5 deletions flair/models/sequence_tagger_model.py
@@ -22,6 +22,17 @@


class SequenceTagger(flair.nn.Classifier[Sentence]):
"""The SequenceTagger is one of two main architectures in Flair used for sequence tagging.

Sequence tagging means classifying words in a sentence, for instance for part-of-speech tagging or named entity
recognition. The SequenceTagger implements the "classic" model based on the LSTM-CRF architecture: words are first
embedded using one or more :class:`flair.embeddings.TokenEmbeddings`, and these embeddings are then passed to an
LSTM. Its hidden states for each input word are used to make the final prediction with a softmax classifier.
For decoding, the SequenceTagger uses a CRF by default.

Alternatively, you can use the class :class:`flair.models.TokenClassifier` for sequence tagging without an LSTM-CRF.
"""

def __init__(
self,
embeddings: TokenEmbeddings,
@@ -44,9 +55,7 @@ def __init__(
init_from_state_dict: bool = False,
allow_unk_predictions: bool = False,
) -> None:
"""Sequence Tagger class for predicting labels for single tokens. Can be parameterized by several attributes.

In case of multitask learning, pass shared embeddings or shared rnn into respective attributes.
"""Constructor for this class.

Args:
embeddings: Embeddings to use during training and prediction
@@ -268,6 +277,16 @@ def RNN(
return RNN

def forward_loss(self, sentences: list[Sentence]) -> tuple[torch.Tensor, int]:
"""Conducts a forward pass through the SequenceTagger using labeled sentences and return the loss.

Args:
sentences: A batch of labeled sentences.

Returns:
A tuple consisting of the loss tensor and the number of tokens in the batch.

"""

# if there are no sentences, there is no loss
if len(sentences) == 0:
return torch.tensor(0.0, dtype=torch.float, device=flair.device, requires_grad=True), 0
@@ -291,7 +310,7 @@ def _prepare_tensors(self, data_points: Union[list[Sentence], Sentence]) -> tupl
return sentence_tensor, lengths

def forward(self, sentence_tensor: torch.Tensor, lengths: torch.LongTensor):
"""Forward propagation through network.
"""Forward pass through the SequenceTagger.

Args:
sentence_tensor: A tensor representing the batch of sentences.
@@ -439,7 +458,11 @@ def predict(
embedding_storage_mode="none",
force_token_predictions: bool = False,
):
"""Predicts labels for current batch with CRF or Softmax.
"""Call this method to predict labels for sentences.

Predictions are directly added to the Sentence objects that are passed to this method. This means that
the predict() method does not return predictions. Rather, predictions are stored on each sentence and can
be retrieved by calling :func:`flair.data.Sentence.get_labels()` on each :class:`flair.data.Sentence`.

Args:
sentences: List of sentences in batch
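
The documented `predict()` behavior, where predictions are attached to the `Sentence` rather than returned, can be illustrated with a short sketch (the model name is an example):

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# load a pre-trained NER tagger and predict labels for a sentence
tagger = SequenceTagger.load("flair/ner-english-fast")
sentence = Sentence("George Washington went to Washington.")
tagger.predict(sentence)

# predict() returns nothing; the labels are stored on the sentence itself
for label in sentence.get_labels("ner"):
    print(label)
```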