diff --git a/docs/conf.py b/docs/conf.py index 22ffbd0194..3f5d95cb04 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -5,6 +5,7 @@ # -- Project information ----------------------------------------------------- from sphinx_github_style import get_linkcode_resolve +from torch.nn import Module version = "0.14.0" release = "0.14.0" @@ -100,7 +101,7 @@ def linkcode_resolve(*args): html_show_sphinx = False # Napoleon settings -napoleon_include_init_with_doc = True +napoleon_include_init_with_doc = False napoleon_include_private_with_doc = False autodoc_default_options = { diff --git a/docs/tutorial/tutorial-training/how-model-training-works.md b/docs/tutorial/tutorial-training/how-model-training-works.md index 9241213c98..a4b2392a7c 100644 --- a/docs/tutorial/tutorial-training/how-model-training-works.md +++ b/docs/tutorial/tutorial-training/how-model-training-works.md @@ -279,16 +279,10 @@ print(sentence.to_tagged_string()) If the model works well, it will correctly tag 'love' as a verb in this example. -## Summary +## Next -This tutorial gave you a general overview of the main steps to train a model: +Congrats, you now have a general overview of the main steps to train a model in Flair! -- load a corpus -- choose a label type -- create a label dictionary -- choose embeddings -- initialize model -- initialize trainer -- train +Next, learn about the [two main training approaches in Flair](train-vs-fine-tune.md). diff --git a/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md index 1e7fadb0f1..4bd71d4340 100644 --- a/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md +++ b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md @@ -159,3 +159,6 @@ example we chose `label_type='topic'` to denote that we are loading a corpus wit +## Next + +Next, learn [how to train a sequence tagger](how-to-train-sequence-tagger.md). 
diff --git a/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md index f5cd654ed7..b53aeef917 100644 --- a/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md +++ b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md @@ -193,3 +193,7 @@ The following datasets are supported: | Universal Dependency Treebanks | [flair.datasets.treebanks](#flair.datasets.treebanks) | | OCR-Layout-NER | [flair.datasets.ocr](#flair.datasets.ocr) | + +## Next + +Next, learn how to load a [custom dataset](how-to-load-custom-dataset.md). \ No newline at end of file diff --git a/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md index fc9bc492b1..0f79022b70 100644 --- a/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md +++ b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md @@ -223,3 +223,6 @@ trainer.train('resources/taggers/example-universal-pos', This gives you a multilingual model. Try experimenting with more languages! +## Next + +Next, learn [how to train a text classifier](how-to-train-text-classifier.md). diff --git a/docs/tutorial/tutorial-training/how-to-train-text-classifier.md b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md index 265689c21f..b88084ff00 100644 --- a/docs/tutorial/tutorial-training/how-to-train-text-classifier.md +++ b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md @@ -58,3 +58,7 @@ classifier.predict(sentence) print(sentence.labels) ``` + +## Next + +Next, learn [how to train an entity linker](how-to-train-span-classifier.md). 
\ No newline at end of file diff --git a/docs/tutorial/tutorial-training/train-vs-fine-tune.md b/docs/tutorial/tutorial-training/train-vs-fine-tune.md index fd45e90ea0..657a3a0aa8 100644 --- a/docs/tutorial/tutorial-training/train-vs-fine-tune.md +++ b/docs/tutorial/tutorial-training/train-vs-fine-tune.md @@ -1,11 +1,50 @@ # Training vs fine-tuning There are two broad ways you train a model: The "classic" approach and the fine-tuning approach. This section -explains the differences, and the things you need to do. +explains the differences. ## Fine-Tuning +Fine-tuning is the current state-of-the-art approach. The main idea is that you take a pre-trained language model that +consists of (hundreds of) millions of trained parameters. To this language model you add a simple prediction head with +randomly initialized weights. + +Since in this case, the vast majority of parameters in the model is already trained, you only need to "fine-tune" this +model. This means: Very small learning rate (LR) and just a few epochs. You are essentially just minimally modifying +the model to adapt it to the task you want to solve. + +Use this method by calling [`ModelTrainer.fine_tune()`](#flair.trainers.ModelTrainer.fine_tune). +Since most models in Flair were trained this way, this is likely the approach you'll want to use. + ## Training +On the other hand, you should use the classic training approach if the majority of the trainable parameters in your +model is randomly initialized. This can happen for instance if you freeze the model weights of the pre-trained language +model, leaving only the randomly initialized prediction head as trainable parameters. This training approach is also +referred to as "feature-based" or "probing" in some papers. + +Since the majority of parameters is randomly initialized, you need to fully train the model. This means: high learning +rate and many epochs. + +Use this method by calling [`ModelTrainer.train()`](#flair.trainers.ModelTrainer.train). 
+ +```{note} +Another application of classic training is for linear probing of pre-trained language models. In this scenario, you +"freeze" the weights of the language model (meaning that they cannot be changed) and add a prediction head that is +trained from scratch. So, even though a language model is involved, its parameters are not trainable. This means that +all trainable parameters in this scenario are randomly initialized, therefore necessitating the use of the classic +training approach. +``` + + +## Paper + +If you are interested in an experimental comparison of the two above-mentioned approaches, check out [our paper](https://arxiv.org/pdf/2011.06993) +that compares fine-tuning to the feature-based approach. + + +## Next + +Next, learn how to load a [training dataset](how-to-load-prepared-dataset.md). \ No newline at end of file diff --git a/flair/data.py b/flair/data.py index f5d902f0e6..7ee32f40b9 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1372,6 +1372,14 @@ def unlabeled_identifier(self) -> str: class Corpus(typing.Generic[T_co]): + """The main object in Flair for holding a dataset used for training and testing. + + A corpus consists of three splits: A `train` split used for training, a `dev` split used for model selection + and/or early stopping and a `test` split used for testing. All three splits are optional, so it is possible + to create a corpus only using one or two splits. If the option `sample_missing_splits` is set to True, + missing splits will be randomly sampled from the training split. + """ + def __init__( self, train: Optional[Dataset[T_co]] = None, @@ -1381,6 +1389,26 @@ def __init__( sample_missing_splits: Union[bool, str] = True, random_seed: Optional[int] = None, ) -> None: + """ + Constructor method to initialize a :class:`Corpus`. You can define the train, dev and test split + by passing the corresponding Dataset object to the constructor. At least one split should be defined. 
+ If the option `sample_missing_splits` is set to True, missing splits will be randomly sampled from the + train split. + + In most cases, you will not use the constructor yourself. Rather, you will create a corpus using one of our + helper methods that read common NLP filetypes. For instance, you can use + :class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into + a :class:`Corpus`. + + Args: + train: The split you use for model training. + dev: A holdout split typically used for model selection or early stopping. + test: The final test data to compute the score of the model. + name: A name that identifies the corpus. + sample_missing_splits: If set to True, missing splits are sampled from train. If set to False, + missing splits are not sampled and left empty. Default: True. + random_seed: Set a random seed to control the sampling of missing splits. + """ # set name self.name: str = name @@ -1419,14 +1447,17 @@ def __init__( @property def train(self) -> Optional[Dataset[T_co]]: + """The training split as a :class:`torch.utils.data.Dataset` object.""" return self._train @property def dev(self) -> Optional[Dataset[T_co]]: + """The dev split as a :class:`torch.utils.data.Dataset` object.""" return self._dev @property def test(self) -> Optional[Dataset[T_co]]: + """The test split as a :class:`torch.utils.data.Dataset` object.""" return self._test def downsample( @@ -1443,12 +1474,12 @@ def downsample( data points. It additionally returns a pointer to itself for use in method chaining. Args: - percentage (float): A float value between 0. and 1. that indicates to which percentage the corpus + percentage: A float value between 0. and 1. that indicates to which percentage the corpus should be downsampled. Default value is 0.1, meaning it gets downsampled to 10%. - downsample_train (bool): Whether or not to include the training split in downsampling. Default is True. 
- downsample_dev (bool): Whether or not to include the dev split in downsampling. Default is True. - downsample_test (bool): Whether or not to include the test split in downsampling. Default is True. - random_seed (int): An optional random seed to make downsampling reproducible. + downsample_train: Whether or not to include the training split in downsampling. Default is True. + downsample_dev: Whether or not to include the dev split in downsampling. Default is True. + downsample_test: Whether or not to include the test split in downsampling. Default is True. + random_seed: An optional random seed to make downsampling reproducible. Returns: A pointer to itself for optional use in method chaining. @@ -1580,9 +1611,17 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float, random_seed: return splits[0] def obtain_statistics(self, label_type: Optional[str] = None, pretty_print: bool = True) -> Union[dict, str]: - """Print statistics about the class distribution and sentence sizes. + """Print statistics about the corpus, including the length of the sentences and the labels in the corpus. - only labels of sentences are taken into account + Args: + label_type: Optionally set this value to obtain statistics only for one specific type of label (such + as "ner" or "pos"). If not set, statistics for all labels will be returned. + pretty_print: If set to True, returns pretty json (indented for readability). If not, the json is + returned as a single line. Default: True. + + Returns: + If pretty_print is True, returns a pretty print formatted string in json format. Otherwise, returns a + dictionary holding a json. """ json_data = { "TRAIN": self._obtain_statistics_for(self.train, "TRAIN", label_type), @@ -1654,7 +1693,21 @@ def make_label_dictionary( ) -> Dictionary: """Creates a dictionary of all labels assigned to the sentences in the corpus. 
- :return: dictionary of labels + Args: + label_type: The name of the label type for which the dictionary should be created. Some corpora have + multiple layers of annotation, such as "pos" and "ner". In this case, you should choose the label type + you are interested in. + min_count: Optionally set this to exclude rare labels from the dictionary (i.e., labels seen fewer + than the provided integer value). + add_unk: Optionally set this to True to include a "UNK" value in the dictionary. In most cases, this + is not needed since the label dictionary is well-defined, but some use cases might have open classes + and require this. + add_dev_test: Optionally set this to True to construct the label dictionary not only from the train + split, but also from dev and test. This is only necessary if some labels never appear in train but do + appear in one of the other splits. + + Returns: + A Dictionary of all unique labels in the corpus. """ if min_count > 0 and not add_unk: add_unk = True @@ -1833,6 +1886,13 @@ def add_label_noise( ) def get_label_distribution(self): + """Counts occurrences of each label in the corpus and returns them as a dictionary object. + + This allows you to get an idea of which label appears how often in the Corpus. + + Returns: + Dictionary with labels as keys and their occurrences as values. + """ class_to_count = defaultdict(lambda: 0) for sent in self.train: for label in sent.labels: @@ -1840,6 +1900,11 @@ def get_label_distribution(self): return class_to_count def get_all_sentences(self) -> ConcatDataset: + """Returns all sentences (spanning all three splits) in the :class:`Corpus`. + + Returns: + A :class:`torch.utils.data.Dataset` object that includes all sentences of this corpus. 
+ """ parts = [] if self.train: parts.append(self.train) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 68e8e95b57..7bc9ec051b 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -22,6 +22,17 @@ class SequenceTagger(flair.nn.Classifier[Sentence]): + """The SequenceTagger is one of two main architectures in Flair used for sequence tagging. + + Sequence tagging means classifying words in a sentence, for instance for part-of-speech tagging or named entity + recognition. The SequenceTagger implements the "classic" model based on the LSTM-CRF architecture: words are first + embedded using one or multiple :class:`flair.embeddings.TokenEmbeddings`, these embeddings are then passed to the + LSTM. Its hidden states for each input word are used to make the final prediction with a softmax classifier. + For decoding, the SequenceTagger by default uses a CRF approach. + + Alternatively, you can use the class :class:`flair.models.TokenClassifier` for sequence tagging without an LSTM-CRF. + """ + def __init__( self, embeddings: TokenEmbeddings, @@ -44,9 +55,7 @@ def __init__( init_from_state_dict: bool = False, allow_unk_predictions: bool = False, ) -> None: - """Sequence Tagger class for predicting labels for single tokens. Can be parameterized by several attributes. - - In case of multitask learning, pass shared embeddings or shared rnn into respective attributes. + """Constructor for this class. Args: embeddings: Embeddings to use during training and prediction @@ -268,6 +277,16 @@ def RNN( return RNN def forward_loss(self, sentences: list[Sentence]) -> tuple[torch.Tensor, int]: + """Conducts a forward pass through the SequenceTagger using labeled sentences and returns the loss. + + Args: + sentences: A batch of labeled sentences. + + Returns: + A tuple consisting of the loss tensor and the number of tokens in the batch. 
+ + """ + # if there are no sentences, there is no loss if len(sentences) == 0: return torch.tensor(0.0, dtype=torch.float, device=flair.device, requires_grad=True), 0 @@ -291,7 +310,7 @@ def _prepare_tensors(self, data_points: Union[list[Sentence], Sentence]) -> tupl return sentence_tensor, lengths def forward(self, sentence_tensor: torch.Tensor, lengths: torch.LongTensor): - """Forward propagation through network. + """Forward pass through the SequenceTagger. Args: sentence_tensor: A tensor representing the batch of sentences. @@ -439,7 +458,11 @@ def predict( embedding_storage_mode="none", force_token_predictions: bool = False, ): - """Predicts labels for current batch with CRF or Softmax. + """Call this method to predict labels for sentences. + + Predictions are directly added to the Sentence objects that are passed to this method. This means that + the predict() method does not return predictions. Rather, predictions are stored at each sentence and can + be retrieved by calling :func:`flair.data.Sentence.get_labels()` on each :class:`flair.data.Sentence`. Args: sentences: List of sentences in batch