Lot of documentation

baal-org · Jun 4, 2024 · 7f42fc1 · 7f42fc1
1 parent 65a939a
commit 7f42fc1
Show file tree

Hide file tree

Showing 61 changed files with 3,188 additions and 273,546 deletions.
diff --git a/README.md b/README.md
@@ -91,7 +91,7 @@ The framework consists of four main parts, as demonstrated in the flowchart belo
 - ActiveLearningLoop
 
 <p align="center">
-  <img src="docs/research/literature/images/Baalscheme.svg">
+  <img src="docs/learn/literature/images/Baalscheme.svg">
 </p>
 
 To get started, wrap your dataset in our _[**ActiveLearningDataset**](baal/active/dataset.py)_ class. This will ensure
@@ -114,19 +114,19 @@ In conclusion, your script should be similar to this:
 dataset = ActiveLearningDataset(your_dataset)
 dataset.label_randomly(INITIAL_POOL)  # label some data
 model = MCDropoutModule(your_model)
-model = ModelWrapper(model, args=TrainingArgs(...))
-active_loop = ActiveLearningLoop(dataset,
-                                 get_probabilities=model.predict_on_dataset,
-                                 heuristic=heuristics.BALD(),
-                                 iterations=20, # Number of MC sampling.
-                                 query_size=QUERY_SIZE)  # Number of item to label.
-for al_step in range(N_ALSTEP):
-    model.train_on_dataset(dataset)
-    metrics = model.test_on_dataset(test_dataset)
-    # Label the next most uncertain items.
-    if not active_loop.step():
-        # We're done!
-        break
+wrapper = ModelWrapper(model, args=TrainingArgs(...))
+experiment = ActiveLearningExperiment(
+    trainer=wrapper, # Huggingface or ModelWrapper to train
+    al_dataset=dataset, # Active learning dataset
+    eval_dataset=test_dataset, # Evaluation Dataset
+    heuristic=BALD(), # Uncertainty heuristic to use
+    query_size=100, # How many items to label per round.
+    iterations=20, # How many MC sampling to perform per item.
+    pool_size=None, # Optionally limit the size of the unlabelled pool.
+    criterion=None # Stopping criterion for the experiment.
+)
+# The experiment will run until all items are labelled.
+metrics = experiment.start()
 ```
 
 For a complete experiment, see _[experiments/vgg_mcdropout_cifar10.py](experiments/vgg_mcdropout_cifar10.py)_ .

diff --git a/baal/active/dataset/pytorch_dataset.py b/baal/active/dataset/pytorch_dataset.py
@@ -188,12 +188,12 @@ def label(self, index: Union[list, int], value: Optional[Any] = None) -> None:
             elif self.can_label and val is None:
                 raise ValueError(
                     """The dataset is able to label data, but no label was provided.
-                                 If this is a research setting, please set the
+                                 If this is a learn setting, please set the
                                   `ActiveLearningDataset.can_label` to `False`.
                                   """
                 )
             else:
-                # Regular research usecase.
+                # Regular research use case.
                 self.labelled_map[idx] = active_step
                 if val is not None:
                     warnings.warn(

diff --git a/baal/bayesian/consistent_dropout.py b/baal/bayesian/consistent_dropout.py
@@ -12,7 +12,7 @@
 
 class ConsistentDropout(_DropoutNd):
     """
-    ConsistentDropout is useful when doing research.
+    ConsistentDropout is useful when doing research.
     It guarantees that while the masks are the same between batches
     during inference. The masks are different inside the batch.
 
@@ -59,7 +59,7 @@ def eval(self):
 
 class ConsistentDropout2d(_DropoutNd):
     """
-    ConsistentDropout is useful when doing research.
+    ConsistentDropout is useful when doing research.
     It guarantees that while the mask are the same between batches,
     they are different inside the batch.
 

diff --git a/baal/experiments/base.py b/baal/experiments/base.py
@@ -1,8 +1,9 @@
 import itertools
-from typing import Union, Optional, TYPE_CHECKING
+from typing import Union, Optional
 
-import pandas as pd
+import numpy as np
 import structlog
+from torch.utils.data import Subset
 from tqdm import tqdm
 
 from baal import ModelWrapper, ActiveLearningDataset
@@ -14,27 +15,48 @@
 
 try:
     import transformers
-
-    TRANSFORMERS_AVAILABLE = True
-except ImportError:
     from baal.transformers_trainer_wrapper import BaalTransformersTrainer
     from baal.experiments.transformers import TransformersAdapter
 
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    BaalTransformersTrainer = None
+    TransformersAdapter = None
     TRANSFORMERS_AVAILABLE = False
 
 log = structlog.get_logger(__name__)
 
 
 class ActiveLearningExperiment:
+    """Experiment manager for Baal.
+
+    Takes care of:
+        1. Train the model on the labelled set.
+        2. Evaluate the model on the evaluation set.
+        3. Predict on the unlabelled examples.
+        4. Label the most uncertain examples.
+        5. Stop the experiment if finished.
+
+    Args:
+        trainer: Huggingface or ModelWrapper to train
+        al_dataset: Active learning dataset
+        eval_dataset: Evaluation Dataset
+        heuristic: Uncertainty heuristic to use
+        query_size: How many items to label per round.
+        iterations: How many MC sampling to perform per item.
+        pool_size: Optionally limit the size of the unlabelled pool.
+        criterion: Stopping criterion for the experiment.
+    """
     def __init__(
-        self,
-        trainer: Union[ModelWrapper, "BaalTransformersTrainer"],
-        al_dataset: ActiveLearningDataset,
-        eval_dataset: Dataset,
-        heuristic: AbstractHeuristic,
-        query_size: int = 100,
-        iterations: int = 20,
-        criterion: Optional[StoppingCriterion] = None,
+            self,
+            trainer: Union[ModelWrapper, "BaalTransformersTrainer"],
+            al_dataset: ActiveLearningDataset,
+            eval_dataset: Dataset,
+            heuristic: AbstractHeuristic,
+            query_size: int = 100,
+            iterations: int = 20,
+            pool_size: Optional[int] = None,
+            criterion: Optional[StoppingCriterion] = None,
     ):
         self.al_dataset = al_dataset
         self.eval_dataset = eval_dataset
@@ -44,6 +66,7 @@ def __init__(
         self.criterion = criterion or LabellingBudgetStoppingCriterion(
             al_dataset, labelling_budget=al_dataset.n_unlabelled
         )
+        self.pool_size = pool_size
         self.adapter = self._get_adapter(trainer)
 
     def start(self):
@@ -55,8 +78,9 @@ def start(self):
             eval_metrics = self.adapter.evaluate(
                 self.eval_dataset, average_predictions=self.iterations
             )
+            pool = self._get_pool()
             ranks, uncertainty = self.heuristic.get_ranks(
-                self.adapter.predict(self.al_dataset.pool, iterations=self.iterations)
+                self.adapter.predict(pool, iterations=self.iterations)
             )
             self.al_dataset.label(ranks[: self.query_size])
             records.append({**train_metrics, **eval_metrics})
@@ -65,7 +89,7 @@ def start(self):
                 return records
 
     def _get_adapter(
-        self, trainer: Union[ModelWrapper, "BaalTransformersTrainer"]
+            self, trainer: Union[ModelWrapper, "BaalTransformersTrainer"]
     ) -> FrameworkAdapter:
         if isinstance(trainer, ModelWrapper):
             return ModelWrapperAdapter(trainer)
@@ -75,3 +99,10 @@ def _get_adapter(
             f"{type(trainer)} is not a supported trainer."
             " Baal supports ModelWrapper and BaalTransformersTrainer"
         )
+
+    def _get_pool(self):
+        """Return the unlabelled pool to predict on, optionally subsampled.
+
+        Returns:
+            The full pool when `pool_size` is None; otherwise a random `Subset`
+            holding at most `pool_size` pool items, drawn without replacement.
+        """
+        pool = self.al_dataset.pool
+        if self.pool_size is None:
+            return pool
+        # The pool shrinks as items get labelled; clamp the sample size because
+        # `np.random.choice(..., replace=False)` raises ValueError when asked
+        # for more items than the population holds.
+        n_samples = min(self.pool_size, len(pool))
+        indices = np.random.choice(len(pool), n_samples, replace=False)
+        # NOTE(review): ranks computed on this Subset are positions within the
+        # subset -- confirm the caller maps them back to pool indices before
+        # passing them to `al_dataset.label`.
+        return Subset(pool, indices)
diff --git a/baal/transformers_trainer_wrapper.py b/baal/transformers_trainer_wrapper.py
@@ -5,8 +5,8 @@
 import torch
 from numpy._typing import NDArray
 from torch import nn
-from tqdm import tqdm
 from transformers import PreTrainedModel, TrainingArguments
+from transformers.utils.logging import tqdm
 
 from baal.utils.warnings import raise_warnings_cache_replicated
 

diff --git a/docs/_static/images/diagram.png b/docs/_static/images/diagram.png
diff --git a/docs/_static/images/github-mark.svg b/docs/_static/images/github-mark.svg
diff --git a/docs/_static/images/help.png b/docs/_static/images/help.png
diff --git a/docs/_static/images/open-book_171322.png b/docs/_static/images/open-book_171322.png
diff --git a/docs/api/active_experiment.md b/docs/api/active_experiment.md
@@ -0,0 +1,39 @@
+# Active Learning Experiment
+
+In this module, we find all the utilities to do active learning.
+Baal takes care of the dataset split between labelled and unlabelled examples.
+It also takes care of the active learning loop:
+
+1. Train the model on the labelled set.
+2. Evaluate the model on the evaluation set.
+3. Predict on the unlabelled examples.
+4. Label the most uncertain examples.
+5. Stop the experiment if finished.
+
+### Example
+
+```python
+from baal.active.dataset import ActiveLearningDataset
+from baal.experiments.base import ActiveLearningExperiment
+al_dataset = ActiveLearningDataset(your_dataset)
+
+# To start, we can select 1000 random examples to be labelled
+al_dataset.label_randomly(1000)
+
+experiment = ActiveLearningExperiment(
+    trainer=..., # Huggingface or ModelWrapper to train
+    al_dataset=al_dataset, # Active learning dataset
+    eval_dataset=..., # Evaluation Dataset
+    heuristic=BALD(), # Uncertainty heuristic to use
+    query_size=100, # How many items to label per round.
+    iterations=20, # How many MC sampling to perform per item.
+    pool_size=None, # Optionally limit the size of the unlabelled pool.
+    criterion=None # Stopping criterion for the experiment.
+)
+experiment.start()
+```
+
+### API
+
+### baal.experiments.base.ActiveLearningExperiment
+::: baal.experiments.base.ActiveLearningExperiment
diff --git a/docs/api/dataset_management.md b/docs/api/dataset_management.md
@@ -1,16 +1,8 @@
-# Active learning functionality
+# Active Learning Dataset
 
-In this module, we find all the utilities to do active learning.
-
-1. Dataset management
-2. Active loop implementation
+In this module, we find all the utilities to manage data when performing active learning.
 
 Baal takes care of the dataset split between labelled and unlabelled examples.
-It also takes care of the active learning loop:
-
-1. Predict on the unlabelled examples.
-2. Label the most uncertain examples.
-
 ### Example
 
 ```python
@@ -39,8 +31,5 @@ assert al_dataset.pool.transform is None
 ### baal.active.ActiveLearningDataset
 ::: baal.active.ActiveLearningDataset
 
-### baal.active.ActiveLearningLoop
-::: baal.active.ActiveLearningLoop
-
 ### baal.active.FileDataset
 ::: baal.active.FileDataset
diff --git a/docs/api/index.md b/docs/api/index.md
@@ -1,7 +1,8 @@
 # API Reference
 
-### :material-file-tree: API Definition
+### :material-file-tree: Components
 
+* [baal.experiments.ActiveLearningExperiment](./active_experiment.md)
 * [baal.modelwrapper.ModelWrapper](./modelwrapper.md)
 * [baal.bayesian](./bayesian.md)
 * [baal.active](./dataset_management.md)

diff --git a/docs/index.md b/docs/index.md
@@ -11,7 +11,29 @@ in order to maximize the efficiency of labelling during active learning. Our lib
 To know more on what is Bayesian active learning, see our [User guide](user_guide/index.md).
 
 We are a member of Pytorch's ecosystem, and we welcome contributions from the community.
-If you have any question, we are reachable on [Slack](https://join.slack.com/t/baal-world/shared_invite/zt-z0izhn4y-Jt6Zu5dZaV2rsAS9sdISfg).
+
+::cards:: cols=2
+- title: User Guide
+  content: |
+    Learn how to use Baal
+  image: /_static/images/open-book_171322.png
+  url: /user_guide
+- title: Get Help
+  content: |
+    Submit an issue on Github
+  image: /_static/images/github-mark.svg
+  url: https://github.com/baal-org/baal/issues/new/choose
+- title: Community
+  content: |
+    Join our Slack!
+  image: https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg
+  url: https://join.slack.com/t/baal-world/shared_invite/zt-z0izhn4y-Jt6Zu5dZaV2rsAS9sdISfg
+- title: FAQ
+  content: Most common questions
+  image: /_static/images/help.png
+  url: support/faq
+::/cards::
+
 
 ## Installation
 
@@ -25,11 +47,3 @@ Baal is available as a package on PyPI:
 
     `baal[vision]` installs dependencies for our Lightning-Flash integration.
 
-
-## Support
-
-For support, we have several ways to help you:
-
-* Our [:material-help: FAQ](support/faq.md)
-* Submit an issue on Github [here](https://github.com/baal-org/baal/issues/new/choose)
-* Join our [:material-slack: Slack](https://join.slack.com/t/baal-world/shared_invite/zt-z0izhn4y-Jt6Zu5dZaV2rsAS9sdISfg)!
diff --git a/docs/research/dirichlet_calibration.md → docs/learn/dirichlet_calibration.md b/docs/research/dirichlet_calibration.md → docs/learn/dirichlet_calibration.md
diff --git a/docs/research/double_descent.md → docs/learn/double_descent.md b/docs/research/double_descent.md → docs/learn/double_descent.md
diff --git a/docs/research/images/ALL_active.png → docs/learn/images/ALL_active.png b/docs/research/images/ALL_active.png → docs/learn/images/ALL_active.png
diff --git a/docs/research/images/BALDvsCBALD_active.png → docs/learn/images/BALDvsCBALD_active.png b/docs/research/images/BALDvsCBALD_active.png → docs/learn/images/BALDvsCBALD_active.png
diff --git a/docs/research/images/CBALDvsBALD.png → docs/learn/images/CBALDvsBALD.png b/docs/research/images/CBALDvsBALD.png → docs/learn/images/CBALDvsBALD.png
diff --git a/docs/research/images/CBALDvsBALDECE.png → docs/learn/images/CBALDvsBALDECE.png b/docs/research/images/CBALDvsBALDECE.png → docs/learn/images/CBALDvsBALDECE.png
diff --git a/docs/research/images/EntvsCEnt_active.png → docs/learn/images/EntvsCEnt_active.png b/docs/research/images/EntvsCEnt_active.png → docs/learn/images/EntvsCEnt_active.png
diff --git a/docs/research/images/dirichlet_calib.png → docs/learn/images/dirichlet_calib.png b/docs/research/images/dirichlet_calib.png → docs/learn/images/dirichlet_calib.png
diff --git a/docs/research/images/doubledescend_01.png → docs/learn/images/doubledescend_01.png b/docs/research/images/doubledescend_01.png → docs/learn/images/doubledescend_01.png
diff --git a/docs/research/images/doubledescend_02.png → docs/learn/images/doubledescend_02.png b/docs/research/images/doubledescend_02.png → docs/learn/images/doubledescend_02.png
diff --git a/docs/research/images/doubledescend_03.png → docs/learn/images/doubledescend_03.png b/docs/research/images/doubledescend_03.png → docs/learn/images/doubledescend_03.png
diff --git a/docs/research/images/doubledescend_04.png → docs/learn/images/doubledescend_04.png b/docs/research/images/doubledescend_04.png → docs/learn/images/doubledescend_04.png
diff --git a/docs/learn/index.md b/docs/learn/index.md
@@ -0,0 +1,76 @@
+# Bayesian deep active learning research
+
+Research in this field is quite dynamic with multiple labs around the world working on this problem.
+
+In a nutshell, we want to:
+
+> Optimize labelling by maximizing the information obtained after each label.
+    
+Another critical goal of our research is to better understand the sampling bias active learning creates. 
+Recent research has shown that active learning creates more balanced, fairer datasets.
+
+### Notations and glossary
+
+* Training dataset $D_L$
+* Pool, the unlabelled portion of the dataset $D_U$
+* Heuristic, the function that computes the uncertainty (ex. BALD) $U$
+* Active learning step, the sequence of training, selecting and labelling one or many examples.
+* BALD, a heuristic that works well with deep learning models that are overconfident.
+* Query size, the number of items to label between retraining.
+* Iterations, the number of Monte Carlo samples to draw.
+
+## Active learning
+
+Active learning is a field of machine learning that reduces labelling cost by only labelling the most informative examples.
+Datasets, especially in industry, contain many similar examples that would bring no information to the model.
+
+To select the next example to label, we first train a machine learning model on the training dataset.
+Then we compute the model's uncertainty on all unlabelled examples. The most uncertain is selected to be labelled.
+
+
+## Bayesian active learning
+
+Bayesian active learning builds upon active learning by framing the problem from a Bayesian point-of-view.
+In this case, we want to reduce the epistemic uncertainty (i.e., the model's uncertainty) on a dataset.
+
+In addition, we will do this by sampling from the posterior distribution allowing us to better estimate the uncertainty.
+As an example, it is common to use MC-Dropout (Gal and Ghahramani, 2016) and BALD (Houlsby et al., 2011) to do this.
+The former allows us to draw from the posterior distribution and the latter estimates the mutual information.
+In recent years, new approaches were suggested to improve BALD such as BatchBALD (Kirsch et al, 2019) or ICAL (Jain et al. 2020), but they work on similar principles. 
+
+
+## Open challenges
+
+Active learning is a challenging field: many techniques work only on classification or are sensitive to the data distribution.
+Often, uniform selection sets a strong baseline, especially on academic datasets.
+
+### Consequences of using AL
+
+The effect of using active learning is an understudied problem.
+
+While we know that AL creates more balanced datasets and better calibrated models,
+we do not know the effect of sampling bias in all settings.
+
+At ICLR 2020, Farquhar et al. showed that sampling bias produces biased estimators,
+and they proposed a new unbiased estimator that gets good results on simple models.
+We hope that work in this area continues so that we can better understand the impact of active learning.
+
+
+**Resources**
+
+* [Literature review](./literature/index.md)
+* [Active learning dataset and training loop classes](../notebooks/fundamentals/active-learning)
+* [Methods for approximating bayesian posteriors](../notebooks/fundamentals/posteriors)
+* [Full active learning example](../notebooks/active_learning_process)
+
+
+**References**
+
+* Kirsch, Andreas, Joost Van Amersfoort, and Yarin Gal. "Batchbald: Efficient and diverse batch acquisition for deep bayesian active learning." NeurIPS (2019).
+* Jain, Siddhartha, Ge Liu, and David Gifford. "Information Condensing Active Learning." arXiv preprint arXiv:2002.07916 (2020).
+* Houlsby, Neil, et al. "Bayesian active learning for classification and preference learning." arXiv preprint arXiv:1112.5745 (2011).
+* Gal, Yarin, and Zoubin Ghahramani. "Dropout as a bayesian approximation: Representing model uncertainty in deep learning." international conference on machine learning. PMLR, 2016.
+
+---
+
+We strongly suggest going through our [literature review](./literature/index.md).
diff --git a/...earch/literature/Additional papers/dmi.md → ...learn/literature/Additional papers/dmi.md b/...earch/literature/Additional papers/dmi.md → ...learn/literature/Additional papers/dmi.md
diff --git a/...earch/literature/Additional papers/duq.md → ...learn/literature/Additional papers/duq.md b/...earch/literature/Additional papers/duq.md → ...learn/literature/Additional papers/duq.md
diff --git a/...h/literature/Additional papers/gyolov3.md → ...n/literature/Additional papers/gyolov3.md b/...h/literature/Additional papers/gyolov3.md → ...n/literature/Additional papers/gyolov3.md
diff --git a/...rature/Additional papers/lightcoresets.md → ...rature/Additional papers/lightcoresets.md b/...rature/Additional papers/lightcoresets.md → ...rature/Additional papers/lightcoresets.md
diff --git a/...ure/Additional papers/sparse_selection.md → ...ure/Additional papers/sparse_selection.md b/...ure/Additional papers/sparse_selection.md → ...ure/Additional papers/sparse_selection.md
diff --git a/...arch/literature/Additional papers/vaal.md → ...earn/literature/Additional papers/vaal.md b/...arch/literature/Additional papers/vaal.md → ...earn/literature/Additional papers/vaal.md
diff --git a/...research/literature/images/Baalscheme.svg → docs/learn/literature/images/Baalscheme.svg b/...research/literature/images/Baalscheme.svg → docs/learn/literature/images/Baalscheme.svg
diff --git a/...search/literature/images/GYOLOV3/fig1.png → .../learn/literature/images/GYOLOV3/fig1.png b/...search/literature/images/GYOLOV3/fig1.png → .../learn/literature/images/GYOLOV3/fig1.png
diff --git a/...search/literature/images/GYOLOV3/fig2.png → .../learn/literature/images/GYOLOV3/fig2.png b/...search/literature/images/GYOLOV3/fig2.png → .../learn/literature/images/GYOLOV3/fig2.png
diff --git a/...search/literature/images/GYOLOV3/fig3.png → .../learn/literature/images/GYOLOV3/fig3.png b/...search/literature/images/GYOLOV3/fig3.png → .../learn/literature/images/GYOLOV3/fig3.png
diff --git a/docs/research/literature/images/dmi/fig3.png → docs/learn/literature/images/dmi/fig3.png b/docs/research/literature/images/dmi/fig3.png → docs/learn/literature/images/dmi/fig3.png
diff --git a/...es/experiment_results/iterations_mcdc.png → ...es/experiment_results/iterations_mcdc.png b/...es/experiment_results/iterations_mcdc.png → ...es/experiment_results/iterations_mcdc.png
diff --git a/...literature/images/lightcoreset/q_func.png → ...literature/images/lightcoreset/q_func.png b/...literature/images/lightcoreset/q_func.png → ...literature/images/lightcoreset/q_func.png
diff --git a/...earch/literature/images/logo_original.png → ...learn/literature/images/logo_original.png b/...earch/literature/images/logo_original.png → ...learn/literature/images/logo_original.png
diff --git a/...search/literature/images/repo_logo_25.jpg → .../learn/literature/images/repo_logo_25.jpg b/...search/literature/images/repo_logo_25.jpg → .../learn/literature/images/repo_logo_25.jpg
diff --git a/...erature/images/repo_logo_25_no_corner.svg → ...erature/images/repo_logo_25_no_corner.svg b/...erature/images/repo_logo_25_no_corner.svg → ...erature/images/repo_logo_25_no_corner.svg
diff --git a/...iterature/images/sparse_selection/eq4.png → ...iterature/images/sparse_selection/eq4.png b/...iterature/images/sparse_selection/eq4.png → ...iterature/images/sparse_selection/eq4.png
diff --git a/...terature/images/sparse_selection/fig4.png → ...terature/images/sparse_selection/fig4.png b/...terature/images/sparse_selection/fig4.png → ...terature/images/sparse_selection/fig4.png
diff --git a/.../research/literature/images/vaal/fig1.png → docs/learn/literature/images/vaal/fig1.png b/.../research/literature/images/vaal/fig1.png → docs/learn/literature/images/vaal/fig1.png
diff --git a/.../research/literature/images/vaal/fig2.png → docs/learn/literature/images/vaal/fig2.png b/.../research/literature/images/vaal/fig2.png → docs/learn/literature/images/vaal/fig2.png
diff --git a/docs/research/literature/index.md → docs/learn/literature/index.md b/docs/research/literature/index.md → docs/learn/literature/index.md