diff --git a/docs/getting_started/overview.md b/docs/getting_started/overview.md index e6bcadf..4c8a199 100644 --- a/docs/getting_started/overview.md +++ b/docs/getting_started/overview.md @@ -266,7 +266,7 @@ Ouput: For choice prediction, PyCMTensor generates a vector of probabilities for each observation in the *validation* dataset. It is also possible to output discrete prediction (e.g. classification) using the Argmax function. To output the predicted probabilites after estimation, use the function: ```python -prob = mymodel.predict(ds, return_probabilities=True) +prob = mymodel.predict(ds) print(pd.DataFrame(prob)) ``` @@ -370,7 +370,7 @@ print(mymodel.results.model_statistics()) print(mymodel.results.benchmark()) # predictions -prob = mymodel.predict(ds, return_probabilities=True) +prob = mymodel.predict(ds) print(pd.DataFrame(prob)) # elasticities diff --git a/pycmtensor/__init__.py b/pycmtensor/__init__.py index 5657f7f..dc22513 100644 --- a/pycmtensor/__init__.py +++ b/pycmtensor/__init__.py @@ -1,4 +1,25 @@ -"""Top-level package for PyCMTensor.""" +"""Top-level package for PyCMTensor. + +This code snippet defines the default configuration settings for the PyCMTensor package. It sets various parameters such as batch size, seed value, maximum number of epochs, learning rate, optimizer, and learning rate scheduler. + +Example Usage: +import pycmtensor.defaultconfig as defaultconfig + +config = defaultconfig.config + +print(config.batch_size) # Output: 32 +print(config.seed) # Output: 100 +print(config.max_epochs) # Output: 500 +print(config.base_learning_rate) # Output: 0.01 +print(config.optimizer) # Output: +print(config.lr_scheduler) # Output: + +Inputs: +No specific inputs are required for this code snippet. + +Outputs: +The code snippet does not produce any outputs directly. It sets the default configuration settings for the PyCMTensor package, which can be accessed and used by other parts of the package. +""" __author__ = """Melvin Wong""" __version__ = "1.8.0" diff --git a/pycmtensor/dataset.py b/pycmtensor/dataset.py index b3d70de..42e2d57 100644 --- a/pycmtensor/dataset.py +++ b/pycmtensor/dataset.py @@ -1,6 +1,6 @@ -# dataset.py -# converts pandas dataframe into an xarray dataset - +""" +The code snippet is a part of a class called `Dataset` that converts a pandas DataFrame into an xarray dataset. It initializes the dataset object with the DataFrame and the name of the choice variable. It also provides methods to access and manipulate the dataset. +""" import aesara.tensor as aet from aesara.tensor.var import TensorVariable @@ -9,62 +9,42 @@ config = defaultconfig.config -from .logger import debug, info +from pycmtensor.logger import debug, info __all__ = ["Dataset"] class Dataset: def __init__(self, df, choice, **kwargs): - """Base PyCMTensor Dataset class object - - This class stores the data in an array format, and a symbolic tensor reference - variable object. To call the tensor variable, we invoke the label of the - variable as an item in the Dataset class, like so: - ```python - ds = Dataset(df=df, choice="choice") - return ds["label_of_variable"] -> TensorVariable - ``` - - To call the data array, we use the `train_dataset()` or `valid_dataset()` - method. See method reference for info about the arguments. For example: - ```python - # to get the data array for variable "time" - arr = ds.train_dataset(ds["time"]) - ``` + """Initialize the Dataset object with a pandas DataFrame and the name of the choice variable. 
Args: - df (pandas.DataFrame): the pandas dataframe object to load - choice (str): the name of the choice variable + df (pandas.DataFrame): The pandas DataFrame object containing the dataset. + choice (str): The name of the choice variable. + **kwargs (optional): Additional keyword arguments to configure the dataset. Attributes: - n (int): total number of rows in the dataset - x (list[TensorVariable]): the full list of (input) `TensorVariable` objects - to build the tensor expression from - y (TensorVariable): the output (choice) `TensorVariable` object - scale (dict): a dictionary of `float` values to store the scaling factor - used for each variable - choice (str): the name of the choice variable - ds (dict): a dictionary of `numpy.ndarray` to store the values of each - variable - split_frac (float): the factor used to split the dataset into training and - validation datasets - train_index (list): the list of values of the indices of the training - dataset - valid_index (list): the list of values of the indices of the validation - dataset - n_train (int): the size of the training dataset - n_valid (int): the size of the validation dataset + n (int): The number of rows in the dataset. + x (list[TensorVariable]): The list of input TensorVariable objects. + y (TensorVariable): The output TensorVariable object. + scale (dict): A dictionary of scaling factors for each variable. + choice (str): The name of the choice variable. + ds (dict): A dictionary of variable values. + split_frac (float): The split fraction used to split the dataset. + idx_train (list): The list of indices of the training dataset. + idx_valid (list): The list of indices of the validation dataset. + n_train (int): The size of the training dataset. + n_valid (int): The size of the validation dataset. Example: - Example initalization of a pandas dataset: + Example initialization of a Dataset object: ```python ds = Dataset(df=pd.read_csv("datafile.csv", sep=","), choice="mode") ds.split(frac=0.8) ``` - Attributes can be access by invoking: + Accessing attributes: ```python print(ds.choice) ``` @@ -74,6 +54,8 @@ def __init__(self, df, choice, **kwargs): 'car' ``` + Raises: + IndexError: If the choice variable is not found in the DataFrame columns. """ for key, value in kwargs.items(): config.add(key, value) @@ -81,7 +63,7 @@ def __init__(self, df, choice, **kwargs): if choice not in df.columns: raise IndexError(f"{choice} not found in dataframe.") - df[choice] = df[choice].astype("int") # ensure choice variable is an integer + df[choice] = df[choice].astype("int") df.reset_index(drop=True, inplace=True) while df[choice].min() > 0: df[choice] -= df[choice].min() @@ -111,12 +93,22 @@ def __call__(self): return self.ds def __getitem__(self, key): + """Returns the input or output variable(s) of the dataset object by their names. + + Args: + key (str or list or tuple): The name(s) of the variable(s) to be accessed. + + Returns: + TensorVariable or list of TensorVariable: The input or output variable(s) corresponding to the given name(s). + + Raises: + KeyError: If the given name(s) do not match any input or output variable. 
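+
+        Example:
+            A minimal sketch, assuming the loaded DataFrame has columns named "time" and "cost":
+            ```python
+            time_var = ds["time"]           # a single name returns one TensorVariable
+            x_vars = ds[["time", "cost"]]   # a list of names returns the matching TensorVariables
+            ```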
+ """ if isinstance(key, (list, tuple)): return self._make_tensor(key) else: if key in [var.name for var in self.x]: - i = [x.name for x in self.x].index(key) - return self.x[i] + return self.x[[x.name for x in self.x].index(key)] if key == self.y.name: return self.y else: @@ -132,28 +124,27 @@ def _make_tensor(self, keys): @property def n_train(self) -> int: - return len(self.train_index) + return len(self.idx_train) @property def n_valid(self) -> int: - return len(self.valid_index) + return len(self.idx_valid) @property - def train_index(self) -> list: + def idx_train(self) -> list: if self.split_frac == 1: return self.index - n = round(self.n * self.split_frac) return self.index[:n] @property - def valid_index(self) -> list: + def idx_valid(self) -> list: if self.split_frac == 1: return self.index n = round(self.n * self.split_frac) return self.index[n:] - def drop(self, variables): + def drop(self, variables) -> None: """Method for dropping `variables` from the dataset Args: @@ -163,10 +154,13 @@ def drop(self, variables): KeyError: raises an error if any item in `variables` is not found in the dataset or item is the choice variable !!! Warning - Choice variable cannot be explicity dropped. + Choice variable cannot be explicitly dropped. """ for variable in variables: - if (variable in self.ds) and (variable != self.choice): + if variable == self.choice: + raise KeyError(f"Cannot drop choice variable '{variable}'") + + if variable in self.ds: i = [x.name for x in self.x].index(variable) del self.x[i] del self.scale[variable] @@ -174,9 +168,9 @@ def drop(self, variables): debug(f"Dropped input variable '{variable}' from dataset") else: - raise KeyError + raise KeyError(f"Variable '{variable}' not found in dataset") - def scale_variable(self, variable, factor): + def scale_variable(self, variable, factor) -> None: """Multiply values of the `variable` by $1/\\textrm{factor}$. Args: @@ -187,15 +181,16 @@ def scale_variable(self, variable, factor): self.scale[variable] = self.scale[variable] * factor def split(self, frac): - """Method to split dataset into training and validation subsets + """Method to split the dataset into training and validation subsets based on a given fraction. Args: - frac (float): the fraction to split the dataset into the training set. The training set will be indexed from `0` to `frac` $\\times$ `Dataset.n`. The validation dataset will be from the last index of the training set to the last row of the dataset. + frac (float): The fraction to split the dataset into the training set. - Note: - The actual splitting of the dataset is done during the training procedure, - or when invoking the `train_dataset()` or `valid_dataset()` methods + Returns: + None + Notes: + - The actual splitting of the dataset is done during the training procedure or when invoking the `train_dataset()` or `valid_dataset()` methods. 
""" self.split_frac = frac @@ -269,7 +264,7 @@ def train_dataset(self, variables, index=None, batch_size=None, shift=None): ``` """ - n_index = self.train_index + n_index = self.idx_train return self._dataset_slice(variables, index, batch_size, shift, n_index) @@ -291,6 +286,6 @@ def valid_dataset(self, variables, index=None, batch_size=None, shift=None): (list): a list of array object(s) corresponding to the input variables """ - n_index = self.valid_index + n_index = self.idx_valid return self._dataset_slice(variables, index, batch_size, shift, n_index) diff --git a/pycmtensor/expressions.py b/pycmtensor/expressions.py index 0e3617c..cd3c1dd 100644 --- a/pycmtensor/expressions.py +++ b/pycmtensor/expressions.py @@ -1,5 +1,6 @@ -# expressions.py -"""PyCMTensor expressions module""" +""" +The code snippet is a part of the PyCMTensor expressions module. It defines a base class for parsing and manipulating Aesara tensor expressions. The class provides methods for parsing a tensor expression to remove parentheses and tensor operators, and returns a clean list of keywords found in the expression. It also defines a base class for expression objects, which includes overloaded operators for tensor operations such as addition, subtraction, multiplication, division, and comparison. +""" from typing import Union import aesara @@ -17,63 +18,83 @@ class ExpressionParser(object): + HARD_CODED_STRINGS = [ + "(", + ")", + ",", + "[", + "]", + "{", + "}", + "=", + "*", + "-", + "+", + "/", + ":", + "AdvancedSubtensor", + "Reshape", + "ARange", + "Assert", + "Shape", + "BroadcastTo", + "Composite", + "Could", + "ScalarFromTensor", + "Abs", + "Softmax", + "Switch", + "dtype", + ] + def __init__(self, expression=None): - """Base class for the ExpressionParser object + """Base class for parsing and manipulating Aesara tensor expressions. Args: - expression (TensorVariable): the TensorVariable object to evaluate + expression (TensorVariable, optional): The tensor expression to parse. Defaults to None. """ if expression is not None: self.expression = str(pprint(expression)) @staticmethod def parse(expression): - """Parses Aesara Tensor string expression from `aesara.pprint()`. This function - removes parentheses and Tensor operators and returns a 'clean' list of - expressions + """Parses a tensor expression to remove parentheses and tensor operators. Args: - expression (TensorVariable): the symbolic Tensor object to parse + expression (TensorVariable): The symbolic tensor object to parse. Returns: - (list): found keywords in expressions + list: The clean list of keywords found in the expression. 
""" + stdout = ExpressionParser._get_stdout(expression) + stdout = ExpressionParser._remove_parentheses(stdout) + stdout = ExpressionParser._remove_tensor_operators(stdout) + symbols = ExpressionParser._remove_duplicates(stdout) + return symbols + + @staticmethod + def _get_stdout(expression): if isinstance(expression, str): - stdout = expression + return expression else: - stdout = str(pprint(expression)) - for s in [ - "(", - ")", - ",", - "[", - "]", - "{", - "}", - "=", - "*", - "-", - "+", - "/", - ":", - "AdvancedSubtensor", - "Reshape", - "ARange", - "Assert", - "Shape", - "BroadcastTo", - "Composite", - "Could", - "ScalarFromTensor", - "Abs", - "Softmax", - "Switch", - "dtype", - ]: - stdout = str.replace(stdout, s, " ") - symbols = [s for s in str.split(stdout, " ") if len(s) > 0] - symbols = list(set(symbols)) - return symbols + return str(pprint(expression)) + + @staticmethod + def _remove_parentheses(stdout): + for s in ["(", ")", "[", "]", "{", "}"]: + stdout = stdout.replace(s, " ") + return stdout + + @staticmethod + def _remove_tensor_operators(stdout): + for s in ExpressionParser.HARD_CODED_STRINGS: + stdout = stdout.replace(s, " ") + return stdout + + @staticmethod + def _remove_duplicates(stdout): + symbols = [s for s in stdout.split(" ") if len(s) > 0] + return list(set(symbols)) class TensorExpressions: @@ -86,10 +107,9 @@ def __add__(self, other): if isinstance(other, (TensorVariable, TensorSharedVariable)): return self() + other elif isinstance(other, Param): - print((self() + other()).eval()) return self() + other() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -99,7 +119,7 @@ def __radd__(self, other): elif isinstance(other, Param): return other() + self() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -109,7 +129,7 @@ def __sub__(self, other): elif isinstance(other, Param): return self() - other() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -119,7 +139,7 @@ def __rsub__(self, other): elif isinstance(other, Param): return other() - self() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -132,7 +152,7 @@ def __mul__(self, other): elif isinstance(other, (Param, RandomDraws)): return self() * other() else: - raise NotImplementedError( + raise TypeError( f"__mul__ {other} must be a TensorVariable or TensorShared Variable object" ) @@ -145,7 +165,7 @@ def __rmul__(self, other): elif isinstance(other, (Param, RandomDraws)): return self() * other() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -155,7 +175,7 @@ def __div__(self, other): elif isinstance(other, Param): return self() / other() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -165,7 +185,7 @@ def __rdiv__(self, other): elif isinstance(other, Param): return other() / self() else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -178,7 +198,7 @@ def __pow__(self, other): elif isinstance(other, Param): return aet.pow(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -188,7 
+208,7 @@ def __rpow__(self, other): elif isinstance(other, Param): return aet.pow(other(), self()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -198,7 +218,7 @@ def __lt__(self, other): elif isinstance(other, Param): return aet.lt(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -208,7 +228,7 @@ def __le__(self, other): elif isinstance(other, Param): return aet.le(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -218,7 +238,7 @@ def __gt__(self, other): elif isinstance(other, Param): return aet.gt(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -228,7 +248,7 @@ def __ge__(self, other): elif isinstance(other, Param): return aet.ge(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -238,7 +258,7 @@ def __eq__(self, other): elif isinstance(other, Param): return aet.eq(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -248,7 +268,7 @@ def __ne__(self, other): elif isinstance(other, Param): return aet.neq(self(), other()) else: - raise NotImplementedError( + raise TypeError( f"{other} must be a TensorVariable or TensorShared Variable object" ) @@ -268,6 +288,9 @@ def __init__(self, name, value=0.0, lb=None, ub=None, status=0): init_value (float): the inital value set at object creation shape (list): the shape of the Param + Raises: + ValueError: If `lb` is greater than `ub` + !!! note `init_value` is an immutable property """ @@ -278,8 +301,8 @@ def __init__(self, name, value=0.0, lb=None, ub=None, status=0): self.shared_var = aesara.shared(value, name=name, borrow=True) self._init_value = value - if all([lb, ub]) and not (lb <= ub): - raise ValueError(f"ub must be greater than lb. 
ub={ub}, lb={lb}") + if lb is not None and ub is not None and lb > ub: + raise ValueError("lb must be less than or equal to ub") self.ub = ub self.lb = lb @@ -290,14 +313,17 @@ def name(self): @property def init_value(self): + """Returns the initial value of the parameter""" return self._init_value @property def status(self): + """Returns the status of the parameter""" return self._status @property def shape(self): + """Returns the shape of the initial value of the parameter""" return self._init_value.shape @property @@ -309,10 +335,11 @@ def init_type(self): raise NotImplementedError def __call__(self): - """Returns the shared value""" + """Returns the shared value of the parameter""" return self.shared_var def __repr__(self): + """Returns a string representation of the parameter""" return f"Param({self.name}, {self.shape})" def get_value(self): @@ -320,7 +347,7 @@ def get_value(self): return self.shared_var.get_value() def set_value(self, value: Union[np.ndarray, float]): - """Set the value of the shared variable""" + """Sets the value of the shared variable""" self.shared_var.set_value(value) def reset_value(self): @@ -333,11 +360,11 @@ def __init__(self, name, value=0.0, lb=None, ub=None, status=0): """Constructor for Beta parameter Args: - name (str): name of Beta parameter - value (float): the default value of the Beta parameter - lb (float): value lower bound - ub (float): value upper bound - status (int): if 1, do not estimate this Beta parameter + name (str): The name of the Beta parameter. + value (float): The default value of the Beta parameter. + lb (float): The lower bound of the Beta parameter. + ub (float): The upper bound of the Beta parameter. + status (int): The status of the Beta parameter. !!! example Specifying a Beta parameter: @@ -368,22 +395,18 @@ def __repr__(self): class RandomDraws(TensorExpressions): def __init__(self, name: str, draw_type: str, n_draws: int): - """Constructor for model random draws + """Constructor for the RandomDraws class. Args: - name (str): name of the RandomDraw object - draw_type (str): the distribution of the draw - n_draws (int): number of draws, determines the size of this shared tensor + name (str): The name of the RandomDraw object. + draw_type (str): The distribution of the draw. Can be "normal", "lognormal", "gumbel", "exponential", "gamma", or "poisson". + n_draws (int): The number of draws, which determines the size of the shared tensor. - !!! note - `draw_type` can be the following: - - - `"normal"` - - `"lognormal"` - - `"gumbel"` - - `"exponential"` - - `"gamma"` - - `"poisson"` + Raises: + NotImplementedError: If an unsupported draw_type is provided. + + Returns: + None. The method initializes the object and creates a shared variable. """ self._name = name self.n_draws = n_draws @@ -422,10 +445,9 @@ def __init__(self, name, size, value=None): """Class object for neural net bias vector Args: - name (str): name of the parameter - size (Union[tuple,list]): size of the array in 1 dimension - value (numpy.ndarray): initial values of the parameter, if `None` given, - defaults to `0` + name (str): The name of the parameter. + size (Union[tuple,list]): The size of the array in 1 dimension. + value (numpy.ndarray): The initial values of the parameter. If `None` is given, it defaults to `0`. 
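+
+        Example:
+            A minimal sketch; the name and size below are illustrative only:
+            ```python
+            b_h = Bias("b_hidden", (10,))   # no value given, so it defaults to zeros
+            ```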
""" Param.__init__(self, name, lb=None, ub=None) diff --git a/pycmtensor/functions.py b/pycmtensor/functions.py index 47447c3..8ce9081 100644 --- a/pycmtensor/functions.py +++ b/pycmtensor/functions.py @@ -24,18 +24,14 @@ def relu(x, alpha=0.0): """Compute the element-wise rectified linear activation function. - Source taken from Theano 0.7.1 + Source from Theano 0.7.1 Args: - x (TensorVariable): symbolic tensor - alpha (Union[float, TensorSharedVariable]): Slope for negative input, usually - between 0 and 1. The default value of 0 will lead to the standard - rectifier, 1 will lead to a linear activation function, and any value in - between will give a leaky rectifier. A shared variable (broadcastable against `x`) will result in a parameterized rectifier with learnable slope - (s). + x (TensorVariable): The input symbolic tensor. + alpha (float or TensorSharedVariable): The slope for negative input. A value between 0 and 1. Default is 0. Returns: - (TensorVariable): Elementwise rectifier applied to `x`. + (TensorVariable): The element-wise rectified linear activation function applied to `x`. """ if alpha == 0.0: return 0.5 * (x + aet.abs(x)) @@ -54,17 +50,15 @@ def exp_mov_average(batch_avg, moving_avg, alpha=0.1): """Calculates the exponential moving average (EMA) of a new minibatch Args: - batch_avg (TensorVariable): mean batch value - moving_avg (TensorVariable): accumulated mean - alpha (float): ratio of moving average to batch average + batch_avg (array-like): The mean batch value. + moving_avg (array-like): The accumulated mean. + alpha (float): The ratio of moving average to batch average. Returns: - (TensorVariable): the new moving average + (TensorVariable): The new moving average !!! note - The moving average will decay by the difference between the existing value - and the new value multiplied by the moving average factor. A higher `alpha` - value results in faster changing moving average. + The moving average will decay by the difference between the existing value and the new value multiplied by the moving average factor. A higher `alpha` value results in faster changing moving average. Formula: diff --git a/pycmtensor/logger.py b/pycmtensor/logger.py index ef77316..8f979a5 100644 --- a/pycmtensor/logger.py +++ b/pycmtensor/logger.py @@ -1,7 +1,27 @@ -# logger.py """PyCMTensor logger module This module sets the logging state of the program. Verbosity is defined by `set_level()` + +## Example Usage +```python +# Set the logger level to DEBUG +set_level(DEBUG) + +# Get the effective level of the logger +level = get_effective_level() + +# Log a message at the INFO level +info("This is an informational message") + +# Log a message at the WARNING level +warning("This is a warning message") +``` + +## Inputs +- `level` (int): The level of the logger to be set. It can be one of the predefined levels: `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL`. 
+ +## Outputs +- None """ import logging diff --git a/pycmtensor/models/MNL.py b/pycmtensor/models/MNL.py index 5580e9d..c04da24 100644 --- a/pycmtensor/models/MNL.py +++ b/pycmtensor/models/MNL.py @@ -40,7 +40,7 @@ def __init__(self, ds, variables, utility, av=None, **kwargs): pred (TensorVariable): prediction tensor variable function """ - super().__init__(**kwargs) + BaseModel.__init__(self, **kwargs) self.name = "MNL" self.params = [] # keep track of all the Params self.betas = [] # keep track of the Betas @@ -82,28 +82,75 @@ def __init__(self, ds, variables, utility, av=None, **kwargs): @property def n_params(self): - """Return the total number of estimated parameters""" + """ + Returns the number of parameters in the Multinomial Logit model. + + Returns: + int: The number of parameters in the Multinomial Logit model. + """ return super().n_params @property def n_betas(self): - """Return the number of estimated betas""" + """Return the number of estimated betas in the Multinomial Logit model. + + Returns: + int: The number of estimated betas. + """ return super().n_betas def get_betas(self): - """returns the values of the betas + """Returns the values of the betas in the model as a dictionary. Returns: - (dict): beta values + dict: A dictionary containing the beta values, where the keys represent the beta names and the values represent their corresponding values. """ return super().get_betas() - def reset_values(self): - """resets the values of all parameters""" + def reset_values(self) -> None: + """Resets the values of all parameters by calling the reset_values method of the parent class. + + This method resets the values of all parameters to their initial values. + + Example Usage: + ```python + # Create an instance of the MNL class + model = MNL(ds, variables, utility, av=None) + + # Call the reset_values method to reset the parameter values + model.reset_values() + ``` + + Inputs: + - None + + Flow: + 1. The `reset_values` method is called. + 2. The method calls the `reset_values` method of the parent class `BaseModel` to reset the values of all parameters. + + Outputs: + - None + """ return super().reset_values() def build_cost_fn(self): - """constructs aesara functions for cost and prediction errors""" + """Constructs Aesara functions for calculating the cost and prediction errors of the Multinomial Logit model. + + Inputs: + - None + + Outputs: + - None + + Example Usage: + ```python + # Create an instance of the MNL class + model = MNL(ds, variables, utility, av=None) + + # Call the build_cost_fn method + model.build_cost_fn() + ``` + """ self.log_likelihood_fn = aesara.function( name="log_likelihood", inputs=self.x + [self.y, self.index], @@ -119,7 +166,11 @@ def build_cost_fn(self): ) def build_gh_fn(self): - """constructs aesara functions for hessians and gradient vectors + """Constructs Aesara functions for computing the Hessian matrix and the gradient vector. + + Returns: + hessian_fn (Aesara function): A function that computes the Hessian matrix. + gradient_vector_fn (Aesara function): A function that computes the gradient vector. !!! note @@ -140,31 +191,31 @@ def build_gh_fn(self): ) def build_cost_updates_fn(self, updates): - """build/rebuilt cost function with updates to the model. Creates a class function `MNL.cost_updates_fn(*inputs, output, lr)` that receives a list of input variable arrays, the output array, and a learning rate. + """Build or rebuild the cost function with updates to the model. 
+ + This method creates a class function `MNL.cost_updates_fn(*inputs, output, lr)` that takes a list of input variable arrays, an output array, and a learning rate as arguments. Args: - updates (List[Tuple[TensorSharedVariable, TensorVariable]]): The list of tuples containing the target shared variable and the new value of the variable. + updates (List[Tuple[TensorSharedVariable, TensorVariable]]): A list of tuples containing the target shared variable and the new value of the variable. """ BaseModel.build_cost_updates_fn(self, updates) - def predict(self, ds, return_probabilities=False): - """predicts the output of the most likely alternative given the validation dataset in `ds`. The formula is: - - $$ - argmax(p_n(y|x)) - $$ + def predict(self, ds): + """Predicts the output of the most likely alternative given the validation dataset. Args: - ds (Dataset): pycmtensor dataset - return_probabilities (bool): if true, returns the probability vector instead + ds (Dataset): A pycmtensor dataset object containing the validation data. Returns: - (numpy.ndarray): the predicted choices or the vector of probabilities + numpy.ndarray: The predicted choices or the vector of probabilities. !!! example - To return predicted choices: ```python + # Create an instance of the MNL class + model = MNL(ds, variables, utility, av=None) + + # Predict the choices using the predict method predictions = self.predict(ds) print(predictions) ``` @@ -174,9 +225,9 @@ def predict(self, ds, return_probabilities=False): {'pred_choice': array([...])} ``` - To return probabilities: ```python - prob = self.predict(ds, return_probabilities=True) + # Predict the probabilities using the predict method + prob = self.predict(ds) print(prob) ``` @@ -187,18 +238,20 @@ def predict(self, ds, return_probabilities=False): ... } ``` + + The expected output for `predictions` is a dictionary with the key `'pred_choice'` and an array of predicted choices as the value. The expected output for `probabilities` is a dictionary with the keys representing the alternative indices and the values being arrays of probabilities. """ - return BaseModel.predict(self, ds, return_probabilities) + return BaseModel.predict(self, ds) def elasticities(self, ds, wrt_choice): - """disaggregated point/cross elasticities of choice y wrt x + """Calculate the disaggregated point/cross elasticities of the choice variable `y` with respect to the independent variables `x` in a Multinomial Logit model. Args: - ds (pycmtensor.Dataset): dataset containing the training data - wrt_choice (int): alternative to evaluate the variables on + ds (pycmtensor.Dataset): Dataset containing the training data. + wrt_choice (int): Alternative to evaluate the variables on. Returns: - (dict): the disaggregate point elasticities of x + dict: Disaggregated point elasticities of the independent variables `x`. !!! example diff --git a/pycmtensor/models/TasteNet.py b/pycmtensor/models/TasteNet.py index 26893bb..3be5c9f 100644 --- a/pycmtensor/models/TasteNet.py +++ b/pycmtensor/models/TasteNet.py @@ -164,7 +164,7 @@ def build_cost_updates_fn(self, updates): """ BaseModel.build_cost_updates_fn(self, updates) - def predict(self, ds, return_probabilities=False): + def predict(self, ds): """predicts the output of the most likely alternative given the validation dataset in `ds`. 
The formula is: $$ @@ -173,7 +173,6 @@ def predict(self, ds, return_probabilities=False): Args: ds (Dataset): pycmtensor dataset - return_probabilities (bool): if true, returns the probability vector instead Returns: (numpy.ndarray): the predicted choices or the vector of probabilities @@ -191,7 +190,7 @@ def predict(self, ds, return_probabilities=False): array([...]) ``` """ - return BaseModel.predict(self, ds, return_probabilities) + return BaseModel.predict(self, ds) def elasticities(self, ds, wrt_choice): """disaggregate point/cross elasticities of choice y wrt x diff --git a/pycmtensor/models/basic.py b/pycmtensor/models/basic.py index e4d1472..5fce38e 100644 --- a/pycmtensor/models/basic.py +++ b/pycmtensor/models/basic.py @@ -127,15 +127,35 @@ def include_regularization_terms(self, *regularizers): self.cost += reg def build_cost_updates_fn(self, updates): + """Builds a function that calculates the cost updates for a model. + + Args: + updates (dict): A dictionary of updates for the model. + + Returns: + None + """ + inputs = self.x + [self.y, self.learning_rate, self.index] + outputs = self.cost + self.cost_updates_fn = function( name="cost_updates", - inputs=self.x + [self.y, self.learning_rate, self.index], - outputs=self.cost, + inputs=inputs, + outputs=outputs, updates=updates, allow_input_downcast=True, ) - def predict(self, ds, return_probabilities=False): + def predict(self, dataset): + """Make predictions on a given dataset. + + Args: + dataset: The dataset on which predictions are to be made. + + Returns: + A dictionary containing the predicted choices, true choices, and choice probabilities + for each data point in the dataset. + """ if not "choice_probabilities_fn" in dir(self): self.choice_probabilities_fn = function( name="choice_probabilities", @@ -144,18 +164,29 @@ def predict(self, ds, return_probabilities=False): allow_input_downcast=True, ) - valid_data = ds.valid_dataset(self.x) - + valid_data = dataset.valid_dataset(self.x) + valid_ground_truth = dataset.valid_dataset(self.y) prob = self.choice_probabilities_fn(*valid_data) + result = { + **{i: prob[i] for i in range(prob.shape[0])}, + f"pred_{dataset.choice}": np.argmax(prob, axis=0), + f"true_{dataset.choice}": valid_ground_truth[0], + } + return result + + def elasticities(self, dataset, wrt_choice): + """ + Calculate the elasticities of the model based on the given dataset and choice. - if return_probabilities: - return {i: prob[i] for i in range(prob.shape[0])} - else: - return {"pred_" + ds.choice: np.argmax(prob, axis=0)} + Args: + dataset: The dataset used to calculate the elasticities. + wrt_choice: The choice with respect to which the elasticities are calculated. - def elasticities(self, ds, wrt_choice): + Returns: + The elasticities of the model based on the given dataset and choice. 
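+
+        Example:
+            A minimal sketch, assuming `model` is an estimated model instance and alternative
+            index `1` exists in the choice variable:
+            ```python
+            el = model.elasticities(dataset=ds, wrt_choice=1)
+            ```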
+ """ p_y_given_x = self.p_y_given_x[self.y, ..., self.index] - while p_y_given_x.ndim > 1: + for _ in range(p_y_given_x.ndim - 1): p_y_given_x = aet.sum(p_y_given_x, axis=1) dy_dx = aet.grad(aet.sum(p_y_given_x), self.x, disconnected_inputs="ignore") @@ -166,7 +197,7 @@ def elasticities(self, ds, wrt_choice): on_unused_input="ignore", allow_input_downcast=True, ) - train_data = ds.train_dataset(self.x) + train_data = dataset.train_dataset(self.x) index = np.arange((len(train_data[-1]))) choice = (np.ones(shape=index.shape) * wrt_choice).astype(int) return self.elasticity_fn(*train_data, choice, index) @@ -178,7 +209,7 @@ def __repr__(self): return pprint(self.cost) def __getattr__(self, name): - if (name == "hessian_fn") or (name == "gradient_vector_fn"): + if name in ["hessian_fn", "gradient_vector_fn"]: self.build_gh_fn() return getattr(self, name) else: @@ -253,10 +284,9 @@ def compute(model, ds, update=False, **params): ``` """ # saves original values and replace values by test values in params - p_value_old = {} + p_value_old = {p.name: p.get_value() for p in model.params if p.name in params} for p in model.params: if p.name in params: - p_value_old[p.name] = p.get_value() p.set_value(params[p.name]) # compute all the outputs of the training and validation datasets @@ -302,7 +332,7 @@ def compute(model, ds, update=False, **params): def train(model, ds, **kwargs): - """main training loop + """Main training loop Args: model (pycmtensor.models.BaseModel): model to train @@ -371,13 +401,14 @@ def train(model, ds, **kwargs): log_likelihood = model.log_likelihood_fn(*train_data, t_index) train_error = model.prediction_error_fn(*train_data) - if set(ds.train_index) != set(ds.valid_index): + if set(ds.idx_train) != set(ds.idx_valid): valid_error = model.prediction_error_fn(*valid_data) else: valid_error = train_error model.results.best_loglikelihood = log_likelihood model.results.best_valid_error = valid_error + model.results.best_train_error = train_error model.results.best_epoch = 0 model.results.gnorm = np.nan @@ -424,7 +455,7 @@ def train(model, ds, **kwargs): statistics_graph["valid_error"].append(valid_error) # training error - if set(ds.train_index) != set(ds.valid_index): + if set(ds.idx_train) != set(ds.idx_valid): train_error = model.prediction_error_fn(*train_data) else: train_error = valid_error @@ -475,6 +506,7 @@ def train(model, ds, **kwargs): model.results.best_iteration = iteration model.results.best_loglikelihood = log_likelihood model.results.best_valid_error = valid_error + model.results.best_train_error = train_error model.results.gnorm = gnorm # save Beta params diff --git a/pycmtensor/models/layers.py b/pycmtensor/models/layers.py index 76004ec..00da3ac 100644 --- a/pycmtensor/models/layers.py +++ b/pycmtensor/models/layers.py @@ -173,142 +173,3 @@ def __repr__(self): def __call__(self): return DenseLayer.__call__(self) - - -# class BatchNormLayer(Layer): -# def __init__(self, gamma, beta, batch_size, factor=0.05, epsilon=1e-6): -# """Class object for Batch Normalization layer - -# Args: -# gamma (TensorSharedVariable): gamma variable for variance -# beta (TensorSharedVariable): beta variable for mean -# batch_size (int): batch size indicator -# factor (float, optional): exponential moving average factor -# epsilon (float, optional): small value to prevent floating point error - -# Notes: -# The ema factor controls how fast/slow the running average is changed. -# Higher ``factor`` value discounts older values faster. 
-# """ - -# self._updates = [] -# self.batch_size = batch_size -# self.gamma = gamma -# self.beta = beta -# self.epsilon = epsilon -# self.factor = factor -# self.gamma = gamma -# self.beta = beta -# self.params = [self.gamma, self.beta] - -# # internal record of the running variance and mean -# self._mv_var = aesara.shared(np.ones(gamma.shape), name="mv_var") -# self._mv_mean = aesara.shared(np.zeros(beta.shape), name="mv_mean") - -# def apply(self, input): -# """Function to apply the input to the computational graph""" -# if isinstance(input, (list, tuple)): -# input = aet.stack(input) -# self.input = input - -# # variance and mean of each batch of input during training -# batch_var = aet.var(self.input, axis=1) -# batch_mean = aet.mean(self.input, axis=1) - -# # updates for the running mean and variance values -# ema_var = functions.exp_mov_average(batch_var, self._mv_mean, alpha=self.factor) -# ema_mean = functions.exp_mov_average( -# batch_mean, self._mv_var, alpha=self.factor -# ) -# self._updates.append((self._mv_var, ema_mean)) -# self._updates.append((self._mv_mean, ema_var)) - -# # condition when training -# batch_std = aet.shape_padaxis(aet.sqrt(batch_var + self.epsilon), 1) -# h = (self.input - aet.shape_padaxis(batch_mean, 1)) / batch_std -# batch_norm = self.gamma() * h.swapaxes(0, -1) + self.beta() -# self.batch_norm = batch_norm.swapaxes(0, -1) - -# # condition when testing -# mv_std = aet.shape_padaxis(aet.sqrt(self.mv_var + self.epsilon), 1) -# h_hat = (self.input - aet.shape_padaxis(self.mv_mean, 1)) / mv_std -# full_norm = self.gamma() * h_hat.swapaxes(0, -1) + self.beta() -# self.full_norm = full_norm.swapaxes(0, -1) - -# @property -# def mv_mean(self): -# """Returns the stored running mean""" -# return self._mv_mean - -# @property -# def mv_var(self): -# """Return the stored running variance""" -# return self._mv_var - -# @property -# def updates(self): -# """Returns a list of update tuple pairs""" -# return self._updates - -# @property -# def output(self): -# """Returns the output of this layer - -# Note: -# Returns the full normalized layer using the running mean if the input -# length is not equivalent to the batch size -# """ -# return aet.switch( -# aet.eq(self.input.shape[1], aet.constant(self.batch_size)), -# self.batch_norm, -# self.full_norm, -# ) - - -# class ResidualLayer: -# def __init__(self, layers: list): -# """Definition of the Residual layer block - -# Args: -# layers (list): a list of layers that defines the residual block - -# Example: - -# .. 
code-block:: python - -# res_layer = ResidualLayer(layers=[ -# DenseLayer(w_1, b_1, activation=relu), -# DenseLayer(w_2, b_2, activation=relu) -# ]) -# """ -# for layer in layers: -# if not isinstance(layer, Layer): -# raise TypeError(f"{layer} is not a Layer class instance") - -# self.layers = layers -# self.params = [] -# self._updates = [] - -# def apply(self, input): -# """Function to apply the input to the computational graph""" -# if isinstance(input, (list, tuple)): -# input = aet.stack(input) -# self.input = input - -# for n, layer in enumerate(self.layers): -# if n == 0: -# layer.apply(self.input) -# else: -# layer.apply(self.layers[n - 1].output) -# self.params.extend(layer.params) -# self._updates.extend(layer.updates) -# self._output = self.layers[-1].output + self.input - -# @property -# def updates(self): -# """Returns a list of update tuple pairs""" -# return self._updates - -# def output(self): -# """Returns the output of this layer""" -# return self._output diff --git a/pycmtensor/optimizers.py b/pycmtensor/optimizers.py index 87ef1ca..8a2d163 100644 --- a/pycmtensor/optimizers.py +++ b/pycmtensor/optimizers.py @@ -33,43 +33,43 @@ def __init__(self, name, epsilon=1e-8, **kwargs): Args: name (str): name of the optimizer + epsilon (float, optional): small value to avoid division by zero. + Defaults to `1e-8` """ self.name = name - self._epsilon = shared(epsilon, name="epsilion") + self._epsilon = shared(epsilon, name="epsilon") @property def epsilon(self): return self._epsilon def __repr__(self): + """Returns a string representation of the optimizer object. + + Returns: + str: A string representation of the optimizer object, including its name and parameters. + """ return f"{self.name}" - def update(self, cost, params, lr): + def update(self, **kwargs): """Update parameters for aesara function calls - Args: - cost (TensorVariable): a scalar element for the expression of the cost - function where the derivatives are calculated - params (list[TensorSharedVariable]): parameters of the model - lr (Union[float, TensorSharedVariable]): the learning rate - Returns: - (list): a list of `(param, param_new)` tuple pairs + None """ - pass + raise NotImplementedError("Subclasses must implement the `update` method.") class Adam(Optimizer): def __init__(self, params, b1=0.9, b2=0.999, **kwargs): - """An optimizer that implments the Adam algorithm[^1] + """An optimizer that implements the Adam algorithm[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - b1 (float, optional): exponential decay rate for the 1st moment estimates. - Defaults to `0.9` - b2 (float, optional): exponential decay rate for the 2nd moment estimates. - Defaults to `0.999` + params (list): A list of parameters. + b1 (float, optional): The value of the b1 parameter. Defaults to 0.9. + b2 (float, optional): The value of the b2 parameter. Defaults to 0.999. + **kwargs: Additional keyword arguments. Attributes: t (TensorSharedVariable): time step @@ -137,7 +137,18 @@ def update(self, cost, params, lr): class AdamW(Adam): def __init__(self, params, b1=0.9, b2=0.999, **kwargs): - """Adam with weight decay""" + """Initializes the AdamW class with the given parameters. + + Args: + params (list): A list of parameters. + b1 (float, optional): The value of the b1 parameter. Defaults to 0.9. + b2 (float, optional): The value of the b2 parameter. Defaults to 0.999. + **kwargs: Additional keyword arguments. + + Example: + params = [...] 
# list of parameters + adamw = AdamW(params, b1=0.9, b2=0.999) + """ super().__init__(params, b1, b2) self.w = config.adam_weight_decay @@ -179,11 +190,10 @@ def __init__(self, params, b1=0.99, b2=0.999, **kwargs): """An optimizer that implements the Nesterov Adam algorithm[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - b1 (float, optional): exponential decay rate for the 1st moment estimates. - Defaults to `0.9` - b2 (float, optional): exponential decay rate for the 2nd moment estimates. - Defaults to `0.999` + params (list): A list of parameters. + b1 (float, optional): The value of the b1 parameter. Defaults to 0.9. + b2 (float, optional): The value of the b2 parameter. Defaults to 0.999. + **kwargs: Additional keyword arguments. Attributes: t (TensorSharedVariable): time step @@ -236,11 +246,10 @@ def __init__(self, params, b1=0.9, b2=0.999, **kwargs): the Adam algorithm Args: - params (list[TensorSharedVariable]): parameters of the model - b1 (float, optional): exponential decay rate for the 1st moment estimates. - Defaults to `0.9` - b2 (float, optional): exponential decay rate for the 2nd moment estimates. - Defaults to `0.999` + params (list): A list of parameters. + b1 (float, optional): The value of the b1 parameter. Defaults to 0.9. + b2 (float, optional): The value of the b2 parameter. Defaults to 0.999. + **kwargs: Additional keyword arguments. Attributes: t (TensorSharedVariable): time step @@ -291,13 +300,12 @@ def __init__(self, params, rho=0.95, **kwargs): - The need for a manually selected global learning rate Args: - params (list[TensorSharedVariable]): parameters of the model - rho (float, optional): the decay rate for learning rate. - Defaults to `0.95` + params (list[TensorSharedVariable]): A list of shared variables representing the parameters of the model. + rho (float, optional): A float representing the decay rate for the learning rate. Defaults to 0.95. Attributes: - accumulator (list[TensorSharedVariable]): gradient accumulator - delta (list[TensorSharedVariable]): adaptive difference between gradients + accumulator (list[TensorSharedVariable]): A list of gradient accumulators. + delta (list[TensorSharedVariable]): A list of adaptive differences between gradients. [^1]: Zeiler, 2012. ADADELTA: An Adaptive Learning Rate Method. http://arxiv.org/abs/1212.5701 """ @@ -347,14 +355,14 @@ def __init__(self, params, inc=1.05, dec=0.5, bounds=[1e-6, 50.0], **kwargs): """An optimizer that implements the Rprop algorithm[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - inc (float, optional): increment step if same gradient direction - dec (float, optional): decrement step if different gradient direction - bounds (List[float]): min and maximum bounds for increment step + params (list[TensorSharedVariable]): A list of TensorSharedVariable objects representing the parameters of the model. + inc (float, optional): A float representing the increment step if the gradient direction is the same. + dec (float, optional): A float representing the decrement step if the gradient direction is different. + bounds (list[float]): A list of floats representing the minimum and maximum bounds for the increment step. Attributes: - factor (List[TensorVariable]): learning rate factor multiplier (init=1.) - ghat (List[TensorVariable]): previous step gradients + factor (list[TensorVariable]): A list of learning rate factor multipliers (init=1.0). + ghat (list[TensorVariable]): A list of previous step gradients. [^1]: Igel, C., & Hüsken, M. 
(2003). Empirical evaluation of the improved Rprop learning algorithms. Neurocomputing, 50, 105-123. """ @@ -397,14 +405,14 @@ def __init__(self, params, rho=0.9, **kwargs): """An optimizer that implements the RMSprop algorithm[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - rho (float, optional): discounting factor for the history/coming gradient. - Defaults to `0.9` + params (list[TensorSharedVariable]): Parameters of the model. + rho (float, optional): Discounting factor for the history/coming gradient. Defaults to 0.9. Attributes: - accumulator (TensorVariable): gradient accumulator + accumulator (TensorVariable): Gradient accumulator. - [^1]: Hinton, 2012. rmsprop: Divide the gradient by a running average of its recent magnitude. http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf + [^1]: Hinton, G. E. (2012). rmsprop: Divide the gradient by a running average of its recent magnitude. + Retrieved from http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf """ super().__init__(name="RMSProp") self.rho = shared(rho) @@ -436,15 +444,14 @@ def update(self, cost, params, lr): class Momentum(Optimizer): def __init__(self, params, mu=0.9, **kwargs): - """An optimizer that implements the Momentum algorithm[^1] + """Initializes the Momentum optimizer[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - mu (float, optional): acceleration factor in the relevant direction - and dampens oscillations. Defaults to `0.9` + params (list[TensorSharedVariable]): A list of parameters of the model. + mu (float, optional): The acceleration factor in the relevant direction and dampens oscillations. Defaults to `0.9`. Attributes: - velocity (list[TensorSharedVariable]): momentum velocity + velocity (list[TensorSharedVariable]): The momentum velocity. [^1]: Sutskever et al., 2013. On the importance of initialization and momentum in deep learning. http://jmlr.org/proceedings/papers/v28/sutskever13.pdf """ @@ -478,13 +485,12 @@ def __init__(self, params, mu=0.99, **kwargs): """An optimizer that implements the Nestrov Accelerated Gradient algorithm[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - mu (float, optional): acceleration factor in the relevant direction - and dampens oscillations. Defaults to `0.9` + params (list[TensorSharedVariable]): A list of parameters of the model. + mu (float, optional): The acceleration factor in the relevant direction. Defaults to `0.99`. Attributes: - t (TensorSharedVariable): momentum time step - velocity (list[TensorSharedVariable]): momentum velocity + t (TensorSharedVariable): The momentum time step. + velocity (list[TensorSharedVariable]): The momentum velocity. [^1]: Sutskever et al., 2013. On the importance of initialization and momentum in deep learning. http://jmlr.org/proceedings/papers/v28/sutskever13.pdf """ @@ -585,11 +591,11 @@ def update(self, cost, params, lr): class SQNBFGS(Optimizer): def __init__(self, params, config=None, **kwargs): - """A L-BFGS optimizer implementing the adaptive stochastic Quasi-Newton (SQN) based approach [^1] + """Initializes the SQNBFGS optimizer object[^1] Args: - params (list[TensorSharedVariable]): parameters of the model - config (pycmtensor.config): pycmtensor config object + params (list[TensorSharedVariable]): The parameters of the model. + config (pycmtensor.config): The pycmtensor config object. [^1]: Byrd, R. H., Hansen, S. L., Nocedal, J., & Singer, Y. (2016). 
A stochastic quasi-Newton method for large-scale optimization. SIAM Journal on Optimization, 26(2), 1008-1031. """ @@ -670,6 +676,17 @@ def update(self, cost, params, lr): def clip(param, min, max): + """ + Clips the value of a parameter within a specified range. + + Args: + param (float): The parameter value to be clipped. + min (float): The minimum value that the parameter can take. + max (float): The maximum value that the parameter can take. + + Returns: + float: The clipped value of the parameter. + """ if any([min, max]) and (config.beta_clipping): if min is None: min = -9999.0 diff --git a/pycmtensor/regularizers.py b/pycmtensor/regularizers.py index a6c5d46..9a6e1f2 100644 --- a/pycmtensor/regularizers.py +++ b/pycmtensor/regularizers.py @@ -9,20 +9,17 @@ __all__ = ["Regularizers"] -class Regularizers(object): - def __init__(self): - pass - +class Regularizers: @staticmethod def l1(params, weight=0.001): """compute the L1 norm of the tensors Args: - params (Union[list[TensorVariable], TensorVariable]): the parameters to compute the L1 norm - weight (float): value for penalizing the regularization term + params (Union[list[TensorVariable], TensorVariable]): The parameters to compute the L1 norm. + weight (float): The value for penalizing the regularization term. Default value is 0.001. Returns: - (TensorVariable): the L1 norm (sum of absolute values) + (TensorVariable): The L1 norm, which is the sum of absolute values of the tensors. """ if not isinstance(params, list): params = [params] @@ -31,14 +28,14 @@ def l1(params, weight=0.001): @staticmethod def l2(params, weight=0.0001): - """compute the L2 norm of the tensors + """Compute the L2 norm of the tensors Args: - params (Union[list[TensorVariable], TensorVariable]): the parameters to compute the L2 norm - weight (float): value for penalizing the regularization term + params (Union[list[TensorVariable], TensorVariable]): The parameters to compute the L2 norm. + weight (float): The value for penalizing the regularization term. Default value is 0.0001. Returns: - (TensorVariable): the L2 norm (sum of squared values) + (TensorVariable): The L2 norm, which is the sum of squared values of the tensors. """ if not isinstance(params, list): params = [params] diff --git a/pycmtensor/results.py b/pycmtensor/results.py index abe0499..f3d8706 100644 --- a/pycmtensor/results.py +++ b/pycmtensor/results.py @@ -52,6 +52,7 @@ def __init__(self): self.null_loglikelihood = -np.inf self.best_loglikelihood = -np.inf self.best_valid_error = 1.0 + self.best_train_error = 1.0 self.best_epoch = 0 self.gnorm = None @@ -127,18 +128,19 @@ def model_statistics(self): (pandas.DataFrame): Summary of the model statistics """ stats = pd.DataFrame(columns=["value"]).astype("object") - stats.loc["Number of training samples used"] = int(self.n_train) - stats.loc["Number of validation samples used"] = int(self.n_valid) - stats.loc["Number of estimated parameters in the model"] = int(self.n_params) - stats.loc["Null. 
log likelihood"] = self.null_loglikelihood - stats.loc["Final log likelihood"] = self.best_loglikelihood - stats.loc["Accuracy"] = f"{100*(1-self.best_valid_error):.2f}%" - stats.loc["Likelihood ratio test"] = self.loglikelihood_ratio_test() - stats.loc["Rho square"] = self.rho_square() - stats.loc["Rho square bar"] = self.rho_square_bar() - stats.loc["Akaike Information Criterion"] = self.AIC() - stats.loc["Bayesian Information Criterion"] = self.BIC() - stats.loc["Final gradient norm"] = f"{self.gnorm:.5e}" + stats.loc["Number of training samples used"] = f"{self.n_train:d}" + stats.loc["Number of validation samples used"] = f"{self.n_valid:d}" + stats.loc["Number of estimated parameters in the model"] = f"{self.n_params:d}" + stats.loc["Null. log likelihood"] = f"{self.null_loglikelihood:.2f}" + stats.loc["Final log likelihood"] = f"{self.best_loglikelihood:.2f}" + stats.loc["Validation Accuracy"] = f"{100*(1-self.best_valid_error):.2f}%" + stats.loc["Training Accuracy"] = f"{100*(1-self.best_train_error):.2f}%" + stats.loc["Likelihood ratio test"] = f"{self.loglikelihood_ratio_test():.2f}" + stats.loc["Rho square"] = f"{self.rho_square():.3f}" + stats.loc["Rho square bar"] = f"{self.rho_square_bar():.3f}" + stats.loc["Akaike Information Criterion"] = f"{self.AIC():.2f}" + stats.loc["Bayesian Information Criterion"] = f"{self.BIC():.2f}" + stats.loc["Final gradient norm"] = f"{self.gnorm:.3e}" return stats def beta_statistics(self): @@ -162,7 +164,7 @@ def beta_statistics(self): index=self.betas, data=[np.percentile(value, 50) for _, value in self.betas.items()], columns=["value"], - ) + ).round(3) stats["std err"] = stderror(h, self.betas) stats["t-test"] = t_test(stats["std err"], self.betas) @@ -171,6 +173,7 @@ def beta_statistics(self): stats["rob. std err"] = rob_stderror(h, bh, self.betas) stats["rob. t-test"] = t_test(stats["rob. std err"], self.betas) stats["rob. p-value"] = p_value(stats["rob. std err"], self.betas) + stats = stats.round(3) for key, value in self.betas.items(): if value.shape != (): @@ -196,7 +199,7 @@ def model_correlation_matrix(self): data=correlation_matrix(h), ) - return mat + return mat.round(3) def model_robust_correlation_matrix(self): """Robust correlation matrix calculated from the hessian and bhhh @@ -220,7 +223,7 @@ def model_robust_correlation_matrix(self): data=rob_correlation_matrix(h, bh), ) - return mat + return mat.round(3) def show_training_plot(self, sample_intervals=1): """Displays the statistics graph as a line plot diff --git a/pycmtensor/scheduler.py b/pycmtensor/scheduler.py index a88aca1..3f08fbb 100644 --- a/pycmtensor/scheduler.py +++ b/pycmtensor/scheduler.py @@ -1,7 +1,25 @@ # scheduler.py """PyCMTensor scheduler module -This module contains the implementation of the learning rate scheduler. By default, a constant learning rate is used. +The code snippet defines a base class called `Scheduler` for learning rate schedulers. It also includes three subclasses: `ConstantLR`, `StepLR`, and `PolynomialLR`, which implement specific learning rate scheduling strategies. 
+ +Example Usage: +- Creating a `Scheduler` object: + scheduler = Scheduler(lr=0.01) +- Getting the learning rate for a specific epoch: + lr = scheduler(epoch=5) +- Creating a `ConstantLR` object: + constant_lr = ConstantLR(lr=0.01) +- Getting the learning rate for a specific epoch: + lr = constant_lr(epoch=10) +- Creating a `StepLR` object: + step_lr = StepLR(lr=0.01, factor=0.95, drop_every=5) +- Getting the learning rate for a specific epoch: + lr = step_lr(epoch=15) +- Creating a `PolynomialLR` object: + poly_lr = PolynomialLR(lr=0.01, max_epochs=20, power=0.5) +- Getting the learning rate for a specific epoch: + lr = poly_lr(epoch=8) """ import numpy as np @@ -19,21 +37,34 @@ class Scheduler: def __init__(self, lr): - """Base class for learning rate scheduler + """Initializes the Scheduler object with a base learning rate. Args: - lr (float): the base learning rate + lr (float): The base learning rate. + Attributes: - history (list): (iteration #, lr) tuples + name (str): Name of the scheduler. + _base_lr (float): Base learning rate. + _history (list): List to store the learning rate history. """ self.name = "Scheduler" self._base_lr = lr self._history = [] def __str__(self): + """Returns a string representation of the Scheduler object. + + Returns: + str: String representation of the Scheduler object. + """ return f"{self.name}" def __repr__(self): + """Returns a string representation of the Scheduler object with its attributes. + + Returns: + str: String representation of the Scheduler object with its attributes. + """ msg = f"{self.name}(" attrs = [d for d in dir(self) if not d.startswith("_")] for a in attrs: @@ -45,11 +76,24 @@ def __repr__(self): return msg[:-2] + ")" def __call__(self, epoch): + """Records the learning rate and returns the current learning rate for a specific epoch. + + Args: + epoch (int): The epoch number. + + Returns: + float: The current learning rate. + """ self.record(self.lr) return self.lr @property def lr(self): + """Property that returns the base learning rate. + + Returns: + float: The base learning rate. + """ return self._base_lr @property # alias for lr @@ -58,16 +102,21 @@ def learning_rate(self): @property def history(self): + """Property that returns the learning rate history. + + Returns: + list: The learning rate history. + """ return self._history def record(self, lr): - """Saves the history of the learning rate and returns the current learning rate + """Saves the history of the learning rate and returns the current learning rate. Args: - lr (float): the learning rate + lr (float): The learning rate. Returns: - (float): the current learning rate + float: The current learning rate. """ self.history.append(lr) return lr @@ -75,11 +124,10 @@ def record(self, lr): class ConstantLR(Scheduler): def __init__(self, lr=0.01, **kwargs): - """Base class for constant learning rate scheduler + """Subclass of Scheduler for constant learning rate scheduler. 
         Args:
             lr (float): initial learning rate
-            **kwargs (dict): overloaded keyword arguments
         """
         super().__init__(lr)
         self.name = "ConstantLR"
@@ -93,7 +141,6 @@ def __init__(self, lr=0.01, factor=0.95, drop_every=10, **kwargs):
             lr (float): initial learning rate
             factor (float): percentage reduction to the learning rate
             drop_every (int): step down the learning rate after every n steps
-            **kwargs (dict): overloaded keyword arguments
         """
         super().__init__(lr)
         self.name = "StepLR"
@@ -120,13 +167,12 @@ def __call__(self, epoch):
 
 class PolynomialLR(Scheduler):
     def __init__(self, max_epochs, lr=0.01, power=1.0, **kwargs):
-        """Base class for polynomial decay learning rate scheduler
+        """Subclass of Scheduler for polynomial decay learning rate scheduling.
 
         Args:
             lr (float): initial learning rate value
             max_epochs (int): the max number of training epochs
             power (float): the exponential factor to decay
-            **kwargs (dict): overloaded keyword arguments
         """
         super().__init__(lr)
         self.name = "PolynomialLR"
@@ -153,6 +199,17 @@ def __call__(self, epoch):
 
 class CyclicLR(Scheduler):
     def __init__(self, lr=0.01, max_lr=0.1, cycle_steps=16, scale_fn=None, **kwargs):
+        """Subclass of Scheduler for cyclic learning rate scheduling.
+
+        Args:
+            lr (float, optional): Initial learning rate value.
+            max_lr (float): Peak learning rate value.
+            cycle_steps (int): The number of steps to complete a cycle.
+            scale_fn (function): Scaling function for the learning rate.
+        """
         super().__init__(lr)
         self.name = "CyclicLR"
         self._max_lr = max_lr
@@ -183,17 +240,18 @@ def scale_fn(self, k):
 
 class Triangular2CLR(CyclicLR):
     def __init__(self, lr=0.01, max_lr=0.1, cycle_steps=16, **kwargs):
-        """Base class for Triangular Cyclic LR scheduler. The scaling of the Triangular Cyclic function is:
+        """Subclass of CyclicLR for the Triangular Cyclic LR schedule.
+
+        The scaling of the Triangular Cyclic function is:
 
         $$
         scale = \\frac{1}{2^{step-1}}
         $$
 
         Args:
-            lr (float): initial learning rate value
-            max_lr (float): peak learning rate value
-            cycle_steps (int): the number of steps to complete a cycle
-            **kwargs (dict): overloaded keyword arguments
+            lr (float, optional): Initial learning rate value.
+            max_lr (float): Peak learning rate value.
+            cycle_steps (int): The number of steps to complete a cycle.
         """
         super().__init__(lr, max_lr, cycle_steps, scale_fn=self.scale_fn)
         self.name = "Triangular2CLR"
@@ -204,18 +262,19 @@ def scale_fn(self, k):
 
 class ExpRangeCLR(CyclicLR):
     def __init__(self, lr=0.01, max_lr=0.1, cycle_steps=16, gamma=0.5, **kwargs):
-        """Base class for exponential range Cyclic LR scheduler. The scaling is:
+        """Subclass of CyclicLR for the exponential range Cyclic LR schedule.
+
+        The scaling is:
 
         $$
         scale = \\gamma^{step}
         $$
 
         Args:
-            lr (float): initial learning rate value
-            max_lr (float): peak learning rate value
-            cycle_steps (int): the number of steps to complete a cycle
-            gamma (float): exponential parameter. Default=0.5
-            **kwargs (dict): overloaded keyword arguments
+            lr (float, optional): Initial learning rate value.
+            max_lr (float): Peak learning rate value.
+            cycle_steps (int): The number of steps to complete a cycle.
+            gamma (float): Exponential parameter. Defaults to 0.5.
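+
+        Example:
+            A minimal usage sketch (illustrative; the value returned for a given epoch
+            depends on the cyclic schedule described above):
+            ```python
+            exp_clr = ExpRangeCLR(lr=0.01, max_lr=0.1, cycle_steps=16, gamma=0.5)
+            lr = exp_clr(epoch=8)  # learning rate for epoch 8 of the cycle
+            ```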
""" super().__init__(lr, max_lr, cycle_steps, scale_fn=self.scale_fn) self.name = "ExpRangeCLR" diff --git a/pycmtensor/statistics.py b/pycmtensor/statistics.py index e03b908..e0eb989 100644 --- a/pycmtensor/statistics.py +++ b/pycmtensor/statistics.py @@ -1,7 +1,16 @@ -# statistics.py """PyCMTensor statistics module This module contains methods for calculating the statistics of the estimated parameters. + +Functions: +- variance_covariance(hessian): Computes the variance covariance matrix given the Hessian. +- rob_variance_covariance(hessian, bhhh): Computes the robust variance covariance matrix given the Hessian and BHHH matrices. +- t_test(stderr, params): Computes the statistical t-test of the estimated parameters and the standard errors. +- p_value(stderr, params): Computes the p-value (statistical significance) of the estimated parameters using the two-tailed normal distribution. +- stderror(hessian, params): Calculates the standard error of the estimated parameters given the Hessian matrix. +- rob_stderror(hessian, bhhh, params): Calculates the robust standard error of the estimated parameters given the Hessian and BHHH matrices. +- correlation_matrix(hessian): Computes the correlation matrix from the Hessian matrix. +- rob_correlation_matrix(hessian, bhhh): Computes the robust correlation matrix from the Hessian and BHHH matrices. """ import numpy as np from scipy import stats diff --git a/pycmtensor/utils.py b/pycmtensor/utils.py index bcabcc3..be20978 100644 --- a/pycmtensor/utils.py +++ b/pycmtensor/utils.py @@ -1,17 +1,49 @@ -# utils.py -"""PyCMTensor utils module""" +"""PyCMTensor utils module + +This module provides utility functions for formatting time and numbers. + +Functions: +- time_format(seconds): Converts a number of seconds into a formatted string representing the time in hours, minutes, and seconds. +- human_format(number): Converts a number into a human-readable format with a magnitude suffix. + +Example Usage: +```python +time_str = time_format(3661) +print(time_str) # Output: '01:01:01' + +number_str = human_format(1234567890) +print(number_str) # Output: '1.23B' +``` +""" + +suffixes = ["", "K", "M", "B", "T", "P"] def time_format(seconds): + """ + Converts a number of seconds into a formatted string representing the time in hours, minutes, and seconds. + + Args: + seconds (int or float): The number of seconds. + + Returns: + str: The formatted time string in the format 'HH:MM:SS'. + """ minutes, seconds = divmod(round(seconds), 60) - if minutes >= 60: - hours, minutes = divmod(minutes, 60) - else: - hours = 0 + hours, minutes = divmod(minutes, 60) return f"{hours:02d}:{minutes:02d}:{seconds:02d}" def human_format(number): + """ + Converts a number into a human-readable format with a magnitude suffix. + + Args: + number (int or float): The number to be converted. + + Returns: + str: The formatted number string with a magnitude suffix. 
+ """ number = float(f"{number:.3g}") magnitude = 0 while abs(number) >= 1000: diff --git a/tests/test_dataset.py b/tests/test_dataset.py index c105899..19e6c43 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -36,8 +36,8 @@ def test_split_frac_one(): len_dataset = len(ds.ds["travel_mode"]) assert ds.split_frac == 1 - assert len(ds.train_index) == len_dataset - assert len(ds.valid_index) == len_dataset + assert len(ds.idx_train) == len_dataset + assert len(ds.idx_valid) == len_dataset def test_dataset_property(lpmc_ds): @@ -51,8 +51,8 @@ def test_dataset_property(lpmc_ds): assert lpmc_ds.n_train == len_train_dataset assert lpmc_ds.n_valid == len_valid_dataset - assert all(lpmc_ds.train_index == lpmc_ds.index[:len_train_dataset]) - assert all(lpmc_ds.valid_index == lpmc_ds.index[len_train_dataset:]) + assert all(lpmc_ds.idx_train == lpmc_ds.index[:len_train_dataset]) + assert all(lpmc_ds.idx_valid == lpmc_ds.index[len_train_dataset:]) assert ds == lpmc_ds()