Merge pull request #78 from mwong009/1.5.0-beta
1.5.0 beta
mwong009 authored Jul 26, 2023
2 parents 9f8027f + ee22300 commit 071d1f5
Showing 12 changed files with 735 additions and 400 deletions.
3 changes: 0 additions & 3 deletions docs/developer_guide/api.md

This file was deleted.

11 changes: 6 additions & 5 deletions docs/developer_guide/api/config.md
@@ -1,13 +1,14 @@
# config.py
# defaultconfig.py

---

See [configuration](../../user_guide/configuration.md) for a list of available configuration settings.

::: pycmtensor.config
::: pycmtensor.defaultconfig
handler: python
options:
show_source: false
options:
show_root_heading: false
show_root_toc_entry: false
heading_level: 2
members:
- Config
- Config
9 changes: 9 additions & 0 deletions docs/developer_guide/api/optimizers.md
@@ -0,0 +1,9 @@
# optimizers.py

---

::: pycmtensor.optimizers
handler: python
options:
heading_level: 2
members_order: source
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -22,6 +22,7 @@ nav:
- models:
- basic: developer_guide/api/models/basic.md
- MNL: developer_guide/api/models/MNL.md
- optimizers: developer_guide/api/optimizers.md
- About:
- Contributing: about/contributing.md
- Release notes: about/release_notes.md
@@ -80,6 +81,7 @@ markdown_extensions:
guess_lang: False
linenums: False
use_pygments: False
- footnotes
- pymdownx.arithmatex:
generic: True
- pymdownx.b64
186 changes: 129 additions & 57 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pycmtensor/__init__.py
@@ -10,7 +10,7 @@

import pycmtensor.optimizers
import pycmtensor.scheduler
from pycmtensor.config import config
from pycmtensor.defaultconfig import config

# aesara configs

147 changes: 119 additions & 28 deletions pycmtensor/dataset.py
@@ -1,7 +1,10 @@
# dataset.py
# converts pandas dataframe into an xarray dataset

from typing import Union

import aesara.tensor as aet
from aesara.tensor.var import TensorVariable

from pycmtensor import config

@@ -14,24 +17,43 @@ class Dataset:
def __init__(self, df, choice):
"""Base PyCMTensor Dataset class object
This class stores the data in an array format, and a symbolic tensor reference
variable object. To call the tensor variable, we invoke the label of the
variable as an item in the Dataset class, like so:
```python
ds = Dataset(df=df, choice="choice")
return ds["label_of_variable"]
```
To call the data array, we use the `train_dataset()` or `valid_dataset()`
method. See method reference for info about the arguments. For example:
```python
# to get the data array for variable "time"
arr = ds.train_dataset(ds["time"])
```
Args:
df (pandas.DataFrame): the pandas dataframe object to load
choice (str): the name of the choice variable
Attributes:
n (int): total number of rows in the dataset
x (list[TensorVariable]): a list of (input) `TensorVariable` objects to
build the tensor expression from
x (list[TensorVariable]): the full list of (input) `TensorVariable` objects
to build the tensor expression from
y (TensorVariable): the output (choice) `TensorVariable` object
scale (dict): a dictionary of `float` values to store the scaling factor used for each variable
scale (dict): a dictionary of `float` values to store the scaling factor
used for each variable
choice (str): the name of the choice variable
ds (dict): a dictionary of `np.ndarray` to store the values of each variable
ds (dict): a dictionary of `numpy.ndarray` to store the values of each
variable
split_frac (float): the factor used to split the dataset into training and
validation datasets
train_index (list):
valid_index (list):
n_train (int):
n_valid (int):
train_index (list): the list of values of the indices of the training
dataset
valid_index (list): the list of values of the indices of the validation
dataset
n_train (int): the size of the training dataset
n_valid (int): the size of the validation dataset
Example:
Example initialization of a pandas dataset:
@@ -85,13 +107,24 @@ def __call__(self):
return self.ds

def __getitem__(self, key):
if key in [var.name for var in self.x]:
i = [x.name for x in self.x].index(key)
return self.x[i]
if key == self.y.name:
return self.y
if isinstance(key, (list, tuple)):
return self._make_tensor(key)
else:
if key in [var.name for var in self.x]:
i = [x.name for x in self.x].index(key)
return self.x[i]
if key == self.y.name:
return self.y
else:
raise KeyError

def _make_tensor(self, keys):
# if tensor inputs are list of strings, convert them to tensors
if all(isinstance(k, str) for k in keys):
keys = [self[k] for k in keys]
else:
raise KeyError
raise TypeError(f"Multiple types found in {keys}.")
return aet.as_tensor_variable(keys)

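For orientation, a minimal sketch (not part of the diff) of how the new list/tuple indexing added in `__getitem__` and `_make_tensor` is expected to behave; the column names `age`, `location`, and `choice` are assumptions for illustration:

```python
import pandas as pd
from pycmtensor.dataset import Dataset

# hypothetical toy dataframe; a real project would load a survey dataset here
df = pd.DataFrame(
    {"age": [25, 40, 31], "location": [1, 2, 1], "choice": [0, 1, 0]}
)
ds = Dataset(df=df, choice="choice")

age = ds["age"]                 # single TensorVariable, unchanged behaviour
pair = ds[["age", "location"]]  # new: string keys are converted to tensors and
                                # combined via aet.as_tensor_variable
```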
@property
def n_train(self) -> int:
@@ -115,7 +148,7 @@ def drop(self, variables):
"""Method for dropping `variables` from the dataset
Args:
variables (list): list of variables from the dataset to drop
variables (list[str]): list of `str` variables from the dataset to drop
Raises:
KeyError: raises an error if any item in `variables` is not found in the dataset or item is the choice variable
@@ -128,13 +161,14 @@ def drop(self, variables):
i = [x.name for x in self.x].index(variable)
del self.x[i]
del self.scale[variable]
del self.ds[variable]
debug(f"Dropped input variable '{variable}' from dataset")

else:
raise KeyError

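As a hedged illustration of the updated `drop` (which now also deletes the stored data array from `ds`), assuming a `location` column exists:

```python
# removes the input tensor, its scale entry and its data array;
# raises KeyError for unknown names or for the choice variable
ds.drop(["location"])
```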
def scale_variable(self, variable, factor):
"""Multiply values of the `variable` by factor 1/factor.
"""Multiply values of the `variable` by $1/\\textrm{factor}$.
Args:
variable (str): the name of the variable or a list of variable names
Expand All @@ -144,24 +178,36 @@ def scale_variable(self, variable, factor):
self.scale[variable] = self.scale[variable] * factor

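A short assumed usage of `scale_variable`, following the docstring above; whether the division is applied when arrays are retrieved is an assumption here, since only the bookkeeping line is shown in the diff:

```python
# record a scaling factor of 100 for "age", i.e. values are treated as age/100
ds.scale_variable("age", 100.0)
print(ds.scale["age"])  # the accumulated factor for this variable
```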
def split(self, frac):
"""TODO"""
n = round(self.n * frac)
"""Method to split dataset into training and validation subsets
Args:
frac (float): the fraction to split the dataset into the training set. The training set will be indexed from `0` to `frac` $\\times$ `Dataset.n`. The validation dataset will be from the last index of the training set to the last row of the dataset.
Note:
The actual splitting of the dataset is done during the training procedure,
or when invoking the `train_dataset()` or `valid_dataset()` methods
"""

self.split_frac = frac
info(f"n_train_samples:{self.n_train} n_valid_samples:{self.n_valid}")

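A minimal sketch of the intended call, per the new docstring (the actual partitioning is deferred until the data arrays are requested):

```python
# reserve the first 80% of rows for training and the remainder for validation
ds.split(0.8)
print(ds.n_train, ds.n_valid)  # sizes derived from split_frac
```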
def _dataset_slice(self, tensors, index, batch_size, shift, n_index):
"""Internal method call for self.train_dataset or self.valid_dataset
"""Internal method call for self.train_dataset or self.valid_dataset"""

Args:
tensors (TensorVariable): tensor or list of tensors
index (int):
batch_size (int):
shift (int):
n_index (list): list of index values of the [train|valid] dataset
"""
if not isinstance(tensors, list):
tensors = [tensors]

# check if all tensors are of the same type
if all(isinstance(t, TensorVariable) for t in tensors):
pass
# if tensor inputs are list of strings, convert them to tensors
elif all(isinstance(t, str) for t in tensors):
tensors = [self[t] for t in tensors]
else:
raise TypeError(f"Multiple types found in {tensors}.")

# retrieve tensor names
tensor_names = [t.name for t in tensors]
for name in tensor_names:
if name not in list(self.ds):
@@ -183,12 +229,57 @@ def _dataset_slice(self, tensors, index, batch_size, shift, n_index):
return _ds

def train_dataset(self, variables, index=None, batch_size=None, shift=None):
"""Return a slice of the training dataset with the sequence matching the list of variables"""
"""Returns a slice of the (or the full) training data array with the sequence
matching the list of variables.
Args:
variables (Union[list, str, TensorVariable]): a tensor, label, or list of
tensors or list of labels
index (int): the start of the slice of the data array. If `None` is given,
returns the full data array.
batch_size (int): length of the slice. If `None` is given, returns the
slice from `index` to `N` where `N` is the length of the array.
shift (int): the offset of the slice between `0` and `batch_size`. If
`None` is given, `shift=0`.
Returns:
(list): a list of array object(s) corresponding to the input variables
!!! Example
How to retrieve data array from Dataset:
```python
ds = Dataset(df, choice="choice")
# index "age" and "location" data arrays
return ds.train_dataset([ds["age"], ds["location"]])
# similar result
return ds.train_dataset(["age", "location"])
```
"""

n_index = self.train_index

return self._dataset_slice(variables, index, batch_size, shift, n_index)

def valid_dataset(self, variables, index=None, batch_size=None, shift=None):
"""Return a slice of the valid dataset with the sequence matching the list of variables"""
"""Returns a slice of the (or the full) validation data array with the sequence
matching the list of variables.
Args:
variables (Union[list, str, TensorVariable]): a tensor, label, or list of
tensors or list of labels
index (int): the start of the slice of the data array. If `None` is given,
returns the full data array.
batch_size (int): length of the slice. If `None` is given, returns the
slice from `index` to `N` where `N` is the length of the array.
shift (int): the offset of the slice between `0` and `batch_size`. If
`None` is given, `shift=0`.
Returns:
(list): a list of array object(s) corresponding to the input variables
"""

n_index = self.valid_index

return self._dataset_slice(variables, index, batch_size, shift, n_index)
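Putting the retrieval methods together, a hedged end-to-end sketch; the column names and the mini-batch call are assumptions based on the docstrings above:

```python
ds.split(0.8)

# full training and validation arrays, ordered to match the given labels
x_train = ds.train_dataset(["age", "location"])
x_valid = ds.valid_dataset(["age", "location"])

# an assumed slice of 32 rows of the training split, starting at row 0
x_batch = ds.train_dataset(["age", "location"], index=0, batch_size=32)
```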
2 changes: 1 addition & 1 deletion pycmtensor/config.py → pycmtensor/defaultconfig.py
@@ -1,4 +1,4 @@
# config.py
# defaultconfig.py
"""PyCMTensor config module"""
import configparser
import multiprocessing
