Merge pull request #78 from mwong009/1.5.0-beta
1.5.0 beta
mwong009 authored Jul 26, 2023
2 parents 9f8027f + ee22300 commit 071d1f5
Showing 12 changed files with 735 additions and 400 deletions.
3 changes: 0 additions & 3 deletions docs/developer_guide/api.md

This file was deleted.

11 changes: 6 additions & 5 deletions docs/developer_guide/api/config.md
@@ -1,13 +1,14 @@
# config.py
# defaultconfig.py

---

See [configuration](../../user_guide/configuration.md) for a list of available configuration settings.

::: pycmtensor.config
::: pycmtensor.defaultconfig
handler: python
options:
show_source: false
options:
show_root_heading: false
show_root_toc_entry: false
heading_level: 2
members:
- Config
- Config
9 changes: 9 additions & 0 deletions docs/developer_guide/api/optimizers.md
@@ -0,0 +1,9 @@
# optimizers.py

---

::: pycmtensor.optimizers
handler: python
options:
heading_level: 2
members_order: source
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -22,6 +22,7 @@ nav:
- models:
- basic: developer_guide/api/models/basic.md
- MNL: developer_guide/api/models/MNL.md
- optimizers: developer_guide/api/optimizers.md
- About:
- Contributing: about/contributing.md
- Release notes: about/release_notes.md
@@ -80,6 +81,7 @@ markdown_extensions:
guess_lang: False
linenums: False
use_pygments: False
- footnotes
- pymdownx.arithmatex:
generic: True
- pymdownx.b64
186 changes: 129 additions & 57 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pycmtensor/__init__.py
@@ -10,7 +10,7 @@

import pycmtensor.optimizers
import pycmtensor.scheduler
from pycmtensor.config import config
from pycmtensor.defaultconfig import config

# aesara configs

147 changes: 119 additions & 28 deletions pycmtensor/dataset.py
@@ -1,7 +1,10 @@
# dataset.py
# converts pandas dataframe into an xarray dataset

from typing import Union

import aesara.tensor as aet
from aesara.tensor.var import TensorVariable

from pycmtensor import config

@@ -14,24 +17,43 @@ class Dataset:
def __init__(self, df, choice):
"""Base PyCMTensor Dataset class object
This class stores the data in an array format, and a symbolic tensor reference
variable object. To call the tensor variable, we invoke the label of the
variable as an item in the Dataset class, like so:
```python
ds = Dataset(df=df, choice="choice")
return ds["label_of_variable"]
```
To call the data array, we use the `train_dataset()` or `valid_dataset()`
method. See method reference for info about the arguments. For example:
```python
# to get the data array for variable "time"
arr = ds.train_dataset(ds["time"])
```
Args:
df (pandas.DataFrame): the pandas dataframe object to load
choice (str): the name of the choice variable
Attributes:
n (int): total number of rows in the dataset
x (list[TensorVariable]): a list of (input) `TensorVariable` objects to
build the tensor expression from
x (list[TensorVariable]): the full list of (input) `TensorVariable` objects
to build the tensor expression from
y (TensorVariable): the output (choice) `TensorVariable` object
scale (dict): a dictionary of `float` values to store the scaling factor used for each variable
scale (dict): a dictionary of `float` values to store the scaling factor
used for each variable
choice (str): the name of the choice variable
ds (dict): a dictionary of `np.ndarray` to store the values of each variable
ds (dict): a dictionary of `numpy.ndarray` to store the values of each
variable
split_frac (float): the factor used to split the dataset into training and
validation datasets
train_index (list):
valid_index (list):
n_train (int):
n_valid (int):
train_index (list): the list of values of the indices of the training
dataset
valid_index (list): the list of values of the indices of the validation
dataset
n_train (int): the size of the training dataset
n_valid (int): the size of the validation dataset
Example:
Example initialization of a pandas dataset:
@@ -85,13 +107,24 @@ def __call__(self):
return self.ds

def __getitem__(self, key):
if key in [var.name for var in self.x]:
i = [x.name for x in self.x].index(key)
return self.x[i]
if key == self.y.name:
return self.y
if isinstance(key, (list, tuple)):
return self._make_tensor(key)
else:
if key in [var.name for var in self.x]:
i = [x.name for x in self.x].index(key)
return self.x[i]
if key == self.y.name:
return self.y
else:
raise KeyError

def _make_tensor(self, keys):
# if tensor inputs are list of strings, convert them to tensors
if all(isinstance(k, str) for k in keys):
keys = [self[k] for k in keys]
else:
raise KeyError
raise TypeError(f"Multiple types found in {keys}.")
return aet.as_tensor_variable(keys)

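For orientation, a minimal sketch (not part of the diff) of how the new list/tuple indexing added in `__getitem__` and `_make_tensor` is expected to behave; the column names `age`, `location`, and `choice` are assumptions for illustration:

```python
import pandas as pd
from pycmtensor.dataset import Dataset

# hypothetical toy dataframe; a real project would load a survey dataset here
df = pd.DataFrame(
    {"age": [25, 40, 31], "location": [1, 2, 1], "choice": [0, 1, 0]}
)
ds = Dataset(df=df, choice="choice")

age = ds["age"]                 # single TensorVariable, unchanged behaviour
pair = ds[["age", "location"]]  # new: string keys are converted to tensors and
                                # combined via aet.as_tensor_variable
```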
@property
def n_train(self) -> int:
@@ -115,7 +148,7 @@ def drop(self, variables):
"""Method for dropping `variables` from the dataset
Args:
variables (list): list of variables from the dataset to drop
variables (list[str]): list of `str` variables from the dataset to drop
Raises:
KeyError: raises an error if any item in `variables` is not found in the dataset or item is the choice variable
@@ -128,13 +161,14 @@ def drop(self, variables):
i = [x.name for x in self.x].index(variable)
del self.x[i]
del self.scale[variable]
del self.ds[variable]
debug(f"Dropped input variable '{variable}' from dataset")

else:
raise KeyError

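As a hedged illustration of the updated `drop` (which now also deletes the stored data array from `ds`), assuming a `location` column exists:

```python
# removes the input tensor, its scale entry and its data array;
# raises KeyError for unknown names or for the choice variable
ds.drop(["location"])
```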
def scale_variable(self, variable, factor):
"""Multiply values of the `variable` by factor 1/factor.
"""Multiply values of the `variable` by $1/\\textrm{factor}$.
Args:
variable (str): the name of the variable or a list of variable names
Expand All @@ -144,24 +178,36 @@ def scale_variable(self, variable, factor):
self.scale[variable] = self.scale[variable] * factor

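A short assumed usage of `scale_variable`, following the docstring above; whether the division is applied when arrays are retrieved is an assumption here, since only the bookkeeping line is shown in the diff:

```python
# record a scaling factor of 100 for "age", i.e. values are treated as age/100
ds.scale_variable("age", 100.0)
print(ds.scale["age"])  # the accumulated factor for this variable
```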
def split(self, frac):
"""TODO"""
n = round(self.n * frac)
"""Method to split dataset into training and validation subsets
Args:
frac (float): the fraction to split the dataset into the training set. The training set will be indexed from `0` to `frac` $\\times$ `Dataset.n`. The validation dataset will be from the last index of the training set to the last row of the dataset.
Note:
The actual splitting of the dataset is done during the training procedure,
or when invoking the `train_dataset()` or `valid_dataset()` methods
"""

self.split_frac = frac
info(f"n_train_samples:{self.n_train} n_valid_samples:{self.n_valid}")

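A minimal sketch of the intended call, per the new docstring (the actual partitioning is deferred until the data arrays are requested):

```python
# reserve the first 80% of rows for training and the remainder for validation
ds.split(0.8)
print(ds.n_train, ds.n_valid)  # sizes derived from split_frac
```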
def _dataset_slice(self, tensors, index, batch_size, shift, n_index):
"""Internal method call for self.train_dataset or self.valid_dataset
"""Internal method call for self.train_dataset or self.valid_dataset"""

Args:
tensors (TensorVariable): tensor or list of tensors
index (int):
batch_size (int):
shift (int):
n_index (list): list of index values of the [train|valid] dataset
"""
if not isinstance(tensors, list):
tensors = [tensors]

# check if all tensors are of the same type
if all(isinstance(t, TensorVariable) for t in tensors):
pass
# if tensor inputs are list of strings, convert them to tensors
elif all(isinstance(t, str) for t in tensors):
tensors = [self[t] for t in tensors]
else:
raise TypeError(f"Multiple types found in {tensors}.")

# retrieve tensor names
tensor_names = [t.name for t in tensors]
for name in tensor_names:
if name not in list(self.ds):
@@ -183,12 +229,57 @@ def _dataset_slice(self, tensors, index, batch_size, shift, n_index):
return _ds

def train_dataset(self, variables, index=None, batch_size=None, shift=None):
"""Return a slice of the training dataset with the sequence matching the list of variables"""
"""Returns a slice of the (or the full) training data array with the sequence
matching the list of variables.
Args:
variables (Union[list, str, TensorVariable]): a tensor, label, or list of
tensors or list of labels
index (int): the start of the slice of the data array. If `None` is given,
returns the full data array.
batch_size (int): length of the slice. If `None` is given, returns the
slice from `index` to `N` where `N` is the length of the array.
shift (int): the offset of the slice between `0` and `batch_size`. If
`None` is given, `shift=0`.
Returns:
(list): a list of array object(s) corresponding to the input variables
!!! Example
How to retrieve data array from Dataset:
```python
ds = Dataset(df, choice="choice")
# index "age" and "location" data arrays
return ds.train_dataset([ds["age"], ds["location"]])
# similar result
return ds.train_dataset(["age", "location"])
```
"""

n_index = self.train_index

return self._dataset_slice(variables, index, batch_size, shift, n_index)

def valid_dataset(self, variables, index=None, batch_size=None, shift=None):
"""Return a slice of the valid dataset with the sequence matching the list of variables"""
"""Returns a slice of the (or the full) validation data array with the sequence
matching the list of variables.
Args:
variables (Union[list, str, TensorVariable]): a tensor, label, or list of
tensors or list of labels
index (int): the start of the slice of the data array. If `None` is given,
returns the full data array.
batch_size (int): length of the slice. If `None` is given, returns the
slice from `index` to `N` where `N` is the length of the array.
shift (int): the offset of the slice between `0` and `batch_size`. If
`None` is given, `shift=0`.
Returns:
(list): a list of array object(s) corresponding to the input variables
"""

n_index = self.valid_index

return self._dataset_slice(variables, index, batch_size, shift, n_index)
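Putting the retrieval methods together, a hedged end-to-end sketch; the column names and the mini-batch call are assumptions based on the docstrings above:

```python
ds.split(0.8)

# full training and validation arrays, ordered to match the given labels
x_train = ds.train_dataset(["age", "location"])
x_valid = ds.valid_dataset(["age", "location"])

# an assumed slice of 32 rows of the training split, starting at row 0
x_batch = ds.train_dataset(["age", "location"], index=0, batch_size=32)
```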
2 changes: 1 addition & 1 deletion pycmtensor/config.py → pycmtensor/defaultconfig.py
@@ -1,4 +1,4 @@
# config.py
# defaultconfig.py
"""PyCMTensor config module"""
import configparser
import multiprocessing
