Commit
Merge pull request #290 from CITCOM-project/gp-formulae
GP formulae
jmafoster1 authored Sep 17, 2024
2 parents bfd0534 + ed832da commit d5f0dad
Showing 44 changed files with 1,924 additions and 1,512 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/lint-format.yaml
@@ -25,9 +25,9 @@ jobs:

       - name: Archive production artifacts
         if: ${{ success() }} || ${{ failure() }}
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: MegaLinter reports
           path: |
             megalinter-reports
-            mega-linter.log
+            mega-linter.log
4 changes: 2 additions & 2 deletions .pylintrc
@@ -371,8 +371,8 @@ min-public-methods=2
[EXCEPTIONS]

# Exceptions that will emit a warning when caught.
-overgeneral-exceptions=BaseException,
-                       Exception
+overgeneral-exceptions=builtins.BaseException,
+                       builtins.Exception


[FORMAT]
73 changes: 73 additions & 0 deletions causal_testing/estimation/abstract_estimator.py
@@ -0,0 +1,73 @@
"""This module contains the Estimator abstract class"""

import logging
from abc import ABC, abstractmethod
from typing import Any

import pandas as pd

logger = logging.getLogger(__name__)


class Estimator(ABC):
    # pylint: disable=too-many-instance-attributes
    """An estimator contains all of the information necessary to compute a causal estimate for the effect of
    changing a set of treatment variables to a set of values.

    All estimators must implement the following two methods:

    1) add_modelling_assumptions: The validity of a model-assisted causal inference result depends on whether
    the modelling assumptions imposed by a model actually hold. Therefore, for each model, it is important to state
    the modelling assumptions upon which the validity of the results depends. To achieve this, the estimator object
    maintains a list of modelling assumptions (as strings). If a user wishes to implement their own estimator, they
    must implement this method and add all assumptions to the list of modelling assumptions.

    2) estimate_ate: All estimators must be capable of returning the average treatment effect as a minimum. That is,
    the average effect of the intervention (changing treatment from control to treated value) on the outcome of
    interest, adjusted for all confounders.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[str, Any] = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        self.treatment = treatment
        self.treatment_value = treatment_value
        self.control_value = control_value
        self.adjustment_set = adjustment_set
        self.outcome = outcome
        self.alpha = alpha
        self.df = df.query(query) if query else df

        if effect_modifiers is None:
            self.effect_modifiers = {}
        else:
            self.effect_modifiers = effect_modifiers
        self.modelling_assumptions = []
        if query:
            self.modelling_assumptions.append(query)
        self.add_modelling_assumptions()
        logger.debug("Effect Modifiers: %s", self.effect_modifiers)

    @abstractmethod
    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions
        that must hold if the resulting causal inference is to be considered valid.
        """

    def compute_confidence_intervals(self) -> list[float]:
        """
        Estimate the 95% Wald confidence intervals for the effect of changing the treatment from control values to
        treatment values on the outcome.

        :return: 95% Wald confidence intervals.
        """
119 changes: 119 additions & 0 deletions causal_testing/estimation/abstract_regression_estimator.py
@@ -0,0 +1,119 @@
"""This module contains the RegressionEstimator, which is an abstract class for concrete regression estimators."""

import logging
from typing import Any
from abc import abstractmethod

import pandas as pd
from statsmodels.regression.linear_model import RegressionResultsWrapper
from patsy import dmatrix # pylint: disable = no-name-in-module

from causal_testing.specification.variable import Variable
from causal_testing.estimation.abstract_estimator import Estimator

logger = logging.getLogger(__name__)


class RegressionEstimator(Estimator):
"""A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
combination of parameters and functions of the variables (note these functions need not be linear).
"""

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        super().__init__(
            treatment=treatment,
            treatment_value=treatment_value,
            control_value=control_value,
            adjustment_set=adjustment_set,
            outcome=outcome,
            df=df,
            effect_modifiers=effect_modifiers,
            query=query,
        )

        self.model = None
        if effect_modifiers is None:
            effect_modifiers = []
        if adjustment_set is None:
            adjustment_set = []
        if formula is not None:
            self.formula = formula
        else:
            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
            self.formula = f"{outcome} ~ {'+'.join(terms)}"

    @property
    @abstractmethod
    def regressor(self):
        """
        The regressor to use, e.g. ols or logit.
        This should be a property accessible with self.regressor.
        Define it as `regressor = ...` outside of __init__, not as `self.regressor = ...`, otherwise
        you'll get a "cannot instantiate abstract class" error.
        """

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions
        that must hold if the resulting causal inference is to be considered valid.
        """
        self.modelling_assumptions.append(
            "The variables in the data must fit a shape which can be expressed as a linear "
            "combination of parameters and functions of variables. Note that these functions "
            "do not need to be linear."
        )

    def _run_regression(self, data=None) -> RegressionResultsWrapper:
        """Run a regression of the treatment and adjustment set against the outcome and return the fitted model.

        :return: The model after fitting to data.
        """
        if data is None:
            data = self.df
        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
        self.model = model
        return model

    def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
        """Estimate the outcomes under control and treatment.

        :param data: The data to use, defaults to `self.df`. Controllable for bootstrap sampling.
        :param adjustment_config: The values of the adjustment variables to use.

        :return: The estimated outcome under control and treatment, with confidence intervals in the form of a
                 dataframe with columns "predicted", "se", "ci_lower", and "ci_upper".
        """
        if adjustment_config is None:
            adjustment_config = {}

        model = self._run_regression(data)

        x = pd.DataFrame(columns=self.df.columns)
        x["Intercept"] = 1  # self.intercept
        x[self.treatment] = [self.treatment_value, self.control_value]

        for k, v in adjustment_config.items():
            x[k] = v
        for k, v in self.effect_modifiers.items():
            x[k] = v
        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
        for col in x:
            if str(x.dtypes[col]) == "object":
                x = pd.get_dummies(x, columns=[col], drop_first=True)

        # This has to be here in case the treatment variable is in an I(...) block in self.formula
        x[self.treatment] = [self.treatment_value, self.control_value]
        return model.get_prediction(x).summary_frame()
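
As a rough illustration of the contract this abstract class sets up (not part of this commit), a concrete subclass only needs to supply a regressor. The sketch below assumes statsmodels' smf.ols as the backend; the subclass name and toy data are made up. With outcome "y", treatment "t", and adjustment_set {"a"}, the default formula comes out as y ~ t+a.

import pandas as pd
import statsmodels.formula.api as smf

from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator


class SimpleOLSEstimator(RegressionEstimator):
    """Illustrative concrete estimator backed by ordinary least squares (hypothetical example)."""

    # Class attribute, not set in __init__, so the abstract `regressor` property is satisfied.
    regressor = smf.ols


df = pd.DataFrame({"t": [0, 0, 1, 1], "a": [1.0, 2.0, 1.5, 2.5], "y": [1.1, 2.0, 2.9, 4.1]})
estimator = SimpleOLSEstimator(
    treatment="t", treatment_value=1, control_value=0, adjustment_set={"a"}, outcome="y", df=df
)
print(estimator.formula)  # y ~ t+a

Passing a formula argument overrides this default, which is how subclasses such as the cubic spline estimator below inject their own terms.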
75 changes: 75 additions & 0 deletions causal_testing/estimation/cubic_spline_estimator.py
@@ -0,0 +1,75 @@
"""This module contains the CubicSplineRegressionEstimator class, for estimating
continuous outcomes with changes in behaviour"""

import logging
from typing import Any

import pandas as pd

from causal_testing.specification.variable import Variable
from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator

logger = logging.getLogger(__name__)


class CubicSplineRegressionEstimator(LinearRegressionEstimator):
"""A Cubic Spline Regression Estimator is a parametric estimator which restricts the variables in the data to a
combination of parameters and basis functions of the variables.
"""

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        basis: int,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        expected_relationship=None,
    ):
        super().__init__(
            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula, alpha
        )

        self.expected_relationship = expected_relationship

        if effect_modifiers is None:
            effect_modifiers = []

        if formula is None:
            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
            self.formula = f"{outcome} ~ cr({'+'.join(terms)}, df={basis})"

    def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
        """Estimate the average treatment effect of the treatment on the outcome. That is, the change in outcome
        caused by changing the treatment variable from the control value to the treatment value. Here, we actually
        calculate the expected outcomes under control and treatment and subtract one from the other. This
        allows for custom terms to be put in, such as squares, inverses, products, etc.

        :param adjustment_config: The configuration of the adjustment set as a dict mapping variable names to
                                  their values. N.B. Every variable in the adjustment set MUST have a value in
                                  order to estimate the outcome under control and treatment.

        :return: The average treatment effect.
        """
        model = self._run_regression()

        x = {"Intercept": 1, self.treatment: self.treatment_value}
        if adjustment_config is not None:
            for k, v in adjustment_config.items():
                x[k] = v
        if self.effect_modifiers is not None:
            for k, v in self.effect_modifiers.items():
                x[k] = v

        treatment = model.predict(x).iloc[0]

        x[self.treatment] = self.control_value
        control = model.predict(x).iloc[0]

        return pd.Series(treatment - control)
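
A hypothetical usage sketch with made-up data (not part of this commit), assuming the OLS backend inherited from LinearRegressionEstimator: basis is forwarded to patsy's cr() as its df argument, and estimate_ate_calculated returns the predicted outcome at the treatment value minus the predicted outcome at the control value.

import numpy as np
import pandas as pd

from causal_testing.estimation.cubic_spline_estimator import CubicSplineRegressionEstimator

rng = np.random.default_rng(0)
t = np.linspace(0, 10, 50)
data = pd.DataFrame({"t": t, "y": 2 * t + 0.5 * t**2 + rng.normal(0, 1, t.size)})

estimator = CubicSplineRegressionEstimator(
    treatment="t",
    treatment_value=2.0,
    control_value=1.0,
    adjustment_set=set(),
    outcome="y",
    basis=3,  # becomes cr(t, df=3) in the generated formula
    df=data,
)
print(estimator.formula)                    # y ~ cr(t, df=3)
print(estimator.estimate_ate_calculated())  # predicted y at t=2.0 minus predicted y at t=1.0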