-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #290 from CITCOM-project/gp-formulae
GP formulae
- Loading branch information
Showing
44 changed files
with
1,924 additions
and
1,512 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
"""This module contains the Estimator abstract class""" | ||
|
||
import logging | ||
from abc import ABC, abstractmethod | ||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Estimator(ABC):
    # pylint: disable=too-many-instance-attributes
    """An estimator contains all of the information necessary to compute a causal estimate for the effect of changing
    a set of treatment variables to a set of values.

    All estimators must implement the following two methods:

    1) add_modelling_assumptions: The validity of a model-assisted causal inference result depends on whether
    the modelling assumptions imposed by a model actually hold. Therefore, for each model, it is important to state
    the modelling assumption upon which the validity of the results depend. To achieve this, the estimator object
    maintains a list of modelling assumptions (as strings). If a user wishes to implement their own estimator, they
    must implement this method and add all assumptions to the list of modelling assumptions.

    2) estimate_ate: All estimators must be capable of returning the average treatment effect as a minimum. That is, the
    average effect of the intervention (changing treatment from control to treated value) on the outcome of interest
    adjusted for all confounders.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[str, Any] = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        """Store the estimation problem and collect the modelling assumptions.

        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment variable under treatment.
        :param control_value: Value of the treatment variable under control.
        :param adjustment_set: The variables to adjust for.
        :param outcome: Name of the outcome variable.
        :param df: The data to estimate from. May be omitted.
        :param effect_modifiers: Mapping from effect modifier to its value. Defaults to an empty dict.
        :param alpha: Significance level, stored as `self.alpha`.
        :param query: Optional `DataFrame.query` expression used to filter `df`. A non-empty query is also recorded
                      as the first modelling assumption.
        """
        self.treatment = treatment
        self.treatment_value = treatment_value
        self.control_value = control_value
        self.adjustment_set = adjustment_set
        self.outcome = outcome
        self.alpha = alpha
        # Only filter when there is data to filter: `df` defaults to None, and calling `.query` on it would fail
        # with an unhelpful AttributeError.
        self.df = df.query(query) if query and df is not None else df

        if effect_modifiers is None:
            self.effect_modifiers = {}
        else:
            self.effect_modifiers = effect_modifiers
        self.modelling_assumptions = []
        if query:
            self.modelling_assumptions.append(query)
        # Subclass hook: must run after `modelling_assumptions` exists so implementations can append to it.
        self.add_modelling_assumptions()
        logger.debug("Effect Modifiers: %s", self.effect_modifiers)

    @abstractmethod
    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """

    def compute_confidence_intervals(self) -> list[float]:
        """
        Estimate the 95% Wald confidence intervals for the effect of changing the treatment from control values to
        treatment values on the outcome.

        As written here the method has no body beyond this docstring, so it returns None unless a subclass
        overrides it.

        :return: 95% Wald confidence intervals.
        """
119 changes: 119 additions & 0 deletions
119
causal_testing/estimation/abstract_regression_estimator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
"""This module contains the RegressionEstimator, which is an abstract class for concrete regression estimators.""" | ||
|
||
import logging | ||
from typing import Any | ||
from abc import abstractmethod | ||
|
||
import pandas as pd | ||
from statsmodels.regression.linear_model import RegressionResultsWrapper | ||
from patsy import dmatrix # pylint: disable = no-name-in-module | ||
|
||
from causal_testing.specification.variable import Variable | ||
from causal_testing.estimation.abstract_estimator import Estimator | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class RegressionEstimator(Estimator):
    """An abstract regression estimator: a parametric estimator which restricts the variables in the data to a linear
    combination of parameters and functions of the variables (note these functions need not be linear).

    Concrete subclasses provide the `regressor` used to fit `self.formula` to the data.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        """Set up the estimator and build the regression formula.

        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment variable under treatment.
        :param control_value: Value of the treatment variable under control.
        :param adjustment_set: The variables to adjust for. `None` is treated as empty when building the formula.
        :param outcome: Name of the outcome variable.
        :param df: The data to fit the regression to.
        :param effect_modifiers: Mapping from effect modifier to its value. The keys are joined into the default
                                 formula and used as column names in `_predict`, so they must be usable as such.
        :param formula: Explicit regression formula. When omitted, `outcome ~ treatment + adjustment_set +
                        effect_modifiers` is generated, with the latter two groups sorted.
        :param alpha: Significance level, forwarded to the base estimator.
        :param query: Optional `DataFrame.query` expression, forwarded to the base estimator.
        """
        super().__init__(
            treatment=treatment,
            treatment_value=treatment_value,
            control_value=control_value,
            adjustment_set=adjustment_set,
            outcome=outcome,
            df=df,
            effect_modifiers=effect_modifiers,
            # Without forwarding alpha, the base class silently falls back to its own default of 0.05.
            alpha=alpha,
            query=query,
        )

        self.model = None
        # These local fallbacks only affect formula generation; the attributes were already set by the base class.
        if effect_modifiers is None:
            effect_modifiers = []
        if adjustment_set is None:
            adjustment_set = []
        if formula is not None:
            self.formula = formula
        else:
            terms = [treatment] + sorted(adjustment_set) + sorted(effect_modifiers)
            self.formula = f"{outcome} ~ {'+'.join(terms)}"

    @property
    @abstractmethod
    def regressor(self):
        """
        The regressor to use, e.g. ols or logit.
        This should be a property accessible with self.regressor.
        Define as `regressor = ...` outside of __init__, not as `self.regressor = ...`, otherwise
        you'll get a "cannot instantiate with abstract method" error.
        """

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """
        # Adjacent string literals are concatenated verbatim, so each fragment carries its own trailing space.
        self.modelling_assumptions.append(
            "The variables in the data must fit a shape which can be expressed as a linear "
            "combination of parameters and functions of variables. Note that these functions "
            "do not need to be linear."
        )

    def _run_regression(self, data=None) -> RegressionResultsWrapper:
        """Fit `self.regressor` to the data using `self.formula` and return the fitted model.

        The fitted model is also cached on `self.model`.

        :param data: The data to fit to, defaults to `self.df`.
        :return: The model after fitting to data.
        """
        if data is None:
            data = self.df
        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
        self.model = model
        return model

    def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
        """Estimate the outcomes under control and treatment.

        :param data: The data to use, defaults to `self.df`. Controllable for bootstrap sampling.
        :param adjustment_config: The values of the adjustment variables to use.
        :return: The summary frame of the model's prediction for two rows: the treatment value first and the
                 control value second. The original documentation lists the columns as "predicted", "se",
                 "ci_lower", and "ci_upper"; the exact names come from the fitted model's prediction results.
        """
        if adjustment_config is None:
            adjustment_config = {}

        model = self._run_regression(data)

        # The prediction frame always takes its columns from self.df, even when `data` is supplied.
        x = pd.DataFrame(columns=self.df.columns)
        x["Intercept"] = 1  # self.intercept
        x[self.treatment] = [self.treatment_value, self.control_value]

        # Effect modifiers are applied last, so they take precedence over adjustment_config on shared keys.
        for k, v in adjustment_config.items():
            x[k] = v
        for k, v in self.effect_modifiers.items():
            x[k] = v
        # Build the design matrix from the right-hand side of the formula only.
        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
        for col in x:
            if str(x.dtypes[col]) == "object":
                x = pd.get_dummies(x, columns=[col], drop_first=True)

        # This has to be here in case the treatment variable is in an I(...) block in the self.formula
        x[self.treatment] = [self.treatment_value, self.control_value]
        return model.get_prediction(x).summary_frame()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
"""This module contains the CubicSplineRegressionEstimator class, for estimating | ||
continuous outcomes with changes in behaviour""" | ||
|
||
import logging | ||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from causal_testing.specification.variable import Variable | ||
from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class CubicSplineRegressionEstimator(LinearRegressionEstimator):
    """A Cubic Spline Regression Estimator is a parametric estimator which restricts the variables in the data to a
    combination of parameters and basis functions of the variables.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        basis: int,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        expected_relationship=None,
    ):
        """Set up the estimator and, unless a formula is given, build a cubic regression spline formula.

        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment variable under treatment.
        :param control_value: Value of the treatment variable under control.
        :param adjustment_set: The variables to adjust for. `None` is treated as empty when building the formula.
        :param outcome: Name of the outcome variable.
        :param basis: Passed as the `df` argument of the `cr(...)` term in the generated formula.
        :param df: The data to fit the regression to.
        :param effect_modifiers: Mapping from effect modifier to its value.
        :param formula: Explicit regression formula. When given, no spline formula is generated.
        :param alpha: Significance level, forwarded to the parent estimator.
        :param expected_relationship: Stored as-is on `self.expected_relationship`.
        """
        super().__init__(
            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula, alpha
        )

        self.expected_relationship = expected_relationship

        if effect_modifiers is None:
            effect_modifiers = []
        # Guard against a missing adjustment set so that sorting it below cannot fail on None.
        if adjustment_set is None:
            adjustment_set = []

        if formula is None:
            # Replace whatever formula the parent generated with the spline formula.
            terms = [treatment] + sorted(adjustment_set) + sorted(effect_modifiers)
            self.formula = f"{outcome} ~ cr({'+'.join(terms)}, df={basis})"

    def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
        """Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
        by changing the treatment variable from the control value to the treatment value. Here, we actually
        calculate the expected outcomes under control and treatment and subtract one from the other. This
        allows for custom terms to be put in such as squares, inverses, products, etc.

        :param adjustment_config: The configuration of the adjustment set as a dict mapping variable names to
                                  their values. N.B. Every variable in the adjustment set MUST have a value in
                                  order to estimate the outcome under control and treatment.
        :return: The average treatment effect (predicted outcome under treatment minus that under control).
        """
        model = self._run_regression()

        x = {"Intercept": 1, self.treatment: self.treatment_value}
        if adjustment_config is not None:
            x.update(adjustment_config)
        # Effect modifiers are applied last, so they take precedence over adjustment_config on shared keys.
        if self.effect_modifiers is not None:
            x.update(self.effect_modifiers)

        treatment = model.predict(x).iloc[0]

        # Reuse the same configuration, changing only the treatment variable to its control value.
        x[self.treatment] = self.control_value
        control = model.predict(x).iloc[0]

        return pd.Series(treatment - control)
Oops, something went wrong.