Commit
Merge pull request #290 from CITCOM-project/gp-formulae
GP formulae
jmafoster1 authored Sep 17, 2024
2 parents bfd0534 + ed832da commit d5f0dad
Showing 44 changed files with 1,924 additions and 1,512 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/lint-format.yaml
@@ -25,9 +25,9 @@ jobs:

       - name: Archive production artifacts
         if: ${{ success() }} || ${{ failure() }}
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: MegaLinter reports
           path: |
             megalinter-reports
-            mega-linter.log
+            mega-linter.log
4 changes: 2 additions & 2 deletions .pylintrc
@@ -371,8 +371,8 @@ min-public-methods=2
[EXCEPTIONS]

# Exceptions that will emit a warning when caught.
-overgeneral-exceptions=BaseException,
-                       Exception
+overgeneral-exceptions=builtins.BaseException,
+                       builtins.Exception


[FORMAT]
73 changes: 73 additions & 0 deletions causal_testing/estimation/abstract_estimator.py
@@ -0,0 +1,73 @@
"""This module contains the Estimator abstract class"""

import logging
from abc import ABC, abstractmethod
from typing import Any

import pandas as pd

logger = logging.getLogger(__name__)


class Estimator(ABC):
    # pylint: disable=too-many-instance-attributes
    """An estimator contains all of the information necessary to compute a causal estimate for the effect of
    changing a set of treatment variables to a set of values.

    All estimators must implement the following two methods:

    1) add_modelling_assumptions: The validity of a model-assisted causal inference result depends on whether
    the modelling assumptions imposed by a model actually hold. Therefore, for each model, it is important to state
    the modelling assumptions upon which the validity of the results depends. To achieve this, the estimator object
    maintains a list of modelling assumptions (as strings). If a user wishes to implement their own estimator, they
    must implement this method and add all assumptions to the list of modelling assumptions.

    2) estimate_ate: All estimators must be capable of returning the average treatment effect as a minimum. That is,
    the average effect of the intervention (changing treatment from control to treated value) on the outcome of
    interest, adjusted for all confounders.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[str, Any] = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        self.treatment = treatment
        self.treatment_value = treatment_value
        self.control_value = control_value
        self.adjustment_set = adjustment_set
        self.outcome = outcome
        self.alpha = alpha
        self.df = df.query(query) if query else df

        if effect_modifiers is None:
            self.effect_modifiers = {}
        else:
            self.effect_modifiers = effect_modifiers
        self.modelling_assumptions = []
        if query:
            self.modelling_assumptions.append(query)
        self.add_modelling_assumptions()
        logger.debug("Effect Modifiers: %s", self.effect_modifiers)

    @abstractmethod
    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions
        that must hold if the resulting causal inference is to be considered valid.
        """

    def compute_confidence_intervals(self) -> list[float]:
        """
        Estimate the 95% Wald confidence intervals for the effect of changing the treatment from control values to
        treatment values on the outcome.

        :return: 95% Wald confidence intervals.
        """
119 changes: 119 additions & 0 deletions causal_testing/estimation/abstract_regression_estimator.py
@@ -0,0 +1,119 @@
"""This module contains the RegressionEstimator, which is an abstract class for concrete regression estimators."""

import logging
from typing import Any
from abc import abstractmethod

import pandas as pd
from statsmodels.regression.linear_model import RegressionResultsWrapper
from patsy import dmatrix # pylint: disable = no-name-in-module

from causal_testing.specification.variable import Variable
from causal_testing.estimation.abstract_estimator import Estimator

logger = logging.getLogger(__name__)


class RegressionEstimator(Estimator):
"""A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
combination of parameters and functions of the variables (note these functions need not be linear).
"""

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        super().__init__(
            treatment=treatment,
            treatment_value=treatment_value,
            control_value=control_value,
            adjustment_set=adjustment_set,
            outcome=outcome,
            df=df,
            effect_modifiers=effect_modifiers,
            query=query,
        )

        self.model = None
        if effect_modifiers is None:
            effect_modifiers = []
        if adjustment_set is None:
            adjustment_set = []
        if formula is not None:
            self.formula = formula
        else:
            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
            self.formula = f"{outcome} ~ {'+'.join(terms)}"

    @property
    @abstractmethod
    def regressor(self):
        """
        The regressor to use, e.g. ols or logit.
        This should be a property accessible with self.regressor.
        Define it as `regressor = ...` outside of __init__, not as `self.regressor = ...`, otherwise
        you'll get a "cannot instantiate abstract class" error.
        """

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions
        that must hold if the resulting causal inference is to be considered valid.
        """
        self.modelling_assumptions.append(
            "The variables in the data must fit a shape which can be expressed as a linear "
            "combination of parameters and functions of variables. Note that these functions "
            "do not need to be linear."
        )

    def _run_regression(self, data=None) -> RegressionResultsWrapper:
        """Run a regression of the treatment and adjustment set against the outcome and return the fitted model.

        :return: The model after fitting to data.
        """
        if data is None:
            data = self.df
        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
        self.model = model
        return model

    def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
        """Estimate the outcomes under control and treatment.

        :param data: The data to use, defaults to `self.df`. Controllable for bootstrap sampling.
        :param adjustment_config: The values of the adjustment variables to use.

        :return: The estimated outcome under control and treatment, with confidence intervals in the form of a
                 dataframe with columns "predicted", "se", "ci_lower", and "ci_upper".
        """
        if adjustment_config is None:
            adjustment_config = {}

        model = self._run_regression(data)

        x = pd.DataFrame(columns=self.df.columns)
        x["Intercept"] = 1  # self.intercept
        x[self.treatment] = [self.treatment_value, self.control_value]

        for k, v in adjustment_config.items():
            x[k] = v
        for k, v in self.effect_modifiers.items():
            x[k] = v
        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
        for col in x:
            if str(x.dtypes[col]) == "object":
                x = pd.get_dummies(x, columns=[col], drop_first=True)

        # This has to be here in case the treatment variable is in an I(...) block in self.formula
        x[self.treatment] = [self.treatment_value, self.control_value]
        return model.get_prediction(x).summary_frame()
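
As a rough illustration of the contract this abstract class sets up (not part of this commit), a concrete subclass only needs to supply a regressor. The sketch below assumes statsmodels' smf.ols as the backend; the subclass name and toy data are made up. With outcome "y", treatment "t", and adjustment_set {"a"}, the default formula comes out as y ~ t+a.

import pandas as pd
import statsmodels.formula.api as smf

from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator


class SimpleOLSEstimator(RegressionEstimator):
    """Illustrative concrete estimator backed by ordinary least squares (hypothetical example)."""

    # Class attribute, not set in __init__, so the abstract `regressor` property is satisfied.
    regressor = smf.ols


df = pd.DataFrame({"t": [0, 0, 1, 1], "a": [1.0, 2.0, 1.5, 2.5], "y": [1.1, 2.0, 2.9, 4.1]})
estimator = SimpleOLSEstimator(
    treatment="t", treatment_value=1, control_value=0, adjustment_set={"a"}, outcome="y", df=df
)
print(estimator.formula)  # y ~ t+a

Passing a formula argument overrides this default, which is how subclasses such as the cubic spline estimator below inject their own terms.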
75 changes: 75 additions & 0 deletions causal_testing/estimation/cubic_spline_estimator.py
@@ -0,0 +1,75 @@
"""This module contains the CubicSplineRegressionEstimator class, for estimating
continuous outcomes with changes in behaviour"""

import logging
from typing import Any

import pandas as pd

from causal_testing.specification.variable import Variable
from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator

logger = logging.getLogger(__name__)


class CubicSplineRegressionEstimator(LinearRegressionEstimator):
"""A Cubic Spline Regression Estimator is a parametric estimator which restricts the variables in the data to a
combination of parameters and basis functions of the variables.
"""

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        basis: int,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        expected_relationship=None,
    ):
        super().__init__(
            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula, alpha
        )

        self.expected_relationship = expected_relationship

        if effect_modifiers is None:
            effect_modifiers = []

        if formula is None:
            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
            self.formula = f"{outcome} ~ cr({'+'.join(terms)}, df={basis})"

    def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
        """Estimate the average treatment effect of the treatment on the outcome. That is, the change in outcome
        caused by changing the treatment variable from the control value to the treatment value. Here, we actually
        calculate the expected outcomes under control and treatment and subtract one from the other. This
        allows for custom terms to be put in, such as squares, inverses, products, etc.

        :param adjustment_config: The configuration of the adjustment set as a dict mapping variable names to
                                  their values. N.B. Every variable in the adjustment set MUST have a value in
                                  order to estimate the outcome under control and treatment.

        :return: The average treatment effect.
        """
        model = self._run_regression()

        x = {"Intercept": 1, self.treatment: self.treatment_value}
        if adjustment_config is not None:
            for k, v in adjustment_config.items():
                x[k] = v
        if self.effect_modifiers is not None:
            for k, v in self.effect_modifiers.items():
                x[k] = v

        treatment = model.predict(x).iloc[0]

        x[self.treatment] = self.control_value
        control = model.predict(x).iloc[0]

        return pd.Series(treatment - control)
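
A hypothetical usage sketch with made-up data (not part of this commit), assuming the OLS backend inherited from LinearRegressionEstimator: basis is forwarded to patsy's cr() as its df argument, and estimate_ate_calculated returns the predicted outcome at the treatment value minus the predicted outcome at the control value.

import numpy as np
import pandas as pd

from causal_testing.estimation.cubic_spline_estimator import CubicSplineRegressionEstimator

rng = np.random.default_rng(0)
t = np.linspace(0, 10, 50)
data = pd.DataFrame({"t": t, "y": 2 * t + 0.5 * t**2 + rng.normal(0, 1, t.size)})

estimator = CubicSplineRegressionEstimator(
    treatment="t",
    treatment_value=2.0,
    control_value=1.0,
    adjustment_set=set(),
    outcome="y",
    basis=3,  # becomes cr(t, df=3) in the generated formula
    df=data,
)
print(estimator.formula)                    # y ~ cr(t, df=3)
print(estimator.estimate_ate_calculated())  # predicted y at t=2.0 minus predicted y at t=1.0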