diff --git a/doubleml/__init__.py b/doubleml/__init__.py
index 6cf7de96..cb3891ba 100644
--- a/doubleml/__init__.py
+++ b/doubleml/__init__.py
@@ -13,6 +13,7 @@
 from .irm.pq import DoubleMLPQ
 from .irm.qte import DoubleMLQTE
 from .irm.ssm import DoubleMLSSM
+from .plm.lplr import DoubleMLLPLR
 from .plm.pliv import DoubleMLPLIV
 from .plm.plr import DoubleMLPLR
 from .utils.blp import DoubleMLBLP
@@ -42,6 +43,7 @@
     "DoubleMLBLP",
     "DoubleMLPolicyTree",
     "DoubleMLSSM",
+    "DoubleMLLPLR",
 ]

 __version__ = importlib.metadata.version("doubleml")
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 05481bf1..4e11b13c 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -22,7 +22,7 @@ class DoubleML(SampleSplittingMixin, ABC):
     """Double Machine Learning."""

-    def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
+    def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=False):
         # check and pick up obj_dml_data
         if not isinstance(obj_dml_data, DoubleMLBaseData):
             raise TypeError(
@@ -34,18 +34,10 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
         if obj_dml_data.n_cluster_vars > 2:
             raise NotImplementedError("Multi-way (n_ways > 2) clustering not yet implemented.")
         self._is_cluster_data = True
-        self._is_panel_data = False
-        if isinstance(obj_dml_data, DoubleMLPanelData):
-            self._is_panel_data = True
-        self._is_did_data = False
-        if isinstance(obj_dml_data, DoubleMLDIDData):
-            self._is_did_data = True
-        self._is_ssm_data = False
-        if isinstance(obj_dml_data, DoubleMLSSMData):
-            self._is_ssm_data = True
-        self._is_rdd_data = False
-        if isinstance(obj_dml_data, DoubleMLRDDData):
-            self._is_rdd_data = True
+        self._is_panel_data = isinstance(obj_dml_data, DoubleMLPanelData)
+        self._is_did_data = isinstance(obj_dml_data, DoubleMLDIDData)
+        self._is_ssm_data = isinstance(obj_dml_data, DoubleMLSSMData)
+        self._is_rdd_data = isinstance(obj_dml_data, DoubleMLRDDData)

         self._dml_data = obj_dml_data
         self._n_obs = self._dml_data.n_obs
@@ -108,6 +100,9 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
         self._smpls = None
         self._smpls_cluster = None
         self._n_obs_sample_splitting = self.n_obs
+        self._double_sample_splitting = double_sample_splitting
+        if self._double_sample_splitting:
+            self._smpls_inner = None
         if draw_sample_splitting:
             self.draw_sample_splitting()

         self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs)
@@ -263,6 +258,13 @@ def learner(self):
         """
         return self._learner

+    @property
+    def predictions_names(self):
+        """
+        The names of predictions for the nuisance functions.
+        """
+        return list(self.params_names)
+
     @property
     def learner_names(self):
         """
@@ -359,6 +361,21 @@ def smpls(self):
         """
         The partition used for cross-fitting.
         """
         if self._smpls is None:
             err_msg = (
                 "Sample splitting not specified. Either draw samples via .draw_sample_splitting() "
                 + "or set external samples via .set_sample_splitting()."
             )
             raise ValueError(err_msg)
         return self._smpls

+    @property
+    def smpls_inner(self):
+        """
+        The inner partitions used for nested cross-fitting when double sample splitting is enabled.
+        """
+        if not self._double_sample_splitting:
+            raise ValueError("smpls_inner is only available for double sample splitting.")
+        if self._smpls_inner is None:
+            err_msg = (
+                "Sample splitting not specified. Either draw samples via .draw_sample_splitting() "
+                + "or set external samples via .set_sample_splitting()."
+            )
+            raise ValueError(err_msg)
+        return self._smpls_inner
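+
+    # Layout note (inferred from its usage in DoubleMLLPLR._nuisance_est): per repetition,
+    # _smpls_inner holds one list of inner train/test splits for every outer fold, so that
+    # zip(smpls, smpls_inner) pairs each outer split with its inner splits.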

     @property
     def smpls_cluster(self):
         """
@@ -507,6 +524,18 @@ def summary(self):
     def __smpls(self):
         return self._smpls[self._i_rep]

+    @property
+    def __smpls__inner(self):
+        if not self._double_sample_splitting:
+            raise ValueError("smpls_inner is only available for double sample splitting.")
+        if self._smpls_inner is None:
+            err_msg = (
+                "Sample splitting not specified. Either draw samples via .draw_sample_splitting() "
+                + "or set external samples via .set_sample_splitting()."
+            )
+            raise ValueError(err_msg)
+        return self._smpls_inner[self._i_rep]
+
     @property
     def __smpls_cluster(self):
         return self._smpls_cluster[self._i_rep]
@@ -1059,7 +1088,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m
             _check_external_predictions(
                 external_predictions=external_predictions,
                 valid_treatments=self._dml_data.d_cols,
-                valid_learners=self.params_names,
+                valid_learners=self.predictions_names,
                 n_obs=self.n_obs,
                 n_rep=self.n_rep,
             )
@@ -1081,7 +1110,10 @@ def _initalize_fit(self, store_predictions, store_models):

     def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models):
         ext_prediction_dict = _set_external_predictions(
-            external_predictions, learners=self.params_names, treatment=self._dml_data.d_cols[self._i_treat], i_rep=self._i_rep
+            external_predictions,
+            learners=self.predictions_names,
+            treatment=self._dml_data.d_cols[self._i_treat],
+            i_rep=self._i_rep,
         )

         # ml estimation of nuisance models and computation of score elements
@@ -1146,8 +1178,8 @@ def _initialize_arrays(self):
         self._all_se = np.full((n_thetas, n_rep), np.nan)

     def _initialize_predictions_and_targets(self):
-        self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
-        self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
+        self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
+        self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}

     def _initialize_nuisance_loss(self):
         self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names}
@@ -1158,7 +1190,7 @@ def _initialize_models(self):
         }

     def _store_predictions_and_targets(self, preds, targets):
-        for learner in self.params_names:
+        for learner in self.predictions_names:
             self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner]
             self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner]

diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py
index d7d8b2e1..2f63d88e 100644
--- a/doubleml/double_ml_sampling_mixins.py
+++ b/doubleml/double_ml_sampling_mixins.py
@@ -1,7 +1,7 @@
 from abc import abstractmethod

 from doubleml.utils._checks import _check_sample_splitting
-from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
+from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLDoubleResampling, DoubleMLResampling


 class SampleSplittingMixin:
@@ -17,6 +17,8 @@ class SampleSplittingMixin:
     `sample splitting <https://docs.doubleml.org/stable/guide/resampling.html>`_ in the DoubleML user guide.
     """

+    _double_sample_splitting = False
+
     def draw_sample_splitting(self):
         """
         Draw sample splitting for DoubleML models.
@@ -29,6 +31,8 @@ def draw_sample_splitting(self): self : object """ if self._is_cluster_data: + if self._double_sample_splitting: + raise ValueError("Cluster data not supported for double sample splitting.") obj_dml_resampling = DoubleMLClusterResampling( n_folds=self._n_folds_per_cluster, n_rep=self.n_rep, @@ -38,10 +42,20 @@ def draw_sample_splitting(self): ) self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() else: - obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata - ) - self._smpls = obj_dml_resampling.split_samples() + if self._double_sample_splitting: + obj_dml_resampling = DoubleMLDoubleResampling( + n_folds=self.n_folds, + n_folds_inner=self.n_folds_inner, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._strata, + ) + self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() + else: + obj_dml_resampling = DoubleMLResampling( + n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata + ) + self._smpls = obj_dml_resampling.split_samples() return self @@ -104,6 +118,9 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): >>> dml_plr_obj.set_sample_splitting(smpls) # doctest: +ELLIPSIS """ + if self._double_sample_splitting: + raise ValueError("set_sample_splitting not supported for double sample splitting.") + self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting ) diff --git a/doubleml/double_ml_score_mixins.py b/doubleml/double_ml_score_mixins.py index 57dd6e62..f1112db9 100644 --- a/doubleml/double_ml_score_mixins.py +++ b/doubleml/double_ml_score_mixins.py @@ -86,6 +86,7 @@ class NonLinearScoreMixin: _score_type = "nonlinear" _coef_start_val = np.nan _coef_bounds = None + _error_on_convergence_failure = False @property @abstractmethod @@ -149,12 +150,16 @@ def score_deriv(theta): theta_hat = root_res.root if not root_res.converged: score_val = score(theta_hat) - warnings.warn( + msg = ( "Could not find a root of the score function.\n " f"Flag: {root_res.flag}.\n" f"Score value found is {score_val} " f"for parameter theta equal to {theta_hat}." ) + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds) @@ -186,12 +191,16 @@ def score_squared(theta): score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( + msg = ( "Could not find a root of the score function.\n " f"Minimum score value found is {score_val} " f"for parameter theta equal to {theta_hat}.\n " "No theta found such that the score function evaluates to a negative value." ) + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: def neg_score(theta): @@ -202,11 +211,15 @@ def neg_score(theta): neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( + msg = ( "Could not find a root of the score function. " f"Maximum score value found is {-1 * neg_score_val} " f"for parameter theta equal to {theta_hat}. " "No theta found such that the score function evaluates to a positive value." 
) + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) return theta_hat diff --git a/doubleml/irm/tests/test_datasets.py b/doubleml/irm/tests/test_datasets.py new file mode 100644 index 00000000..79bf6794 --- /dev/null +++ b/doubleml/irm/tests/test_datasets.py @@ -0,0 +1,156 @@ +import numpy as np +import pandas as pd +import pytest + +from doubleml import DoubleMLData +from doubleml.irm.datasets import ( + make_confounded_irm_data, + make_heterogeneous_data, + make_iivm_data, + make_irm_data, + make_irm_data_discrete_treatments, + make_ssm_data, +) + +msg_inv_return_type = "Invalid return_type." + + +@pytest.mark.ci +def test_make_irm_data_return_types(): + np.random.seed(3141) + res = make_irm_data(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_irm_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d = make_irm_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_irm_data(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_iivm_data_return_types(): + np.random.seed(3141) + res = make_iivm_data(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_iivm_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_iivm_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_iivm_data(n_obs=100, return_type="matrix") + + +@pytest.fixture(scope="function", params=[True, False]) +def linear(request): + return request.param + + +@pytest.mark.ci +def test_make_confounded_irm_data_return_types(linear): + np.random.seed(3141) + res = make_confounded_irm_data(linear=linear) + assert isinstance(res, dict) + assert isinstance(res["x"], np.ndarray) + assert isinstance(res["y"], np.ndarray) + assert isinstance(res["d"], np.ndarray) + + assert isinstance(res["oracle_values"], dict) + assert isinstance(res["oracle_values"]["g_long"], np.ndarray) + assert isinstance(res["oracle_values"]["g_short"], np.ndarray) + assert isinstance(res["oracle_values"]["m_long"], np.ndarray) + assert isinstance(res["oracle_values"]["m_short"], np.ndarray) + assert isinstance(res["oracle_values"]["gamma_a"], float) + assert isinstance(res["oracle_values"]["beta_a"], float) + assert isinstance(res["oracle_values"]["a"], np.ndarray) + assert isinstance(res["oracle_values"]["y_0"], np.ndarray) + assert isinstance(res["oracle_values"]["y_1"], np.ndarray) + assert isinstance(res["oracle_values"]["z"], np.ndarray) + assert isinstance(res["oracle_values"]["cf_y"], float) + assert isinstance(res["oracle_values"]["cf_d_ate"], float) + assert isinstance(res["oracle_values"]["cf_d_atte"], float) + assert isinstance(res["oracle_values"]["rho_ate"], float) + assert isinstance(res["oracle_values"]["rho_atte"], float) + + +@pytest.fixture(scope="function", params=[False, True]) +def binary_treatment(request): + return request.param + + +@pytest.fixture(scope="function", params=[1, 2]) +def n_x(request): + return request.param + + +@pytest.mark.ci +def test_make_heterogeneous_data_return_types(binary_treatment, n_x): + np.random.seed(3141) + res = make_heterogeneous_data(n_obs=100, 
+        n_x=n_x, binary_treatment=binary_treatment)
+    assert isinstance(res, dict)
+    assert isinstance(res["data"], pd.DataFrame)
+    assert isinstance(res["effects"], np.ndarray)
+    assert callable(res["treatment_effect"])
+
+    # test input checks
+    msg = "n_x must be either 1 or 2."
+    with pytest.raises(AssertionError, match=msg):
+        _ = make_heterogeneous_data(n_obs=100, n_x=0, binary_treatment=binary_treatment)
+    msg = "support_size must be smaller than p."
+    with pytest.raises(AssertionError, match=msg):
+        _ = make_heterogeneous_data(n_obs=100, n_x=n_x, support_size=31, binary_treatment=binary_treatment)
+    msg = "binary_treatment must be a boolean."
+    with pytest.raises(AssertionError, match=msg):
+        _ = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=2)
+
+
+@pytest.mark.ci
+def test_make_ssm_data_return_types():
+    np.random.seed(3141)
+    res = make_ssm_data(n_obs=100)
+    assert isinstance(res, DoubleMLData)
+    res = make_ssm_data(n_obs=100, return_type="DataFrame")
+    assert isinstance(res, pd.DataFrame)
+    x, y, d, z, s = make_ssm_data(n_obs=100, return_type="array")
+    assert isinstance(x, np.ndarray)
+    assert isinstance(y, np.ndarray)
+    assert isinstance(d, np.ndarray)
+    assert isinstance(z, np.ndarray)
+    assert isinstance(s, np.ndarray)
+    with pytest.raises(ValueError, match=msg_inv_return_type):
+        _ = make_ssm_data(n_obs=100, return_type="matrix")
+
+
+@pytest.fixture(scope="function", params=[3, 5])
+def n_levels(request):
+    return request.param
+
+
+@pytest.mark.ci
+def test_make_data_discrete_treatments(n_levels):
+    np.random.seed(3141)
+    n = 100
+    data_apo = make_irm_data_discrete_treatments(n_obs=n, n_levels=n_levels)
+    assert isinstance(data_apo, dict)
+    assert isinstance(data_apo["y"], np.ndarray)
+    assert isinstance(data_apo["d"], np.ndarray)
+    assert isinstance(data_apo["x"], np.ndarray)
+    assert isinstance(data_apo["oracle_values"], dict)
+
+    assert isinstance(data_apo["oracle_values"]["cont_d"], np.ndarray)
+    assert isinstance(data_apo["oracle_values"]["level_bounds"], np.ndarray)
+    assert isinstance(data_apo["oracle_values"]["potential_level"], np.ndarray)
+    assert isinstance(data_apo["oracle_values"]["ite"], np.ndarray)
+    assert isinstance(data_apo["oracle_values"]["y0"], np.ndarray)
+
+    msg = "n_levels must be at least 2."
+    with pytest.raises(ValueError, match=msg):
+        _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1)
+
+    msg = "n_levels must be an integer."
+    with pytest.raises(ValueError, match=msg):
+        _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1.1)
diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py
index e81f00c5..283bc91b 100644
--- a/doubleml/plm/__init__.py
+++ b/doubleml/plm/__init__.py
@@ -2,10 +2,8 @@
 The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models.
""" +from .lplr import DoubleMLLPLR from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR -__all__ = [ - "DoubleMLPLR", - "DoubleMLPLIV", -] +__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"] diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index b2bb7df0..6e8e9bb5 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -4,6 +4,7 @@ from ._make_pliv_data import _make_pliv_data from .dgp_confounded_plr_data import make_confounded_plr_data +from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 from .dgp_pliv_CHS2015 import make_pliv_CHS2015 from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 @@ -15,5 +16,6 @@ "make_confounded_plr_data", "make_pliv_CHS2015", "make_pliv_multiway_cluster_CKMS2021", + "make_lplr_LZZ2020", "_make_pliv_data", ] diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py new file mode 100644 index 00000000..284da7d8 --- /dev/null +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -0,0 +1,152 @@ +import numpy as np +import pandas as pd +from scipy.special import expit + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_lplr_LZZ2020( + n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs +): + r""" + Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), + designed for use in double/debiased machine learning applications. + + The data generating process is defined as follows: + + - Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, where :math:`\Sigma_{kj} = 0.7^{|j-k|}`. + - Treatment :math:`d_i = a_0(x_i)`. + - Propensity score :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function. + - Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`. + + The nuisance functions are defined as: + + .. math:: + \begin{aligned} + a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ + &\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8} + - 0.2\, x_{i,9} x_{i,10} \\ + r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + \end{aligned} + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + dim_x : int + Number of covariates. + alpha : float + Value of the causal parameter. + return_type : str + Determines the return format. One of: + + - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. + - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. + - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). + balanced_r0 : bool, default True + If True, uses the "balanced" r_0 specification (smaller magnitude / more balanced + heterogeneity). If False, uses an "unbalanced" r_0 specification with larger + share of Y=0. 
+    treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous'
+        Determines how the treatment d is generated from a_0(x):
+
+        - 'continuous': d = a_0(x) (continuous treatment).
+        - 'binary': d ~ Bernoulli(sigmoid(a_0(x) - mean(a_0(x)))).
+        - 'binary_unbalanced': d ~ Bernoulli(sigmoid(a_0(x))).
+    **kwargs
+        Optional keyword arguments (currently unused in this implementation).
+
+    Returns
+    -------
+    Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]
+        The generated data in the specified format.
+
+    References
+    ----------
+    Liu, Molei, Yi Zhang, and Doudou Zhou. 2021.
+    "Double/Debiased Machine Learning for Logistic Partially Linear Model."
+    The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019.
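+
+    Examples
+    --------
+    A minimal usage sketch (estimates depend on the random state, so value-specific outputs are skipped):
+
+    >>> import numpy as np
+    >>> from doubleml.plm.datasets import make_lplr_LZZ2020
+    >>> np.random.seed(3141)
+    >>> dml_data = make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5)
+    >>> dml_data.d_cols
+    ['d']
+    >>> x, y, d, p = make_lplr_LZZ2020(n_obs=500, return_type="array")
+    >>> sorted(np.unique(y).tolist())  # doctest: +SKIP
+    [0, 1]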
+    """
+
+    if balanced_r0:
+
+        def r_0(X):
+            return (
+                0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+                + 0.1 * X[:, 3] * X[:, 4]
+                + 0.1 * X[:, 5] ** 3
+                + -0.5 * np.sin(X[:, 6]) ** 2
+                + 0.5 * np.cos(X[:, 7])
+                + 1 / (1 + X[:, 8] ** 2)
+                + -1 / (1 + np.exp(X[:, 9]))
+                + 0.25 * np.where(X[:, 10] > 0, 1, 0)
+                + -0.25 * np.where(X[:, 12] > 0, 1, 0)
+            )
+
+    else:
+
+        def r_0(X):
+            return (
+                0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+                + 0.1 * X[:, 3] * X[:, 4]
+                + 0.1 * X[:, 5] ** 3
+                + -0.5 * np.sin(X[:, 6]) ** 2
+                + 0.5 * np.cos(X[:, 7])
+                + 4 / (1 + X[:, 8] ** 2)
+                + -1 / (1 + np.exp(X[:, 9]))
+                + 1.5 * np.where(X[:, 10] > 0, 1, 0)
+                + -0.25 * np.where(X[:, 12] > 0, 1, 0)
+            )
+
+    def a_0(X):
+        return (
+            2 / (1 + np.exp(X[:, 0]))
+            + -2 / (1 + np.exp(X[:, 1]))
+            + 1 * np.sin(X[:, 2])
+            + 1 * np.cos(X[:, 3])
+            + 0.5 * np.where(X[:, 4] > 0, 1, 0)
+            + -0.5 * np.where(X[:, 5] > 0, 1, 0)
+            + 0.2 * X[:, 6] * X[:, 7]
+            + -0.2 * X[:, 8] * X[:, 9]
+        )
+
+    sigma = np.full((dim_x, dim_x), 0.2)
+    np.fill_diagonal(sigma, 1)
+
+    x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs)
+    np.clip(x, -2, 2, out=x)
+
+    if treatment == "continuous":
+        d = a_0(x)
+    elif treatment == "binary":
+        d_cont = a_0(x)
+        d = np.random.binomial(1, expit(d_cont - d_cont.mean()))
+    elif treatment == "binary_unbalanced":
+        d_cont = a_0(x)
+        d = np.random.binomial(1, expit(d_cont))
+    else:
+        raise ValueError("Invalid treatment type.")
+
+    p = expit(alpha * d + r_0(x))
+
+    y = np.random.binomial(1, p)
+
+    if return_type in _array_alias:
+        return x, y, d, p
+    elif return_type in _data_frame_alias + _dml_data_alias:
+        x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
+        data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"])
+        if return_type in _data_frame_alias:
+            return data
+        else:
+            return DoubleMLData(data, "y", "d", x_cols)
+    else:
+        raise ValueError("Invalid return_type.")
diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py
new file mode 100644
index 00000000..f452e02d
--- /dev/null
+++ b/doubleml/plm/lplr.py
@@ -0,0 +1,571 @@
+import inspect
+import warnings
+
+import numpy as np
+import scipy
+from sklearn.base import clone
+from sklearn.utils import check_X_y
+from sklearn.utils.multiclass import type_of_target
+
+from doubleml.double_ml import DoubleML
+from doubleml.double_ml_score_mixins import NonLinearScoreMixin
+from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score
+from doubleml.utils._estimation import (
+    _dml_cv_predict,
+    _dml_tune,
+)
+
+
+class DoubleMLLPLR(NonLinearScoreMixin, DoubleML):
+    """Double machine learning for logistic partially linear regression models (binary outcomes).
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLData
+        The DoubleMLData object providing the data and variable specification.
+        The outcome variable y must be binary with values {0, 1}.
+    ml_M : estimator
+        Classifier for M_0(D, X) = P[Y = 1 | D, X]. Must implement fit() and predict_proba().
+    ml_t : estimator
+        Regressor for the auxiliary regression used to predict log-odds. Must implement fit() and predict().
+    ml_m : estimator
+        Learner for m_0(X) = E[D | X]. For binary treatments a classifier with predict_proba() is expected;
+        for continuous treatments a regressor with predict() is expected.
+    ml_a : estimator, optional
+        Optional alternative learner for E[D | X]. If not provided, a clone of ml_m is used.
+        Must support the same prediction interface as ml_m.
+    n_folds : int, default=5
+        Number of outer cross-fitting folds.
+    n_folds_inner : int, default=5
+        Number of inner folds for nested resampling used internally.
+    n_rep : int, default=1
+        Number of repetitions for sample splitting.
+    score : {'nuisance_space', 'instrument'}, default='nuisance_space'
+        Score to use. 'nuisance_space' estimates m on subsamples with y=0; 'instrument' uses an instrument-type score.
+    draw_sample_splitting : bool, default=True
+        Whether to draw sample splitting during initialization.
+    error_on_convergence_failure : bool, default=False
+        If True, raise an error on convergence failure of the score root search.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import doubleml as dml
+    >>> from doubleml.plm.datasets import make_lplr_LZZ2020
+    >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+    >>> from sklearn.base import clone
+    >>> np.random.seed(42)
+    >>> ml_t = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+    >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+    >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+    >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20)
+    >>> dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
+    >>> dml_lplr_obj.fit().summary  # doctest: +SKIP
+           coef   std err         t     P>|t|     2.5 %    97.5 %
+    d  0.661166  0.172672  3.829038  0.000129  0.322736  0.999596
+
+    Notes
+    -----
+    **Logistic partially linear regression (LPLR)** models take the form
+
+    .. math::
+
+        P(Y = 1 \\mid D, X) = \\text{expit}(D \\theta_0 + r_0(X)),
+
+    where :math:`Y` is the binary outcome variable and :math:`D` is the policy variable of interest.
+    The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates.
+ """ + + def __init__( + self, + obj_dml_data, + ml_M, + ml_t, + ml_m, + ml_a=None, + n_folds=5, + n_folds_inner=5, + n_rep=1, + score="nuisance_space", + draw_sample_splitting=True, + error_on_convergence_failure=False, + ): + self.n_folds_inner = n_folds_inner + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=True) + + self._error_on_convergence_failure = error_on_convergence_failure + + self._check_data(self._dml_data) + valid_scores = ["nuisance_space", "instrument"] + _check_score(self.score, valid_scores, allow_callable=False) + + _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) + _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) + + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) + self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} + # replace aggregated inner names with per-inner-fold names + inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds_inner)] + inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds_inner)] + self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names + + if ml_a is not None: + ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True) + self._learner["ml_a"] = ml_a + self._ml_a_provided = True + else: + self._learner["ml_a"] = clone(ml_m) + ml_a_is_classifier = ml_m_is_classifier + self._ml_a_provided = False + + self._predict_method = {"ml_t": "predict", "ml_M": "predict_proba"} + + if ml_m_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method["ml_m"] = "predict_proba" + else: + raise ValueError( + f"The ml_m learner {str(ml_m)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) + else: + if self._dml_data.binary_treats.any(): + warnings.warn( + f"The ml_m learner {str(ml_m)} was identified as regressor " + "but at least one treatment variable is binary with values 0 and 1." + ) + self._predict_method["ml_m"] = "predict" + + if ml_a_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method["ml_a"] = "predict_proba" + else: + raise ValueError( + f"The ml_a learner {str(ml_a)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) + else: + if self._dml_data.binary_treats.any(): + warnings.warn( + f"The ml_a learner {str(ml_a)} was identified as regressor but at least one treatment variable is " + f"binary with values 0 and 1." 
+            )
+            self._predict_method["ml_a"] = "predict"
+
+        if score == "instrument":
+            sig = inspect.signature(self.learner["ml_a"].fit)
+            if "sample_weight" not in sig.parameters:
+                raise ValueError('Learner "ml_a" that supports sample_weight is required for score type "instrument".')
+
+        self._initialize_ml_nuisance_params()
+        self._external_predictions_implemented = True
+        self._sensitivity_implemented = False

+    def _initialize_ml_nuisance_params(self):
+        self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner}
+
+    def _check_data(self, obj_dml_data):
+        if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]):
+            raise TypeError("The outcome variable y must be binary with values 0 and 1.")
+
+    def _double_dml_cv_predict(
+        self,
+        estimator,
+        estimator_name,
+        x,
+        y,
+        smpls=None,
+        smpls_inner=None,
+        n_jobs=None,
+        est_params=None,
+        method="predict",
+        sample_weights=None,
+    ):
+        res = {}
+        res["preds"] = np.zeros(y.shape, dtype=float)
+        res["preds_inner"] = []
+        res["targets_inner"] = []
+        res["models"] = []
+        for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner):
+            # cross-fit the nuisance on the inner splits of this outer fold
+            res_inner = _dml_cv_predict(
+                estimator,
+                x,
+                y,
+                smpls=smpls_double_split,
+                n_jobs=n_jobs,
+                est_params=est_params,
+                method=method,
+                return_models=True,
+                sample_weights=sample_weights,
+            )
+            _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split)
+
+            res["preds_inner"].append(res_inner["preds"])
+            res["targets_inner"].append(res_inner["targets"])
+            for model in res_inner["models"]:
+                res["models"].append(model)
+                if method == "predict_proba":
+                    res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1]
+                else:
+                    res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]])
+            # average the inner models' predictions on the outer test fold; each test index
+            # received exactly one prediction per inner model
+            res["preds"][smpls_single_split[1]] /= len(res_inner["models"])
+        res["targets"] = np.copy(y)
+        return res
+
+    def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
+        x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
+        x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
+        x_d_concat = np.hstack((d.reshape(-1, 1), x))
+
+        m_external = external_predictions["ml_m"] is not None
+        M_external = external_predictions["ml_M"] is not None
+        t_external = external_predictions["ml_t"] is not None
+        a_external = external_predictions["ml_a"] is not None
+
+        if M_external:
+            # expect per-inner-fold keys ml_M_inner_i
+            missing = [
+                i
+                for i in range(self.n_folds_inner)
+                if f"ml_M_inner_{i}" not in external_predictions.keys() or external_predictions[f"ml_M_inner_{i}"] is None
+            ]
+            if len(missing) > 0:
+                raise ValueError(
+                    "When providing external predictions for ml_M, also inner predictions for all inner folds "
+                    f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
+                )
+            M_hat_inner = [external_predictions[f"ml_M_inner_{i}"] for i in range(self.n_folds_inner)]
+            M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None}
+        else:
+            M_hat = self._double_dml_cv_predict(
+                self._learner["ml_M"],
+                "ml_M",
+                x_d_concat,
+                y,
+                smpls=smpls,
+                smpls_inner=self._DoubleML__smpls__inner,
+                n_jobs=n_jobs_cv,
+                est_params=self._get_params("ml_M"),
+                method=self._predict_method["ml_M"],
+            )
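+
+        # Illustrative layout of `external_predictions` as consumed here (flat, per treatment;
+        # a hypothetical sketch with n_folds_inner = 2 and per-observation arrays as values):
+        #     {"ml_M": ..., "ml_M_inner_0": ..., "ml_M_inner_1": ...,
+        #      "ml_a": ..., "ml_a_inner_0": ..., "ml_a_inner_1": ...,
+        #      "ml_m": ..., "ml_t": ...}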
+
+        # nuisance m
+        if m_external:
+            m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None}
+        else:
+            if self.score == "instrument":
+                weights = M_hat["preds"] * (1 - M_hat["preds"])
+                filtered_smpls = smpls
+            elif self.score == "nuisance_space":
+                filtered_smpls = []
+                for train, test in smpls:
+                    train_filtered = train[y[train] == 0]
+                    filtered_smpls.append((train_filtered, test))
+                weights = None
+
+            m_hat = _dml_cv_predict(
+                self._learner["ml_m"],
+                x,
+                d,
+                smpls=filtered_smpls,
+                n_jobs=n_jobs_cv,
+                est_params=self._get_params("ml_m"),
+                method=self._predict_method["ml_m"],
+                return_models=return_models,
+                sample_weights=weights,
+            )
+
+            _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls)
+
+        if self._check_learner(self._learner["ml_m"], "ml_m", regressor=True, classifier=True):
+            _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12)
+
+        if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]:
+            binary_preds = type_of_target(m_hat["preds"]) == "binary"
+            # predictions lie exactly in {0, 1} iff p**2 - p == 0 element-wise
+            zero_one_preds = np.all((np.power(m_hat["preds"], 2) - m_hat["preds"]) == 0)
+            if binary_preds & zero_one_preds:
+                raise ValueError(
+                    f"For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, "
+                    f"predictions obtained with the ml_m learner {str(self._learner['ml_m'])} are also "
+                    "observed to be binary with values 0 and 1. Make sure that for classifiers "
+                    "probabilities and not labels are predicted."
+                )
+
+        if a_external:
+            # expect per-inner-fold keys ml_a_inner_i
+            missing = [
+                i
+                for i in range(self.n_folds_inner)
+                if f"ml_a_inner_{i}" not in external_predictions.keys() or external_predictions[f"ml_a_inner_{i}"] is None
+            ]
+            if len(missing) > 0:
+                raise ValueError(
+                    "When providing external predictions for ml_a, also inner predictions for all inner folds "
+                    f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
+ ) + a_hat_inner = [external_predictions[f"ml_a_inner_{i}"] for i in range(self.n_folds_inner)] + a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None} + else: + a_hat = self._double_dml_cv_predict( + self._learner["ml_a"], + "ml_a", + x, + d, + smpls=smpls, + smpls_inner=self._DoubleML__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_a"), + method=self._predict_method["ml_a"], + ) + + W_inner = [] + beta = np.zeros(d.shape, dtype=float) + + for i, (train, test) in enumerate(smpls): + M_iteration = M_hat["preds_inner"][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + d_tilde = (d - a_hat["preds_inner"][i])[train] + beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde**2) + + # Use preliminary beta estimates as starting value for root finding + self._coef_start_val = np.average(beta) + + # nuisance t + if t_external: + t_hat = {"preds": external_predictions["ml_t"], "targets": None, "models": None} + else: + t_hat = _dml_cv_predict( + self._learner["ml_t"], + x, + W_inner, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_t"), + method=self._predict_method["ml_t"], + return_models=return_models, + ) + _check_finite_predictions(t_hat["preds"], self._learner["ml_t"], "ml_t", smpls) + + r_hat = {} + r_hat["preds"] = t_hat["preds"] - beta * a_hat["preds"] + + psi_elements = self._score_elements(y, d, r_hat["preds"], m_hat["preds"]) + + preds = { + "predictions": { + "ml_r": r_hat["preds"], + "ml_m": m_hat["preds"], + "ml_a": a_hat["preds"], + "ml_t": t_hat["preds"], + "ml_M": M_hat["preds"], + # store inner predictions as separate keys per inner fold + # ml_M inner + **{f"ml_M_inner_{i}": M_hat["preds_inner"][i] for i in range(len(M_hat["preds_inner"]))}, + # ml_a inner + **{f"ml_a_inner_{i}": a_hat["preds_inner"][i] for i in range(len(a_hat["preds_inner"]))}, + }, + "targets": { + "ml_r": None, + "ml_m": m_hat["targets"], + "ml_a": a_hat["targets"], + "ml_t": t_hat["targets"], + "ml_M": M_hat["targets"], + # store inner targets as separate keys per inner fold (None if external) + **( + { + f"ml_M_inner_{i}": ( + M_hat.get("targets_inner")[i] + if M_hat.get("targets_inner") is not None and i < len(M_hat["targets_inner"]) + else None + ) + for i in range(len(M_hat.get("preds_inner", []))) + } + ), + **( + { + f"ml_a_inner_{i}": ( + a_hat.get("targets_inner")[i] + if a_hat.get("targets_inner") is not None and i < len(a_hat["targets_inner"]) + else None + ) + for i in range(len(a_hat.get("preds_inner", []))) + } + ), + }, + "models": { + "ml_r": None, + "ml_m": m_hat["models"], + "ml_a": a_hat["models"], + "ml_t": t_hat["models"], + "ml_M": M_hat["models"], + }, + } + + return psi_elements, preds + + @property + def predictions_names(self): + """ + The names of predictions for the nuisance functions. 
+ """ + return self._predictions_names + + def _score_elements(self, y, d, r_hat, m_hat): + # compute residual + d_tilde = d - m_hat + psi_hat = scipy.special.expit(-r_hat) + score_const = d_tilde * (1 - y) * np.exp(r_hat) + psi_elements = { + "y": y, + "d": d, + "d_tilde": d_tilde, + "r_hat": r_hat, + "m_hat": m_hat, + "psi_hat": psi_hat, + "score_const": score_const, + } + + return psi_elements + + @property + def _score_element_names(self): + return ["y", "d", "d_tilde", "r_hat", "m_hat", "psi_hat", "score_const"] + + def _sensitivity_element_est(self, preds): + raise NotImplementedError() + + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): + if self._i_rep is None: + raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.") + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + + if scoring_methods is None: + scoring_methods = {"ml_m": None, "ml_M": None, "ml_a": None, "ml_t": None} + + train_inds = [train_index for (train_index, _) in smpls] + M_tune_res = _dml_tune( + y, + x_d_concat, + train_inds, + self._learner["ml_M"], + param_grids["ml_M"], + scoring_methods["ml_M"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + filtered_train_inds = [] + if self.score == "nuisance_space": + for train, _ in smpls: + train_filtered = train[y[train] == 0] + filtered_train_inds.append(train_filtered) + elif self.score == "instrument": + filtered_train_inds = train_inds + + m_tune_res = _dml_tune( + d, + x, + filtered_train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + a_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_a"], + param_grids["ml_a"], + scoring_methods["ml_a"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + M_best_params = [xx.best_params_ for xx in M_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + a_best_params = [xx.best_params_ for xx in a_tune_res] + + # Create targets for tuning ml_t + M_hat = self._double_dml_cv_predict( + self._learner["ml_M"], + "ml_M", + x_d_concat, + y, + smpls=smpls, + smpls_inner=self._DoubleML__smpls__inner, + n_jobs=n_jobs_cv, + est_params=M_best_params, + method=self._predict_method["ml_M"], + ) + + W_inner = [] + for i, (train, _) in enumerate(smpls): + M_iteration = M_hat["preds_inner"][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + + # Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN + W_targets = [] + for i, train in enumerate(train_inds): + wt = np.full(x.shape[0], np.nan, dtype=float) + wt[train] = W_inner[i] + W_targets.append(wt) + + t_tune_res = _dml_tune( + W_inner, + x, + train_inds, + self._learner["ml_t"], + param_grids["ml_t"], + scoring_methods["ml_t"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + fold_specific_target=True, + ) + t_best_params = [xx.best_params_ for xx in t_tune_res] + + # Update params and tune_res to include ml_a and ml_t + params = {"ml_M": M_best_params, "ml_m": m_best_params, "ml_a": a_best_params, "ml_t": t_best_params} + tune_res = {"M_tune": M_tune_res, "m_tune": m_tune_res, "a_tune": a_tune_res, 
"t_tune": t_tune_res} + + res = {"params": params, "tune_res": tune_res} + + return res + + def _compute_score(self, psi_elements, coef): + if self.score == "nuisance_space": + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] + score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + elif self.score == "instrument": + score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"])) * psi_elements[ + "d_tilde" + ] + + return score + + def _compute_score_deriv(self, psi_elements, coef, inds=None): + if self.score == "nuisance_space": + deriv_1 = -psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + elif self.score == "instrument": + expit = scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"]) + deriv = -psi_elements["d"] * expit * (1 - expit) * psi_elements["d_tilde"] + + return deriv diff --git a/doubleml/plm/tests/test_datasets.py b/doubleml/plm/tests/test_datasets.py new file mode 100644 index 00000000..5e16b9ac --- /dev/null +++ b/doubleml/plm/tests/test_datasets.py @@ -0,0 +1,151 @@ +import numpy as np +import pandas as pd +import pytest + +from doubleml import DoubleMLData +from doubleml.plm.datasets import ( + _make_pliv_data, + make_confounded_plr_data, + make_lplr_LZZ2020, + make_pliv_CHS2015, + make_pliv_multiway_cluster_CKMS2021, + make_plr_CCDDHNR2018, + make_plr_turrell2018, +) + +msg_inv_return_type = "Invalid return_type." + + +@pytest.mark.ci +def test_make_plr_CCDDHNR2018_return_types(): + np.random.seed(3141) + res = make_plr_CCDDHNR2018(n_obs=100, return_type=DoubleMLData) + assert isinstance(res, DoubleMLData) + res = make_plr_CCDDHNR2018(n_obs=100, return_type=pd.DataFrame) + assert isinstance(res, pd.DataFrame) + x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type=np.ndarray) + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_plr_CCDDHNR2018(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_plr_turrell2018_return_types(): + np.random.seed(3141) + res = make_plr_turrell2018(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_plr_turrell2018(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d = make_plr_turrell2018(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_plr_turrell2018(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_confounded_plr_data_return_types(): + np.random.seed(3141) + res = make_confounded_plr_data(theta=5.0) + assert isinstance(res, dict) + assert isinstance(res["x"], np.ndarray) + assert isinstance(res["y"], np.ndarray) + assert isinstance(res["d"], np.ndarray) + + assert isinstance(res["oracle_values"], dict) + assert isinstance(res["oracle_values"]["g_long"], np.ndarray) + assert isinstance(res["oracle_values"]["g_short"], np.ndarray) + assert isinstance(res["oracle_values"]["m_long"], np.ndarray) + assert isinstance(res["oracle_values"]["m_short"], np.ndarray) + assert isinstance(res["oracle_values"]["theta"], float) + assert isinstance(res["oracle_values"]["gamma_a"], float) + assert isinstance(res["oracle_values"]["beta_a"], float) + assert 
isinstance(res["oracle_values"]["a"], np.ndarray) + assert isinstance(res["oracle_values"]["z"], np.ndarray) + + +@pytest.mark.ci +def test_make_pliv_data_return_types(): + np.random.seed(3141) + res = _make_pliv_data(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = _make_pliv_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = _make_pliv_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = _make_pliv_data(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_pliv_CHS2015_return_types(): + np.random.seed(3141) + res = make_pliv_CHS2015(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_pliv_CHS2015(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_pliv_CHS2015(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_pliv_CHS2015(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_pliv_multiway_cluster_CKMS2021_return_types(): + np.random.seed(3141) + res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(cluster_vars, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="matrix") + + +@pytest.mark.ci +def test_make_lplr_LZZ2020_return_types(): + np.random.seed(3141) + res = make_lplr_LZZ2020(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_lplr_LZZ2020(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_lplr_LZZ2020(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_lplr_LZZ2020(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_lplr_LZZ2020_variants(): + np.random.seed(3141) + res = make_lplr_LZZ2020(n_obs=100, treatment="binary") + assert np.array_equal(np.unique(res.d), np.array([0, 1])) + res = make_lplr_LZZ2020(n_obs=100, treatment="binary_unbalanced") + assert np.array_equal(np.unique(res.d), np.array([0, 1])) + res = make_lplr_LZZ2020(n_obs=100, treatment="continuous") + assert len(np.unique(res.d)) == 100 + + msg = "Invalid treatment type." 
+    with pytest.raises(ValueError, match=msg):
+        _ = make_lplr_LZZ2020(n_obs=100, treatment="colors")
+
+    res = make_lplr_LZZ2020(n_obs=100, balanced_r0=False)
+    _, y_counts = np.unique(res.y, return_counts=True)
+    assert np.abs(y_counts[0] - y_counts[1]) > 10
diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py
new file mode 100644
index 00000000..abd7adf5
--- /dev/null
+++ b/doubleml/plm/tests/test_lplr.py
@@ -0,0 +1,82 @@
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+import doubleml as dml
+from doubleml.plm.datasets import make_lplr_LZZ2020
+
+
+@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
+def learner_M(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_t(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_m(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
+def learner_m_classifier(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["nuisance_space", "instrument"])
+def score(request):
+    return request.param
+
+
+# TODO: Error for continuous treatment?
+@pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"])
+def treatment(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def dml_lplr_fixture(
+    score,
+    learner_M,
+    learner_t,
+    learner_m,
+    learner_m_classifier,
+    treatment,
+):
+    n_folds = 5
+    alpha = 0.5
+
+    # collect data
+    np.random.seed(42)
+    obj_dml_data = make_lplr_LZZ2020(alpha=alpha, treatment=treatment)
+
+    ml_M = clone(learner_M)
+    ml_t = clone(learner_t)
+    if treatment == "continuous":
+        ml_m = clone(learner_m)
+    else:
+        ml_m = clone(learner_m_classifier)
+
+    dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m, n_folds=n_folds, score=score)
+    dml_lplr_obj.fit()
+
+    res_dict = {
+        "coef": dml_lplr_obj.coef[0],
+        "se": dml_lplr_obj.se[0],
+        "true_coef": alpha,
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_dml_lplr_coef(dml_lplr_fixture):
+    # true_coef should lie within three standard errors of the estimate
+    coef = dml_lplr_fixture["coef"]
+    se = dml_lplr_fixture["se"]
+    true_coef = dml_lplr_fixture["true_coef"]
+    assert abs(coef - true_coef) <= 3.0 * se
diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py
new file mode 100644
index 00000000..f01cd885
--- /dev/null
+++ b/doubleml/plm/tests/test_lplr_exceptions.py
@@ -0,0 +1,338 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.base import BaseEstimator
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import Lasso, LogisticRegression
+from sklearn.semi_supervised import LabelSpreading
+
+from doubleml import DoubleMLLPLR
+from doubleml.plm.datasets import make_lplr_LZZ2020
+
+np.random.seed(3141)
+n = 100
+# create test data and basic learners
+dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20)
+dml_data_binary = make_lplr_LZZ2020(alpha=0.5, n_obs=n, treatment="binary", dim_x=20)
+ml_M = RandomForestClassifier()
+ml_t = RandomForestRegressor()
+ml_m = RandomForestRegressor()
+dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
+dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument")
score="instrument") + + +@pytest.mark.ci +def test_lplr_exception_data(): + msg = r"The data must be of DoubleMLData.* type\.[\s\S]* of type " r" was passed\." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m) + + dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=20) + dml_data_nb.data[dml_data_nb.y_col] = dml_data_nb.data[dml_data_nb.y_col] + 1 + dml_data_nb._set_y_z() + with pytest.raises(TypeError, match="The outcome variable y must be binary with values 0 and 1."): + _ = DoubleMLLPLR(dml_data_nb, ml_M, ml_t, ml_m) + + +@pytest.mark.ci +def test_lplr_exception_scores(): + # LPLR valid scores are 'nuisance_space' and 'instrument' + msg = "Invalid score MAR" + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="MAR") + msg = "score should be a string. 0 was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) + + +@pytest.mark.ci +def test_lplr_exception_resampling(): + msg = "The number of folds must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=1.5) + + msg = "The number of repetitions for the sample splitting must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=1.5) + + msg = "The number of folds must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=0) + + msg = "The number of repetitions for the sample splitting must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=0) + + msg = "draw_sample_splitting must be True or False. Got true." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting="true") + + +@pytest.mark.ci +def test_lplr_exception_get_params(): + msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." + with pytest.raises(ValueError, match=msg): + dml_lplr.get_params("ml_x") + + +@pytest.mark.ci +def test_lplr_exception_smpls(): + msg = ( + "Sample splitting not specified. " + r"Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\)." + ) + dml_plr_no_smpls = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting=False) + with pytest.raises(ValueError, match=msg): + _ = dml_plr_no_smpls.smpls + + +@pytest.mark.ci +def test_lplr_exception_fit(): + msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(n_jobs_cv="5") + msg = "store_predictions must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(store_predictions=1) + msg = "store_models must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(store_models=1) + + +@pytest.mark.ci +def test_lplr_exception_bootstrap(): + dml_lplr_boot = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) + msg = r"Apply fit\(\) before bootstrap\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap() + + dml_lplr_boot.fit() + msg = 'Method must be "Bayes", "normal" or "wild". Got Gaussian.' 
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_boot.bootstrap(method="Gaussian")
+    msg = "The number of bootstrap replications must be of int type. 500 of type <class 'str'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_boot.bootstrap(n_rep_boot="500")
+    msg = "The number of bootstrap replications must be positive. 0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_boot.bootstrap(n_rep_boot=0)
+
+
+@pytest.mark.ci
+def test_lplr_exception_confint():
+    dml_lplr_conf = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
+    msg = r"Apply fit\(\) before confint\(\)."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_conf.confint()
+    dml_lplr_conf.fit()
+
+    msg = "joint must be True or False. Got 1."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_conf.confint(joint=1)
+    msg = "The confidence level must be of float type. 5% of type <class 'str'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_conf.confint(level="5%")
+    msg = r"The confidence level must be in \(0,1\). 0.0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_conf.confint(level=0.0)
+
+    msg = r"Apply bootstrap\(\) before confint\(joint=True\)."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_conf.confint(joint=True)
+    dml_lplr_conf.bootstrap()
+    df_lplr_ci = dml_lplr_conf.confint(joint=True)
+    assert isinstance(df_lplr_ci, pd.DataFrame)
+
+
+@pytest.mark.ci
+def test_lplr_exception_set_ml_nuisance_params():
+    # invalid learner name
+    msg = "Invalid nuisance learner g. Valid nuisance learner ml_m or ml_t or ml_M or ml_a."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1})
+    # invalid treatment variable
+    msg = "Invalid treatment variable y. Valid treatment variable d."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.set_ml_nuisance_params("ml_M", "y", {"alpha": 0.1})
+
+
+class _DummyNoSetParams:
+    def fit(self):
+        pass
+
+
+class _DummyNoGetParams(_DummyNoSetParams):
+    def set_params(self):
+        pass
+
+
+class _DummyNoClassifier(_DummyNoGetParams):
+    def get_params(self):
+        pass
+
+    def predict(self):
+        pass
+
+
+class LogisticRegressionManipulatedType(LogisticRegression):
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.estimator_type = None
+        return tags
+
+
+@pytest.mark.ci
+@pytest.mark.filterwarnings(
+    r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning",
+)
+def test_lplr_exception_learner():
+    err_msg_prefix = "Invalid learner provided for ml_t: "
+
+    msg = err_msg_prefix + "provide an instance of a learner instead of a class."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, Lasso, ml_m)
+    msg = err_msg_prefix + r"BaseEstimator\(\) has no method .fit\(\)."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, BaseEstimator(), ml_m)
+    msg = r"has no method .set_params\(\)."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoSetParams(), ml_m)
+    msg = r"has no method .get_params\(\)."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoGetParams(), ml_m)
+
+    # ml_m may not be a classifier when the treatment is not binary
+    msg = (
+        r"The ml_m learner LogisticRegression\(\) was identified as classifier "
+        r"but at least one treatment variable is not binary with values 0 and 1\."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, LogisticRegression())
+
+    # ml_a may not be a classifier when treatment is not binary
+    msg = (
+        r"The ml_a learner LogisticRegression\(\) was identified as classifier "
+        r"but at least one treatment variable is not binary with values 0 and 1\."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, ml_a=LogisticRegression())
+
+    # for score "instrument", ml_a must support sample_weight
+    dml_data_binary = make_lplr_LZZ2020(treatment="binary")
+    msg = 'Learner "ml_a" who supports sample_weight is required for score type "instrument"'
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_m, ml_a=LabelSpreading(), score="instrument")
+
+    # construct a classifier that sklearn's is_classifier cannot identify as a classifier
+    log_reg = LogisticRegressionManipulatedType()
+    msg = (
+        r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) "
+        r"neither a regressor nor a classifier. Method predict is used for prediction\."
+    )
+    with pytest.warns(UserWarning, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, log_reg)
+
+
+@pytest.mark.ci
+@pytest.mark.filterwarnings(
+    r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning",
+    r"ignore: Learner provided for ml_m is probably invalid.*is \(probably\) no classifier.*:UserWarning",
+)
+def test_lplr_exception_and_warning_learner():
+    # invalid ml_M (must be a classifier with predict_proba)
+    with pytest.raises(TypeError):
+        _ = DoubleMLLPLR(dml_data, _DummyNoClassifier(), ml_t, ml_m)
+    msg = "Invalid learner provided for ml_M: " r"Lasso\(\) has no method .predict_proba\(\)."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, Lasso(), ml_t, ml_m)
+    msg = (
+        r"The ml_m learner RandomForestRegressor\(\) was identified as regressor but at least one treatment "
+        r"variable is binary with values 0 and 1."
+    )
+    with pytest.warns(match=msg):
+        _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_m)
+    msg = (
+        r"The ml_a learner RandomForestRegressor\(\) was identified as regressor but at least one treatment "
+        r"variable is binary with values 0 and 1."
+    )
+    with pytest.warns(match=msg):
+        _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_M, ml_a=ml_m)
+
+
+class LassoWithNanPred(Lasso):
+    def predict(self, X):
+        preds = super().predict(X)
+        n_obs = len(preds)
+        preds[np.random.randint(0, n_obs, 1)] = np.nan
+        return preds
+
+
+class LassoWithInfPred(Lasso):
+    def predict(self, X):
+        preds = super().predict(X)
+        n_obs = len(preds)
+        preds[np.random.randint(0, n_obs, 1)] = np.inf
+        return preds
+
+
+# Classifier that returns hard labels (0/1) via predict_proba to trigger the binary-predictions error
+class HardLabelPredictProba(LogisticRegression):
+    def predict_proba(self, X):
+        labels = super().predict(X).astype(int)
+        return np.column_stack((1 - labels, labels))
+
+
+@pytest.mark.ci
+def test_lplr_nan_prediction():
+    msg = r"Predictions from learner LassoWithNanPred\(\) for ml_t are not finite."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, LassoWithNanPred(), ml_m).fit()
+    msg = r"Predictions from learner LassoWithInfPred\(\) for ml_t are not finite."
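+    # infinite predictions are rejected by the same finiteness check as NaN predictions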
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, LassoWithInfPred(), ml_m).fit()
+
+
+@pytest.mark.ci
+def test_lplr_exception_evaluate_learner():
+    dml_lplr_obj = DoubleMLLPLR(
+        dml_data,
+        ml_M=LogisticRegression(),
+        ml_t=Lasso(),
+        ml_m=RandomForestRegressor(),
+        n_folds=5,
+        score="nuisance_space",
+    )
+
+    msg = r"Apply fit\(\) before evaluate_learners\(\)."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_obj.evaluate_learners()
+
+    dml_lplr_obj.fit()
+
+    msg = "metric should be a callable. 'mse' was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_obj.evaluate_learners(metric="mse")
+
+    msg = (
+        r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. "
+        r"Learners \['ml_mu', 'ml_p'\] provided."
+    )
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"])
+
+    def eval_fct(y_pred, y_true):
+        return np.nan
+
+    with pytest.raises(ValueError):
+        dml_lplr_obj.evaluate_learners(metric=eval_fct)
+
+
+@pytest.mark.ci
+def test_lplr_exception_binary_predictions_from_classifier():
+    # Expect error because ml_m returns binary labels instead of probabilities for a binary treatment
+    msg = (
+        r"For the binary treatment variable d, predictions obtained with the ml_m learner "
+        r"HardLabelPredictProba\(\) are also observed to be binary with values 0 and 1\. "
+        r"Make sure that for classifiers probabilities and not labels are predicted\."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, HardLabelPredictProba()).fit()
diff --git a/doubleml/plm/tests/test_lplr_external_predictions.py b/doubleml/plm/tests/test_lplr_external_predictions.py
new file mode 100644
index 00000000..cc8546a8
--- /dev/null
+++ b/doubleml/plm/tests/test_lplr_external_predictions.py
@@ -0,0 +1,148 @@
+import math
+
+import numpy as np
+import pytest
+from sklearn.linear_model import LinearRegression, LogisticRegression
+
+from doubleml import DoubleMLData
+from doubleml.plm.datasets import make_lplr_LZZ2020
+from doubleml.plm.lplr import DoubleMLLPLR
+from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor
+
+
+@pytest.fixture(scope="module", params=["nuisance_space", "instrument"])
+def lplr_score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_m_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_t_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_M_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def doubleml_lplr_fixture(lplr_score, n_rep, set_ml_m_ext, set_ml_t_ext, set_ml_M_ext):
+    ext_predictions = {"d": {}}
+
+    x, y, d, _ = make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type="np.array", treatment="continuous")
+
+    np.random.seed(3141)
+    dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)
+
+    kwargs = {"obj_dml_data": dml_data, "score": lplr_score, "n_rep": n_rep}
+    if lplr_score == "instrument":
+        # ensure ml_a supports sample_weight
+        kwargs["ml_a"] = LinearRegression()
+
+    dml_lplr = DoubleMLLPLR(ml_M=LogisticRegression(max_iter=1000), ml_t=LinearRegression(), ml_m=LinearRegression(), **kwargs)
+    np.random.seed(3141)
+    dml_lplr.fit(store_predictions=True)
+
+    # prepare external predictions and dummy learners
+    if set_ml_M_ext:
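+        # reuse the cross-fitted predictions of the first model as external predictions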
ext_predictions["d"]["ml_M"] = dml_lplr.predictions["ml_M"][:, :, 0] + # provide inner predictions per inner fold index + for i in range(dml_lplr.n_folds_inner): + ext_predictions["d"][f"ml_M_inner_{i}"] = dml_lplr.predictions[f"ml_M_inner_{i}"][:, :, 0] + ml_M = DMLDummyClassifier() + else: + ml_M = LogisticRegression(max_iter=1000) + + if set_ml_t_ext: + ext_predictions["d"]["ml_t"] = dml_lplr.predictions["ml_t"][:, :, 0] + ml_t = DMLDummyRegressor() + else: + ml_t = LinearRegression() + + if set_ml_m_ext: + ext_predictions["d"]["ml_m"] = dml_lplr.predictions["ml_m"][:, :, 0] + ml_m = DMLDummyRegressor() + ext_predictions["d"]["ml_a"] = dml_lplr.predictions["ml_a"][:, :, 0] + for i in range(dml_lplr.n_folds_inner): + ext_predictions["d"][f"ml_a_inner_{i}"] = dml_lplr.predictions[f"ml_a_inner_{i}"][:, :, 0] + else: + ml_m = LinearRegression() + + # build second model with external predictions + dml_lplr_ext = DoubleMLLPLR(ml_M=ml_M, ml_t=ml_t, ml_m=ml_m, **kwargs) + + np.random.seed(3141) + dml_lplr_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef_normal": dml_lplr.coef[0], + "coef_ext": dml_lplr_ext.coef[0], + "se_normal": dml_lplr.se[0], + "se_ext": dml_lplr_ext.se[0], + } + return res_dict + + +@pytest.mark.ci +def test_doubleml_lplr_coef(doubleml_lplr_fixture): + assert math.isclose(doubleml_lplr_fixture["coef_normal"], doubleml_lplr_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_doubleml_lplr_se(doubleml_lplr_fixture): + assert math.isclose(doubleml_lplr_fixture["se_normal"], doubleml_lplr_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_doubleml_lplr_exceptions(): + ext_predictions = {"d": {}} + + x, y, d, _ = make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type="np.array", treatment="continuous") + + np.random.seed(3141) + dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d) + + kwargs = {"obj_dml_data": dml_data} + + dml_lplr = DoubleMLLPLR(ml_M=LogisticRegression(max_iter=1000), ml_t=LinearRegression(), ml_m=LinearRegression(), **kwargs) + np.random.seed(3141) + dml_lplr.fit(store_predictions=True) + + # prepare external predictions and dummy learners + + ml_M = LogisticRegression(max_iter=1000) + ml_t = LinearRegression() + ml_m = LinearRegression() + + # build second model with external predictions + dml_lplr_ext = DoubleMLLPLR(ml_M=ml_M, ml_t=ml_t, ml_m=ml_m, **kwargs) + + ext_predictions["d"]["ml_M"] = dml_lplr.predictions["ml_M"][:, :, 0] + # provide inner predictions per inner fold index + for i in range(dml_lplr.n_folds_inner - 1): + ext_predictions["d"][f"ml_M_inner_{i}"] = dml_lplr.predictions[f"ml_M_inner_{i}"][:, :, 0] + + msg = r"When providing external predictions for ml_M, also inner predictions for all inner folds" + with pytest.raises(ValueError, match=msg): + dml_lplr_ext.fit(external_predictions=ext_predictions) + + ext_predictions["d"][f"ml_M_inner_{dml_lplr.n_folds_inner-1}"] = (dml_lplr.predictions)[ + f"ml_M_inner_{dml_lplr.n_folds_inner-1}" + ][:, :, 0] + ext_predictions["d"]["ml_a"] = dml_lplr.predictions["ml_a"][:, :, 0] + for i in range(dml_lplr.n_folds_inner - 1): + ext_predictions["d"][f"ml_a_inner_{i}"] = dml_lplr.predictions[f"ml_a_inner_{i}"][:, :, 0] + + msg = r"When providing external predictions for ml_a, also inner predictions for all inner folds" + with pytest.raises(ValueError, match=msg): + dml_lplr_ext.fit(external_predictions=ext_predictions) diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py new file mode 100644 
index 00000000..7c7c4aeb
--- /dev/null
+++ b/doubleml/plm/tests/test_lplr_tune.py
@@ -0,0 +1,121 @@
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+import doubleml as dml
+
+from ..datasets import make_lplr_LZZ2020
+
+
+@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
+def learner_M(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_t(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_m(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_a(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["nuisance_space", "instrument"])
+def score(request):
+    return request.param
+
+
+def get_par_grid():
+    return {"n_estimators": [5, 10, 20]}
+
+
+@pytest.fixture(scope="module")
+def dml_lplr_fixture(
+    learner_M,
+    learner_t,
+    learner_m,
+    learner_a,
+    score,
+    tune_on_folds=True,
+):
+    par_grid = {
+        "ml_M": get_par_grid(),
+        "ml_t": get_par_grid(),
+        "ml_m": get_par_grid(),
+        "ml_a": get_par_grid(),
+    }
+    n_folds_tune = 4
+    n_folds = 5
+    alpha = 0.5
+
+    ml_M = clone(learner_M)
+    ml_t = clone(learner_t)
+    ml_m = clone(learner_m)
+    ml_a = clone(learner_a)
+
+    obj_dml_data = make_lplr_LZZ2020(alpha=alpha)
+    dml_lplr_obj = dml.DoubleMLLPLR(
+        obj_dml_data,
+        ml_M,
+        ml_t,
+        ml_m,
+        ml_a=ml_a,
+        n_folds=n_folds,
+        score=score,
+    )
+
+    # tune hyperparameters
+    tune_res = dml_lplr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False)
+    assert isinstance(tune_res, dml.DoubleMLLPLR)
+
+    dml_lplr_obj.fit()
+
+    res_dict = {
+        "coef": dml_lplr_obj.coef[0],
+        "se": dml_lplr_obj.se[0],
+        "true_coef": alpha,
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_dml_lplr_tune_coef(dml_lplr_fixture):
+    # true_coef should lie within three standard errors of the estimate
+    coef = dml_lplr_fixture["coef"]
+    se = dml_lplr_fixture["se"]
+    true_coef = dml_lplr_fixture["true_coef"]
+    assert abs(coef - true_coef) <= 3.0 * se
+
+
+@pytest.mark.ci
+def test_lplr_exception_tuning(
+    learner_M,
+    learner_t,
+    learner_m,
+    learner_a,
+):
+    # tuning on the full sample is not supported for LPLR
+    obj_dml_data = make_lplr_LZZ2020(alpha=0.5)
+    ml_M = clone(learner_M)
+    ml_t = clone(learner_t)
+    ml_m = clone(learner_m)
+
+    dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
+    par_grid = {
+        "ml_M": get_par_grid(),
+        "ml_t": get_par_grid(),
+        "ml_m": get_par_grid(),
+        "ml_a": get_par_grid(),
+    }
+    msg = "tune_on_folds must be True as targets have to be created for ml_t on folds."
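+    # tuning on the full sample is impossible because targets for ml_t are only constructed per fold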
+ with pytest.raises(ValueError, match=msg): + dml_lplr_obj.tune(par_grid, tune_on_folds=False) diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index f69b681e..95b6ea53 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -1,25 +1,8 @@ -import numpy as np import pandas as pd import pytest from doubleml import DoubleMLData from doubleml.datasets import fetch_401K, fetch_bonus -from doubleml.irm.datasets import ( - make_confounded_irm_data, - make_heterogeneous_data, - make_iivm_data, - make_irm_data, - make_irm_data_discrete_treatments, - make_ssm_data, -) -from doubleml.plm.datasets import ( - _make_pliv_data, - make_confounded_plr_data, - make_pliv_CHS2015, - make_pliv_multiway_cluster_CKMS2021, - make_plr_CCDDHNR2018, - make_plr_turrell2018, -) msg_inv_return_type = "Invalid return_type." @@ -53,244 +36,3 @@ def test_fetch_bonus_poly(): n_x = len(data_bonus_wo_poly.x_cols) data_bonus_w_poly = fetch_bonus(polynomial_features=True) assert len(data_bonus_w_poly.x_cols) == ((n_x + 1) * n_x / 2 + n_x) - - -@pytest.mark.ci -def test_make_plr_CCDDHNR2018_return_types(): - np.random.seed(3141) - res = make_plr_CCDDHNR2018(n_obs=100, return_type=DoubleMLData) - assert isinstance(res, DoubleMLData) - res = make_plr_CCDDHNR2018(n_obs=100, return_type=pd.DataFrame) - assert isinstance(res, pd.DataFrame) - x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type=np.ndarray) - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_plr_CCDDHNR2018(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_plr_turrell2018_return_types(): - np.random.seed(3141) - res = make_plr_turrell2018(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_plr_turrell2018(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d = make_plr_turrell2018(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_plr_turrell2018(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_irm_data_return_types(): - np.random.seed(3141) - res = make_irm_data(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_irm_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d = make_irm_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_irm_data(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_iivm_data_return_types(): - np.random.seed(3141) - res = make_iivm_data(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_iivm_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = make_iivm_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_iivm_data(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_pliv_data_return_types(): - np.random.seed(3141) - res = _make_pliv_data(n_obs=100, 
return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = _make_pliv_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = _make_pliv_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = _make_pliv_data(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_pliv_CHS2015_return_types(): - np.random.seed(3141) - res = make_pliv_CHS2015(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_pliv_CHS2015(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = make_pliv_CHS2015(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_pliv_CHS2015(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_pliv_multiway_cluster_CKMS2021_return_types(): - np.random.seed(3141) - res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(cluster_vars, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="matrix") - - -@pytest.fixture(scope="function", params=[True, False]) -def linear(request): - return request.param - - -@pytest.mark.ci -def test_make_confounded_irm_data_return_types(linear): - np.random.seed(3141) - res = make_confounded_irm_data(linear=linear) - assert isinstance(res, dict) - assert isinstance(res["x"], np.ndarray) - assert isinstance(res["y"], np.ndarray) - assert isinstance(res["d"], np.ndarray) - - assert isinstance(res["oracle_values"], dict) - assert isinstance(res["oracle_values"]["g_long"], np.ndarray) - assert isinstance(res["oracle_values"]["g_short"], np.ndarray) - assert isinstance(res["oracle_values"]["m_long"], np.ndarray) - assert isinstance(res["oracle_values"]["m_short"], np.ndarray) - assert isinstance(res["oracle_values"]["gamma_a"], float) - assert isinstance(res["oracle_values"]["beta_a"], float) - assert isinstance(res["oracle_values"]["a"], np.ndarray) - assert isinstance(res["oracle_values"]["y_0"], np.ndarray) - assert isinstance(res["oracle_values"]["y_1"], np.ndarray) - assert isinstance(res["oracle_values"]["z"], np.ndarray) - assert isinstance(res["oracle_values"]["cf_y"], float) - assert isinstance(res["oracle_values"]["cf_d_ate"], float) - assert isinstance(res["oracle_values"]["cf_d_atte"], float) - assert isinstance(res["oracle_values"]["rho_ate"], float) - assert isinstance(res["oracle_values"]["rho_atte"], float) - - -@pytest.mark.ci -def test_make_confounded_plr_data_return_types(): - np.random.seed(3141) - res = make_confounded_plr_data(theta=5.0) - assert isinstance(res, dict) - assert isinstance(res["x"], np.ndarray) - assert isinstance(res["y"], np.ndarray) - assert isinstance(res["d"], np.ndarray) - - 
assert isinstance(res["oracle_values"], dict) - assert isinstance(res["oracle_values"]["g_long"], np.ndarray) - assert isinstance(res["oracle_values"]["g_short"], np.ndarray) - assert isinstance(res["oracle_values"]["m_long"], np.ndarray) - assert isinstance(res["oracle_values"]["m_short"], np.ndarray) - assert isinstance(res["oracle_values"]["theta"], float) - assert isinstance(res["oracle_values"]["gamma_a"], float) - assert isinstance(res["oracle_values"]["beta_a"], float) - assert isinstance(res["oracle_values"]["a"], np.ndarray) - assert isinstance(res["oracle_values"]["z"], np.ndarray) - - -@pytest.fixture(scope="function", params=[False, True]) -def binary_treatment(request): - return request.param - - -@pytest.fixture(scope="function", params=[1, 2]) -def n_x(request): - return request.param - - -@pytest.mark.ci -def test_make_heterogeneous_data_return_types(binary_treatment, n_x): - np.random.seed(3141) - res = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=binary_treatment) - assert isinstance(res, dict) - assert isinstance(res["data"], pd.DataFrame) - assert isinstance(res["effects"], np.ndarray) - assert callable(res["treatment_effect"]) - - # test input checks - msg = "n_x must be either 1 or 2." - with pytest.raises(AssertionError, match=msg): - _ = make_heterogeneous_data(n_obs=100, n_x=0, binary_treatment=binary_treatment) - msg = "support_size must be smaller than p." - with pytest.raises(AssertionError, match=msg): - _ = make_heterogeneous_data(n_obs=100, n_x=n_x, support_size=31, binary_treatment=binary_treatment) - msg = "binary_treatment must be a boolean." - with pytest.raises(AssertionError, match=msg): - _ = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=2) - - -@pytest.mark.ci -def test_make_ssm_data_return_types(): - np.random.seed(3141) - res = make_ssm_data(n_obs=100) - assert isinstance(res, DoubleMLData) - res = make_ssm_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z, s = make_ssm_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - assert isinstance(s, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_ssm_data(n_obs=100, return_type="matrix") - - -@pytest.fixture(scope="function", params=[3, 5]) -def n_levels(request): - return request.param - - -def test_make_data_discrete_treatments(n_levels): - np.random.seed(3141) - n = 100 - data_apo = make_irm_data_discrete_treatments(n_obs=n, n_levels=3) - assert isinstance(data_apo, dict) - assert isinstance(data_apo["y"], np.ndarray) - assert isinstance(data_apo["d"], np.ndarray) - assert isinstance(data_apo["x"], np.ndarray) - assert isinstance(data_apo["oracle_values"], dict) - - assert isinstance(data_apo["oracle_values"]["cont_d"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["level_bounds"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["potential_level"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["ite"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["y0"], np.ndarray) - - msg = "n_levels must be at least 2." - with pytest.raises(ValueError, match=msg): - _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1) - - msg = "n_levels must be an integer." 
-    with pytest.raises(ValueError, match=msg):
-        _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1.1)
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index 94b5f824..4fca5318 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -15,6 +15,7 @@
     DoubleMLDIDData,
     DoubleMLIIVM,
     DoubleMLIRM,
+    DoubleMLLPLR,
     DoubleMLLPQ,
     DoubleMLPLIV,
     DoubleMLPLR,
@@ -23,7 +24,12 @@
 )
 from doubleml.did.datasets import make_did_SZ2020
 from doubleml.irm.datasets import make_iivm_data, make_irm_data
-from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
+from doubleml.plm.datasets import (
+    make_lplr_LZZ2020,
+    make_pliv_CHS2015,
+    make_pliv_multiway_cluster_CKMS2021,
+    make_plr_CCDDHNR2018,
+)
 from doubleml.utils import PSProcessorConfig

 from ._utils import DummyDataClass

@@ -689,6 +695,28 @@ def test_doubleml_exception_smpls():
         _ = dml_pliv_cluster.set_sample_splitting(all_smpls=dml_pliv_cluster.smpls, all_smpls_cluster=all_smpls_cluster)


+@pytest.mark.ci
+def test_doubleml_exception_smpls_inner():
+    dml_plr_no_inner = DoubleMLPLR(dml_data, ml_l, ml_m)
+    msg = "smpls_inner is only available for double sample splitting."
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_plr_no_inner.smpls_inner
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_plr_no_inner._DoubleML__smpls__inner
+
+    dml_data_lplr = make_lplr_LZZ2020()
+    ml_M = LogisticRegression()
+    dml_lplr_inner_no_smpls = DoubleMLLPLR(dml_data_lplr, ml_M, ml_m, ml_m, draw_sample_splitting=False)
+    msg = (
+        "Sample splitting not specified. "
+        r"Either draw samples via .draw_sample_splitting\(\) or set external samples via .set_sample_splitting\(\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_lplr_inner_no_smpls.smpls_inner
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_lplr_inner_no_smpls._DoubleML__smpls__inner
+
+
 @pytest.mark.ci
 def test_doubleml_exception_fit():
     msg = "The number of CPUs used to fit the learners must be of int type. 5 of type <class 'str'> was passed."
diff --git a/doubleml/tests/test_nonlinear_score_mixin.py b/doubleml/tests/test_nonlinear_score_mixin.py
index 0fce08c3..d68785aa 100644
--- a/doubleml/tests/test_nonlinear_score_mixin.py
+++ b/doubleml/tests/test_nonlinear_score_mixin.py
@@ -253,3 +253,29 @@ def test_nonlinear_warnings(generate_data1, coef_bounds):
     with pytest.warns(UserWarning, match=msg):
         dml_plr_obj._coef_bounds = coef_bounds
         dml_plr_obj.fit()
+
+
+@pytest.mark.ci
+def test_nonlinear_errors(generate_data1, coef_bounds):
+    # collect data
+    data = generate_data1
+    x_cols = data.columns[data.columns.str.startswith("X")].tolist()
+
+    np.random.seed(3141)
+    obj_dml_data = dml.DoubleMLData(data, "y", ["d"], x_cols)
+
+    dml_plr_obj = DoubleMLPLRWithNonLinearScoreMixin(obj_dml_data, LinearRegression(), LinearRegression(), score="no_root_pos")
+    dml_plr_obj._error_on_convergence_failure = True
+
+    msg = "Could not find a root of the score function."
+    with pytest.raises(ValueError, match=msg):
+        dml_plr_obj._coef_bounds = coef_bounds
+        dml_plr_obj.fit()
+
+    dml_plr_obj = DoubleMLPLRWithNonLinearScoreMixin(obj_dml_data, LinearRegression(), LinearRegression(), score="no_root_neg")
+    dml_plr_obj._error_on_convergence_failure = True
+
+    msg = "Could not find a root of the score function."
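+    # the "no_root_neg" score likewise admits no root, so fit() must raise again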
+    with pytest.raises(ValueError, match=msg):
+        dml_plr_obj._coef_bounds = coef_bounds
+        dml_plr_obj.fit()
diff --git a/doubleml/tests/test_set_sample_splitting_exceptions.py b/doubleml/tests/test_set_sample_splitting_exceptions.py
new file mode 100644
index 00000000..eba513ca
--- /dev/null
+++ b/doubleml/tests/test_set_sample_splitting_exceptions.py
@@ -0,0 +1,28 @@
+import numpy as np
+import pytest
+from sklearn.linear_model import Lasso, LogisticRegression
+
+from doubleml import DoubleMLLPLR
+from doubleml.plm.datasets import make_lplr_LZZ2020
+
+np.random.seed(3141)
+
+dml_data_lplr = make_lplr_LZZ2020(n_obs=10)
+n_obs = dml_data_lplr.n_obs
+ml_M = LogisticRegression()
+ml_t = Lasso()
+ml_m = Lasso()
+dml_lplr = DoubleMLLPLR(dml_data_lplr, ml_M, ml_t, ml_m, n_folds=7, n_rep=8, draw_sample_splitting=False)
+
+
+@pytest.mark.ci
+def test_doubleml_exceptions_double_sample_splitting():
+    smpls = (np.arange(n_obs), np.arange(n_obs))
+    msg = "set_sample_splitting not supported for double sample splitting."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.set_sample_splitting(smpls)
+
+    dml_lplr._is_cluster_data = True
+    msg = "Cluster data not supported for double sample splitting."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.draw_sample_splitting()
diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py
index 3d99d93a..b79c7618 100644
--- a/doubleml/utils/_estimation.py
+++ b/doubleml/utils/_estimation.py
@@ -38,13 +38,25 @@ def _get_cond_smpls_2d(smpls, bin_var1, bin_var2):
     return smpls_00, smpls_01, smpls_10, smpls_11


-def _fit(estimator, x, y, train_index, idx=None):
-    estimator.fit(x[train_index, :], y[train_index])
+def _fit(estimator, x, y, train_index, idx=None, sample_weights=None):
+    if sample_weights is not None:
+        estimator.fit(x[train_index, :], y[train_index], sample_weight=sample_weights[train_index])
+    else:
+        estimator.fit(x[train_index, :], y[train_index])
     return estimator, idx


 def _dml_cv_predict(
-    estimator, x, y, smpls=None, n_jobs=None, est_params=None, method="predict", return_train_preds=False, return_models=False
+    estimator,
+    x,
+    y,
+    smpls=None,
+    n_jobs=None,
+    est_params=None,
+    method="predict",
+    return_train_preds=False,
+    return_models=False,
+    sample_weights=None,
 ):
     n_obs = x.shape[0]
@@ -57,14 +69,33 @@ def _dml_cv_predict(
     res = {"models": None}

     if not manual_cv_predict:
+        # prepare fit_params for cross_val_predict
+        fit_params_for_cv = {"sample_weight": sample_weights} if sample_weights is not None else None
+
         if est_params is None:
             # if there are no parameters set we redirect to the standard method
-            preds = cross_val_predict(clone(estimator), x, y, cv=smpls, n_jobs=n_jobs, method=method)
+            preds = cross_val_predict(
+                clone(estimator),
+                x,
+                y,
+                cv=smpls,
+                n_jobs=n_jobs,
+                method=method,
+                params=fit_params_for_cv,
+            )
         else:
             assert isinstance(est_params, dict)
             # if no fold-specific parameters we redirect to the standard method
             # warnings.warn("Using the same (hyper-)parameters for all folds")
-            preds = cross_val_predict(clone(estimator).set_params(**est_params), x, y, cv=smpls, n_jobs=n_jobs, method=method)
+            preds = cross_val_predict(
+                clone(estimator).set_params(**est_params),
+                x,
+                y,
+                cv=smpls,
+                n_jobs=n_jobs,
+                method=method,
+                params=fit_params_for_cv,
+            )
         if method == "predict_proba":
             res["preds"] = preds[:, 1]
         else:
@@ -73,7 +104,7 @@ def _dml_cv_predict(
     else:
         if not smpls_is_partition:
             assert not fold_specific_target, "combination of fold-specific y and no cross-fitting not implemented yet"
-            assert len(smpls) == 1
+            # more than one fold is possible here, e.g. for the inner folds used with double sample splitting
         if method == "predict_proba":
             assert not fold_specific_target  # fold_specific_target only needed for PLIV.partialXZ
@@ -95,19 +126,28 @@

         if est_params is None:
             fitted_models = parallel(
-                delayed(_fit)(clone(estimator), x, y_list[idx], train_index, idx)
+                delayed(_fit)(clone(estimator), x, y_list[idx], train_index, idx, sample_weights=sample_weights)
                 for idx, (train_index, test_index) in enumerate(smpls)
             )
         elif isinstance(est_params, dict):
             # warnings.warn("Using the same (hyper-)parameters for all folds")
             fitted_models = parallel(
-                delayed(_fit)(clone(estimator).set_params(**est_params), x, y_list[idx], train_index, idx)
+                delayed(_fit)(
+                    clone(estimator).set_params(**est_params), x, y_list[idx], train_index, idx, sample_weights=sample_weights
+                )
                 for idx, (train_index, test_index) in enumerate(smpls)
             )
         else:
             assert len(est_params) == len(smpls), "provide one parameter setting per fold"
             fitted_models = parallel(
-                delayed(_fit)(clone(estimator).set_params(**est_params[idx]), x, y_list[idx], train_index, idx)
+                delayed(_fit)(
+                    clone(estimator).set_params(**est_params[idx]),
+                    x,
+                    y_list[idx],
+                    train_index,
+                    idx,
+                    sample_weights=sample_weights,
+                )
                 for idx, (train_index, test_index) in enumerate(smpls)
             )
@@ -148,10 +188,20 @@

 def _dml_tune(
-    y, x, train_inds, learner, param_grid, scoring_method, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search
+    y,
+    x,
+    train_inds,
+    learner,
+    param_grid,
+    scoring_method,
+    n_folds_tune,
+    n_jobs_cv,
+    search_mode,
+    n_iter_randomized_search,
+    fold_specific_target=False,
 ):
     tune_res = list()
-    for train_index in train_inds:
+    for i, train_index in enumerate(train_inds):
         tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
         if search_mode == "grid_search":
             g_grid_search = GridSearchCV(learner, param_grid, scoring=scoring_method, cv=tune_resampling, n_jobs=n_jobs_cv)
@@ -165,7 +215,10 @@
                 n_jobs=n_jobs_cv,
                 n_iter=n_iter_randomized_search,
             )
-        tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index]))
+        if fold_specific_target:
+            # fold-specific targets come as a list with one entry per training set
+            tune_res.append(g_grid_search.fit(x[train_index, :], y[i]))
+        else:
+            tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index]))
     return tune_res
@@ -332,7 +385,7 @@ def _set_external_predictions(external_predictions, learners, treatment, i_rep):
             ext_prediction_dict[learner] = None
         elif learner in external_predictions[treatment].keys():
             if isinstance(external_predictions[treatment][learner], np.ndarray):
-                ext_prediction_dict[learner] = external_predictions[treatment][learner][:, i_rep]
+                ext_prediction_dict[learner] = external_predictions[treatment][learner][:, i_rep].astype(float)
             else:
                 ext_prediction_dict[learner] = None
         else:
diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py
index 188d2f24..4f49a3d2 100644
--- a/doubleml/utils/resampling.py
+++ b/doubleml/utils/resampling.py
@@ -25,6 +25,56 @@ def split_samples(self):
         return smpls


+class DoubleMLDoubleResampling:
+    def __init__(self, n_folds, n_folds_inner, n_rep, n_obs, stratify=None):
+        self.n_folds = n_folds
+        self.n_folds_inner = n_folds_inner
+        self.n_rep = n_rep
+        self.n_obs = n_obs
+        self.stratify = np.array(stratify) if stratify is not None else None
+
+        if n_folds < 2:
+            raise ValueError(
+                "n_folds must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold."
+            )
+        if n_folds_inner < 2:
+            raise ValueError(
+                "n_folds_inner must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold."
+            )
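+        # the outer splitter draws n_folds x n_rep folds; the inner splitter re-partitions each outer training set once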
+        if self.stratify is None:
+            self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep)
+            self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner, n_repeats=1)
+        else:
+            self.resampling = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_rep)
+            self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner, n_repeats=1)
+
+    def split_samples(self):
+        all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)]
+        smpls = [all_smpls[(i_repeat * self.n_folds) : ((i_repeat + 1) * self.n_folds)] for i_repeat in range(self.n_rep)]
+        smpls_inner = []
+        for i_rep in range(self.n_rep):
+            smpls_inner_rep = []
+            for train, _ in smpls[i_rep]:
+                if self.stratify is None:
+                    smpls_inner_rep.append(
+                        [
+                            (train[train_inner], train[test_inner])
+                            for train_inner, test_inner in self.resampling_inner.split(X=train)
+                        ]
+                    )
+                else:
+                    smpls_inner_rep.append(
+                        [
+                            (train[train_inner], train[test_inner])
+                            for train_inner, test_inner in self.resampling_inner.split(X=train, y=self.stratify[train])
+                        ]
+                    )
+            smpls_inner.append(smpls_inner_rep)
+
+        return smpls, smpls_inner
+
+
 class DoubleMLClusterResampling:
     def __init__(self, n_folds, n_rep, n_obs, n_cluster_vars, cluster_vars):
         self.n_folds = n_folds
diff --git a/doubleml/utils/tests/test_resampling.py b/doubleml/utils/tests/test_resampling.py
new file mode 100644
index 00000000..3ecfbada
--- /dev/null
+++ b/doubleml/utils/tests/test_resampling.py
@@ -0,0 +1,49 @@
+import pytest
+
+from doubleml.utils.resampling import DoubleMLDoubleResampling
+
+
+@pytest.mark.ci
+def test_DoubleMLDoubleResampling_stratify():
+    n_folds = 5
+    n_folds_inner = 3
+    n_rep = 2
+    n_obs = 100
+    stratify = [0] * 50 + [1] * 50
+
+    obj_dml_double_resampling = DoubleMLDoubleResampling(
+        n_folds=n_folds,
+        n_folds_inner=n_folds_inner,
+        n_rep=n_rep,
+        n_obs=n_obs,
+        stratify=stratify,
+    )
+    smpls, smpls_inner = obj_dml_double_resampling.split_samples()
+
+    assert len(smpls) == n_rep
+    assert len(smpls_inner) == n_rep
+
+    for i_rep in range(n_rep):
+        assert len(smpls[i_rep]) == n_folds
+        assert len(smpls_inner[i_rep]) == n_folds
+
+        for i_fold in range(n_folds):
+            train_ind, _ = smpls[i_rep][i_fold]
+            smpls_inner_rep_fold = smpls_inner[i_rep][i_fold]
+            assert len(smpls_inner_rep_fold) == n_folds_inner
+
+            for i_fold_inner in range(n_folds_inner):
+                train_ind_inner, test_ind_inner = smpls_inner_rep_fold[i_fold_inner]
+                assert set(train_ind_inner).issubset(set(train_ind))
+                assert set(test_ind_inner).issubset(set(train_ind))
+
+
+@pytest.mark.ci
+def test_DoubleMLDoubleResampling_exceptions():
+    msg = "n_folds must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLDoubleResampling(1, 5, 1, 100)
+
+    msg = "n_folds_inner must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLDoubleResampling(5, 1, 1, 100)
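+
+
+# Minimal usage sketch (illustrative only, not executed in CI): the outer splits
+# have the same layout as DoubleMLResampling, while smpls_inner[i_rep][i_fold]
+# re-partitions the corresponding outer training indices into n_folds_inner folds.
+#
+#     smpls, smpls_inner = DoubleMLDoubleResampling(
+#         n_folds=2, n_folds_inner=2, n_rep=1, n_obs=6
+#     ).split_samples()
+#     train_0, _ = smpls[0][0]
+#     inner_train, inner_test = smpls_inner[0][0][0]
+#     assert set(inner_train) | set(inner_test) == set(train_0)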