From df03887c56b81d34ad7c499976ceca24e806e71f Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 13 Jan 2025 10:32:40 +0100 Subject: [PATCH 01/48] Logistic regression implementation WIP --- doubleml/__init__.py | 2 + doubleml/double_ml.py | 6 + doubleml/double_ml_data.py | 1049 +++++++++++++++++++++++++++++++++ doubleml/logistic/logistic.py | 463 +++++++++++++++ doubleml/utils/resampling.py | 45 ++ 5 files changed, 1565 insertions(+) create mode 100644 doubleml/double_ml_data.py create mode 100644 doubleml/logistic/logistic.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 6cf7de962..935491167 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -15,6 +15,7 @@ from .irm.ssm import DoubleMLSSM from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR +from .logistic.logistic import DoubleMLLogit from .utils.blp import DoubleMLBLP from .utils.policytree import DoubleMLPolicyTree @@ -42,6 +43,7 @@ "DoubleMLBLP", "DoubleMLPolicyTree", "DoubleMLSSM", + "DoubleMLLogit", ] __version__ = importlib.metadata.version("doubleml") diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 05481bf16..1cc6bcf9b 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -585,6 +585,12 @@ def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None, # construct framework for inference self._framework = self.construct_framework() + + + + + + return self def construct_framework(self): diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py new file mode 100644 index 000000000..4f8d7cbc7 --- /dev/null +++ b/doubleml/double_ml_data.py @@ -0,0 +1,1049 @@ +import numpy as np +import pandas as pd +import io + +from abc import ABC, abstractmethod + +from sklearn.utils.validation import check_array, column_or_1d, check_consistent_length +from sklearn.utils import assert_all_finite +from sklearn.utils.multiclass import type_of_target +from .utils._estimation import _assure_2d_array +from .utils._checks 
import _check_set + + +class DoubleMLBaseData(ABC): + """Base Class Double machine learning data-backends + """ + def __init__(self, + data): + if not isinstance(data, pd.DataFrame): + raise TypeError('data must be of pd.DataFrame type. ' + f'{str(data)} of type {str(type(data))} was passed.') + if not data.columns.is_unique: + raise ValueError('Invalid pd.DataFrame: ' + 'Contains duplicate column names.') + self._data = data + + def __str__(self): + data_summary = self._data_summary_str() + buf = io.StringIO() + self.data.info(verbose=False, buf=buf) + df_info = buf.getvalue() + res = '================== DoubleMLBaseData Object ==================\n' + \ + '\n------------------ Data summary ------------------\n' + data_summary + \ + '\n------------------ DataFrame info ------------------\n' + df_info + return res + + def _data_summary_str(self): + data_summary = f'No. Observations: {self.n_obs}\n' + return data_summary + + @property + def data(self): + """ + The data. + """ + return self._data + + @property + def all_variables(self): + """ + All variables available in the dataset. + """ + return self.data.columns + + @property + def n_obs(self): + """ + The number of observations. + """ + return self.data.shape[0] + + # TODO: This and the following property does not make sense but the base class DoubleML needs it (especially for the + # multiple treatment variables case) and other things are also build around it, see for example DoubleML._params + @property + def d_cols(self): + return ['theta'] + + @property + def n_treat(self): + """ + The number of treatment variables. + """ + return 1 + + @property + @abstractmethod + def n_coefs(self): + pass + + +class DoubleMLData(DoubleMLBaseData): + """Double machine learning data-backend. + + :class:`DoubleMLData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. 
+ + y_col : str + The outcome variable. + + d_cols : str or list + The treatment variable(s). + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. + + t_col : None or str + The time variable (only relevant/used for DiD Estimators). + Default is ``None``. + + s_col : None or str + The score or selection variable (only relevant/used for RDD or SSM Estimatiors). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. 
+ + Examples + -------- + >>> from doubleml import DoubleMLData + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> # initialization from pandas.DataFrame + >>> df = make_plr_CCDDHNR2018(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLData(df, 'y', 'd') + >>> # initialization from np.ndarray + >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') + >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) + """ + def __init__(self, + data, + y_col, + d_cols, + x_cols=None, + z_cols=None, + t_col=None, + s_col=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True): + DoubleMLBaseData.__init__(self, data) + + self.y_col = y_col + self.d_cols = d_cols + self.z_cols = z_cols + self.t_col = t_col + self.s_col = s_col + self.x_cols = x_cols + self._check_disjoint_sets_y_d_x_z_t_s() + self.use_other_treat_as_covariate = use_other_treat_as_covariate + self.force_all_x_finite = force_all_x_finite + self._binary_treats = self._check_binary_treats() + self._binary_outcome = self._check_binary_outcome() + self._set_y_z_t_s() + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + def __str__(self): + data_summary = self._data_summary_str() + buf = io.StringIO() + self.data.info(verbose=False, buf=buf) + df_info = buf.getvalue() + res = '================== DoubleMLData Object ==================\n' + \ + '\n------------------ Data summary ------------------\n' + data_summary + \ + '\n------------------ DataFrame info ------------------\n' + df_info + return res + + def _data_summary_str(self): + data_summary = f'Outcome variable: {self.y_col}\n' \ + f'Treatment variable(s): {self.d_cols}\n' \ + f'Covariates: {self.x_cols}\n' \ + f'Instrument variable(s): {self.z_cols}\n' + if self.t_col is not None: + data_summary += f'Time variable: {self.t_col}\n' + if self.s_col is not None: + data_summary += f'Score/Selection variable: {self.s_col}\n' + data_summary += f'No. 
Observations: {self.n_obs}\n' + return data_summary + + @classmethod + def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covariate=True, + force_all_x_finite=True): + """ + Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. + + d : :class:`numpy.ndarray` + Array of treatment variables. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + t : :class:`numpy.ndarray` + Array of the time variable (only relevant/used for DiD models). + Default is ``None``. + + s : :class:`numpy.ndarray` + Array of the score or selection variable (only relevant/used for RDD and SSM models). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. 
+ + Examples + -------- + >>> from doubleml import DoubleMLData + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') + >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) + """ + if isinstance(force_all_x_finite, str): + if force_all_x_finite != 'allow-nan': + raise ValueError("Invalid force_all_x_finite " + force_all_x_finite + ". " + + "force_all_x_finite must be True, False or 'allow-nan'.") + elif not isinstance(force_all_x_finite, bool): + raise TypeError("Invalid force_all_x_finite. " + + "force_all_x_finite must be True, False or 'allow-nan'.") + + x = check_array(x, ensure_2d=False, allow_nd=False, + force_all_finite=force_all_x_finite) + d = check_array(d, ensure_2d=False, allow_nd=False) + y = column_or_1d(y, warn=True) + + x = _assure_2d_array(x) + d = _assure_2d_array(d) + + y_col = 'y' + if z is None: + check_consistent_length(x, y, d) + z_cols = None + else: + z = check_array(z, ensure_2d=False, allow_nd=False) + z = _assure_2d_array(z) + check_consistent_length(x, y, d, z) + if z.shape[1] == 1: + z_cols = ['z'] + else: + z_cols = [f'z{i + 1}' for i in np.arange(z.shape[1])] + + if t is None: + t_col = None + else: + t = column_or_1d(t, warn=True) + check_consistent_length(x, y, d, t) + t_col = 't' + + if s is None: + s_col = None + else: + s = column_or_1d(s, warn=True) + check_consistent_length(x, y, d, s) + s_col = 's' + + if d.shape[1] == 1: + d_cols = ['d'] + else: + d_cols = [f'd{i+1}' for i in np.arange(d.shape[1])] + + x_cols = [f'X{i+1}' for i in np.arange(x.shape[1])] + + # basline version with features, outcome and treatments + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + [y_col] + d_cols) + + if z is not None: + df_z = pd.DataFrame(z, columns=z_cols) + data = pd.concat([data, df_z], axis=1) + + if t is not None: + data[t_col] = t + + if s is not None: + data[s_col] = s + + return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, 
use_other_treat_as_covariate, force_all_x_finite) + + @property + def x(self): + """ + Array of covariates; + Dynamic! May depend on the currently set treatment variable; + To get an array of all covariates (independent of the currently set treatment variable) + call ``obj.data[obj.x_cols].values``. + """ + return self._X.values + + @property + def y(self): + """ + Array of outcome variable. + """ + return self._y.values + + @property + def d(self): + """ + Array of treatment variable; + Dynamic! Depends on the currently set treatment variable; + To get an array of all treatment variables (independent of the currently set treatment variable) + call ``obj.data[obj.d_cols].values``. + """ + return self._d.values + + @property + def z(self): + """ + Array of instrumental variables. + """ + if self.z_cols is not None: + return self._z.values + else: + return None + + @property + def t(self): + """ + Array of time variable. + """ + if self.t_col is not None: + return self._t.values + else: + return None + + @property + def s(self): + """ + Array of score or selection variable. + """ + if self.s_col is not None: + return self._s.values + else: + return None + + @property + def n_treat(self): + """ + The number of treatment variables. + """ + return len(self.d_cols) + + @property + def n_coefs(self): + """ + The number of coefficients to be estimated. + """ + return self.n_treat + + @property + def n_instr(self): + """ + The number of instruments. + """ + if self.z_cols is not None: + n_instr = len(self.z_cols) + else: + n_instr = 0 + return n_instr + + @property + def binary_treats(self): + """ + Series with logical(s) indicating whether the treatment variable(s) are binary with values 0 and 1. + """ + return self._binary_treats + + @property + def binary_outcome(self): + """ + Logical indicating whether the outcome variable is binary with values 0 and 1. + """ + return self._binary_outcome + + @property + def x_cols(self): + """ + The covariates. 
+ """ + return self._x_cols + + @x_cols.setter + def x_cols(self, value): + reset_value = hasattr(self, '_x_cols') + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The covariates x_cols must be of str or list type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(se + + t(value)) == len(value): + raise ValueError('Invalid covariates x_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid covariates x_cols. ' + 'At least one covariate is no data column.') + assert set(value).issubset(set(self.all_variables)) + self._x_cols = value + else: + excluded_cols = set.union({self.y_col}, set(self.d_cols)) + if (self.z_cols is not None): + excluded_cols = set.union(excluded_cols, set(self.z_cols)) + for col in [self.t_col, self.s_col]: + col = _check_set(col) + excluded_cols = set.union(excluded_cols, col) + self._x_cols = [col for col in self.data.columns if col not in excluded_cols] + if reset_value: + self._check_disjoint_sets() + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + @property + def d_cols(self): + """ + The treatment variable(s). + """ + return self._d_cols + + @d_cols.setter + def d_cols(self, value): + reset_value = hasattr(self, '_d_cols') + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The treatment variable(s) d_cols must be of str or list type. ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid treatment variable(s) d_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid treatment variable(s) d_cols. 
' + 'At least one treatment variable is no data column.') + self._d_cols = value + if reset_value: + self._check_disjoint_sets() + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + @property + def y_col(self): + """ + The outcome variable. + """ + return self._y_col + + @y_col.setter + def y_col(self, value): + reset_value = hasattr(self, '_y_col') + if not isinstance(value, str): + raise TypeError('The outcome variable y_col must be of str type. ' + f'{str(value)} of type {str(type(value))} was passed.') + if value not in self.all_variables: + raise ValueError('Invalid outcome variable y_col. ' + f'{value} is no data column.') + self._y_col = value + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def z_cols(self): + """ + The instrumental variable(s). + """ + return self._z_cols + + @z_cols.setter + def z_cols(self, value): + reset_value = hasattr(self, '_z_cols') + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The instrumental variable(s) z_cols must be of str or list type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid instrumental variable(s) z_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid instrumental variable(s) z_cols. ' + 'At least one instrumental variable is no data column.') + self._z_cols = value + else: + self._z_cols = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def t_col(self): + """ + The time variable. + """ + return self._t_col + + @t_col.setter + def t_col(self, value): + reset_value = hasattr(self, '_t_col') + if value is not None: + if not isinstance(value, str): + raise TypeError('The time variable t_col must be of str type (or None). 
' + f'{str(value)} of type {str(type(value))} was passed.') + if value not in self.all_variables: + raise ValueError('Invalid time variable t_col. ' + f'{value} is no data column.') + self._t_col = value + else: + self._t_col = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def s_col(self): + """ + The score or selection variable. + """ + return self._s_col + + @s_col.setter + def s_col(self, value): + reset_value = hasattr(self, '_s_col') + if value is not None: + if not isinstance(value, str): + raise TypeError('The score or selection variable s_col must be of str type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if value not in self.all_variables: + raise ValueError('Invalid score or selection variable s_col. ' + f'{value} is no data column.') + self._s_col = value + else: + self._s_col = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def use_other_treat_as_covariate(self): + """ + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + """ + return self._use_other_treat_as_covariate + + @use_other_treat_as_covariate.setter + def use_other_treat_as_covariate(self, value): + reset_value = hasattr(self, '_use_other_treat_as_covariate') + if not isinstance(value, bool): + raise TypeError('use_other_treat_as_covariate must be True or False. ' + f'Got {str(value)}.') + self._use_other_treat_as_covariate = value + if reset_value: + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + @property + def force_all_x_finite(self): + """ + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. 
+ """ + return self._force_all_x_finite + + @force_all_x_finite.setter + def force_all_x_finite(self, value): + reset_value = hasattr(self, '_force_all_x_finite') + if isinstance(value, str): + if value != 'allow-nan': + raise ValueError("Invalid force_all_x_finite " + value + ". " + + "force_all_x_finite must be True, False or 'allow-nan'.") + elif not isinstance(value, bool): + raise TypeError("Invalid force_all_x_finite. " + + "force_all_x_finite must be True, False or 'allow-nan'.") + self._force_all_x_finite = value + if reset_value: + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + def _set_y_z_t_s(self): + assert_all_finite(self.data.loc[:, self.y_col]) + self._y = self.data.loc[:, self.y_col] + if self.z_cols is None: + self._z = None + else: + assert_all_finite(self.data.loc[:, self.z_cols]) + self._z = self.data.loc[:, self.z_cols] + + if self.t_col is None: + self._t = None + else: + assert_all_finite(self.data.loc[:, self.t_col]) + self._t = self.data.loc[:, self.t_col] + + if self.s_col is None: + self._s = None + else: + assert_all_finite(self.data.loc[:, self.s_col]) + self._s = self.data.loc[:, self.s_col] + + def set_x_d(self, treatment_var): + """ + Function that assigns the role for the treatment variables in the multiple-treatment case. + + Parameters + ---------- + treatment_var : str + Active treatment variable that will be set to d. + """ + if not isinstance(treatment_var, str): + raise TypeError('treatment_var must be of str type. ' + f'{str(treatment_var)} of type {str(type(treatment_var))} was passed.') + if treatment_var not in self.d_cols: + raise ValueError('Invalid treatment_var. 
' + f'{treatment_var} is not in d_cols.') + if self.use_other_treat_as_covariate: + # note that the following line needs to be adapted in case an intersection of x_cols and d_cols as allowed + # (see https://github.com/DoubleML/doubleml-for-py/issues/83) + xd_list = self.x_cols + self.d_cols + xd_list.remove(treatment_var) + else: + xd_list = self.x_cols + assert_all_finite(self.data.loc[:, treatment_var]) + if self.force_all_x_finite: + assert_all_finite(self.data.loc[:, xd_list], + allow_nan=self.force_all_x_finite == 'allow-nan') + self._d = self.data.loc[:, treatment_var] + self._X = self.data.loc[:, xd_list] + + def _check_binary_treats(self): + is_binary = pd.Series(dtype=bool, index=self.d_cols) + for treatment_var in self.d_cols: + this_d = self.data.loc[:, treatment_var] + binary_treat = (type_of_target(this_d) == 'binary') + zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) + is_binary[treatment_var] = (binary_treat & zero_one_treat) + return is_binary + + def _check_binary_outcome(self): + y = self.data.loc[:, self.y_col] + binary_outcome = (type_of_target(y) == 'binary') + zero_one_outcome = np.all((np.power(y, 2) - y) == 0) + is_binary = (binary_outcome & zero_one_outcome) + return is_binary + + def _check_disjoint_sets(self): + # this function can be extended in inherited subclasses + self._check_disjoint_sets_y_d_x_z_t_s() + + def _check_disjoint_sets_y_d_x_z_t_s(self): + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + + if not y_col_set.isdisjoint(x_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and covariate in ' + '``x_cols``.') + if not y_col_set.isdisjoint(d_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and treatment variable in ' + '``d_cols``.') + # note that the line xd_list = self.x_cols + self.d_cols in method set_x_d needs adaption if an intersection of + # x_cols and d_cols as allowed (see 
https://github.com/DoubleML/doubleml-for-py/issues/83) + if not d_cols_set.isdisjoint(x_cols_set): + raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and as covariate' + '(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``.') + + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not y_col_set.isdisjoint(z_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental ' + 'variable in ``z_cols``.') + if not d_cols_set.isdisjoint(z_cols_set): + raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' + 'instrumental variable in ``z_cols``.') + if not x_cols_set.isdisjoint(z_cols_set): + raise ValueError('At least one variable/column is set as covariate (``x_cols``) and instrumental ' + 'variable in ``z_cols``.') + + self._check_disjoint_sets_t_s() + + def _check_disjoint_sets_t_s(self): + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + + if self.t_col is not None: + t_col_set = {self.t_col} + if not t_col_set.isdisjoint(x_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and covariate in ' + '``x_cols``.') + if not t_col_set.isdisjoint(d_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and treatment variable in ' + '``d_cols``.') + if not t_col_set.isdisjoint(y_col_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and outcome variable ' + '``y_col``.') + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not t_col_set.isdisjoint(z_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and instrumental ' + 'variable in ``z_cols``.') + + if self.s_col is not None: + s_col_set = {self.s_col} + if not s_col_set.isdisjoint(x_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or 
selection variable ``s_col`` and covariate in ' + '``x_cols``.') + if not s_col_set.isdisjoint(d_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment ' + 'variable in ``d_cols``.') + if not s_col_set.isdisjoint(y_col_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome ' + 'variable ``y_col``.') + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not s_col_set.isdisjoint(z_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' + 'instrumental variable in ``z_cols``.') + if self.t_col is not None: + t_col_set = {self.t_col} + if not s_col_set.isdisjoint(t_col_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time ' + 'variable ``t_col``.') + + +class DoubleMLClusterData(DoubleMLData): + """Double machine learning data-backend for data with cluster variables. + + :class:`DoubleMLClusterData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. + + y_col : str + The outcome variable. + + d_cols : str or list + The treatment variable(s). + + cluster_cols : str or list + The cluster variable(s). + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. + + t_col : None or str + The time variable (only relevant/used for DiD Estimators). + Default is ``None``. + + s_col : None or str + The score or selection variable (only relevant/used for RDD and SSM Estimatiors). + Default is ``None``. 
+ + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + Examples + -------- + >>> from doubleml import DoubleMLClusterData + >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 + >>> # initialization from pandas.DataFrame + >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z') + >>> # initialization from np.ndarray + >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') + >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) + """ + def __init__(self, + data, + y_col, + d_cols, + cluster_cols, + x_cols=None, + z_cols=None, + t_col=None, + s_col=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True): + DoubleMLBaseData.__init__(self, data) + + # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter + self.cluster_cols = cluster_cols + self._set_cluster_vars() + DoubleMLData.__init__(self, + data, + y_col, + d_cols, + x_cols, + z_cols, + t_col, + s_col, + use_other_treat_as_covariate, + force_all_x_finite) + self._check_disjoint_sets_cluster_cols() + + def 
__str__(self): + data_summary = self._data_summary_str() + buf = io.StringIO() + self.data.info(verbose=False, buf=buf) + df_info = buf.getvalue() + res = '================== DoubleMLClusterData Object ==================\n' + \ + '\n------------------ Data summary ------------------\n' + data_summary + \ + '\n------------------ DataFrame info ------------------\n' + df_info + return res + + def _data_summary_str(self): + data_summary = f'Outcome variable: {self.y_col}\n' \ + f'Treatment variable(s): {self.d_cols}\n' \ + f'Cluster variable(s): {self.cluster_cols}\n' \ + f'Covariates: {self.x_cols}\n' \ + f'Instrument variable(s): {self.z_cols}\n' + if self.t_col is not None: + data_summary += f'Time variable: {self.t_col}\n' + if self.s_col is not None: + data_summary += f'Score/Selection variable: {self.s_col}\n' + + data_summary += f'No. Observations: {self.n_obs}\n' + return data_summary + + @classmethod + def from_arrays(cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, + force_all_x_finite=True): + """ + Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. + + d : :class:`numpy.ndarray` + Array of treatment variables. + + cluster_vars : :class:`numpy.ndarray` + Array of cluster variables. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + t : :class:`numpy.ndarray` + Array of the time variable (only relevant/used for DiD models). + Default is ``None``. + + s : :class:`numpy.ndarray` + Array of the score or selection variable (only relevant/used for RDD or SSM models). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. 
+ + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + Examples + -------- + >>> from doubleml import DoubleMLClusterData + >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 + >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') + >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) + """ + dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite) + cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) + cluster_vars = _assure_2d_array(cluster_vars) + if cluster_vars.shape[1] == 1: + cluster_cols = ['cluster_var'] + else: + cluster_cols = [f'cluster_var{i + 1}' for i in np.arange(cluster_vars.shape[1])] + + data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1) + + return (cls(data, dml_data.y_col, dml_data.d_cols, cluster_cols, + dml_data.x_cols, dml_data.z_cols, dml_data.t_col, dml_data.s_col, + dml_data.use_other_treat_as_covariate, dml_data.force_all_x_finite)) + + @property + def cluster_cols(self): + """ + The cluster variable(s). 
+ """ + return self._cluster_cols + + @cluster_cols.setter + def cluster_cols(self, value): + reset_value = hasattr(self, '_cluster_cols') + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The cluster variable(s) cluster_cols must be of str or list type. ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid cluster variable(s) cluster_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid cluster variable(s) cluster_cols. ' + 'At least one cluster variable is no data column.') + self._cluster_cols = value + if reset_value: + self._check_disjoint_sets() + self._set_cluster_vars() + + @property + def n_cluster_vars(self): + """ + The number of cluster variables. + """ + return len(self.cluster_cols) + + @property + def cluster_vars(self): + """ + Array of cluster variable(s). + """ + return self._cluster_vars.values + + @DoubleMLData.x_cols.setter + def x_cols(self, value): + if value is not None: + # this call might become much easier with https://github.com/python/cpython/pull/26194 + super(self.__class__, self.__class__).x_cols.__set__(self, value) + else: + if self.s_col is None: + if (self.z_cols is not None) & (self.t_col is not None): + y_d_z_t = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z_t] + elif self.z_cols is not None: + y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z] + elif self.t_col is not None: + y_d_t = set.union({self.y_col}, set(self.d_cols), {self.t_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_t] + else: + y_d = set.union({self.y_col}, set(self.d_cols), set(self.cluster_cols)) + x_cols = 
[col for col in self.data.columns if col not in y_d] + else: + if (self.z_cols is not None) & (self.t_col is not None): + y_d_z_t_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, {self.s_col}, + set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z_t_s] + elif self.z_cols is not None: + y_d_z_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.s_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z_s] + elif self.t_col is not None: + y_d_t_s = set.union({self.y_col}, set(self.d_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_t_s] + else: + y_d_s = set.union({self.y_col}, set(self.d_cols), {self.s_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_s] + # this call might become much easier with https://github.com/python/cpython/pull/26194 + super(self.__class__, self.__class__).x_cols.__set__(self, x_cols) + + def _check_disjoint_sets(self): + # apply the standard checks from the DoubleMLData class + super(DoubleMLClusterData, self)._check_disjoint_sets() + self._check_disjoint_sets_cluster_cols() + + def _check_disjoint_sets_cluster_cols(self): + # apply the standard checks from the DoubleMLData class + super(DoubleMLClusterData, self)._check_disjoint_sets() + + # special checks for the additional cluster variables + cluster_cols_set = set(self.cluster_cols) + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + t_col_set = {self.t_col} + s_col_set = {self.s_col} + + if not y_col_set.isdisjoint(cluster_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and cluster ' + 'variable in ``cluster_cols``.') + if not d_cols_set.isdisjoint(cluster_cols_set): + raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' + 
'cluster variable in ``cluster_cols``.') + # TODO: Is the following combination allowed, or not? + if not x_cols_set.isdisjoint(cluster_cols_set): + raise ValueError('At least one variable/column is set as covariate (``x_cols``) and cluster ' + 'variable in ``cluster_cols``.') + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not z_cols_set.isdisjoint(cluster_cols_set): + raise ValueError('At least one variable/column is set as instrumental variable (``z_cols``) and ' + 'cluster variable in ``cluster_cols``.') + if self.t_col is not None: + if not t_col_set.isdisjoint(cluster_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and ' + 'cluster variable in ``cluster_cols``.') + if self.s_col is not None: + if not s_col_set.isdisjoint(cluster_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' + 'cluster variable in ``cluster_cols``.') + + def _set_cluster_vars(self): + assert_all_finite(self.data.loc[:, self.cluster_cols]) + self._cluster_vars = self.data.loc[:, self.cluster_cols] diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py new file mode 100644 index 000000000..8915215bf --- /dev/null +++ b/doubleml/logistic/logistic.py @@ -0,0 +1,463 @@ +import numpy as np +from doubleml.utils._estimation import ( + _dml_cv_predict, + _trimm, + _predict_zero_one_propensity, + _cond_targets, + _get_bracket_guess, + _default_kde, + _normalize_ipw, + _dml_tune, + _solve_ipw_score, +) +from sklearn.base import clone +from sklearn.utils import check_X_y +import scipy +from sklearn.utils.multiclass import type_of_target + +from doubleml import DoubleMLData, DoubleMLBLP +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import NonLinearScoreMixin +from doubleml.utils import DoubleMLClusterResampling +from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from 
doubleml.utils.resampling import DoubleMLDoubleResampling + + +class DoubleMLLogit(NonLinearScoreMixin, DoubleML): + """Double machine learning for partially linear regression models + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLData` object + The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + + ml_r : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`\\ell_0(X) = E[Y|X]`. + + ml_m : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`m_0(X) = E[D|X]`. + For binary treatment variables :math:`D` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, + ``predict_proba()`` is used otherwise ``predict()``. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function + :math:`g_0(X) = E[Y - D \\theta_0|X]`. + Note: The learner `ml_g` is only required for the score ``'IV-type'``. Optionally, it can be specified and + estimated for callable scores. + + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitons for the sample splitting. + Default is ``1``. + + score : str or callable + A str (``'partialling out'`` or ``'IV-type'``) specifying the score function + or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. + Default is ``'partialling out'``. 
+ + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.base import clone + >>> np.random.seed(3141) + >>> learner = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_g = learner + >>> ml_m = learner + >>> obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20) + >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) + >>> dml_plr_obj.fit().summary + coef std err t P>|t| 2.5 % 97.5 % + d 0.462321 0.04107 11.256983 2.139582e-29 0.381826 0.542816 + + Notes + ----- + **Partially linear regression (PLR)** models take the form + + .. math:: + + Y = D \\theta_0 + g_0(X) + \\zeta, & &\\mathbb{E}(\\zeta | D,X) = 0, + + D = m_0(X) + V, & &\\mathbb{E}(V | X) = 0, + + where :math:`Y` is the outcome variable and :math:`D` is the policy variable of interest. + The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates, + and :math:`\\zeta` and :math:`V` are stochastic errors. 
+ """ + + def __init__(self, + obj_dml_data, + ml_r, + ml_m, + ml_M, + ml_t, + ml_a=None, + n_folds=5, + n_folds_inner=5, + n_rep=1, + score='logistic', + draw_sample_splitting=True): + super().__init__(obj_dml_data, + n_folds, + n_rep, + score, + draw_sample_splitting) + + self._check_data(self._dml_data) + valid_scores = ['logistic'] + _check_score(self.score, valid_scores, allow_callable=True) + + _ = self._check_learner(ml_r, 'ml_r', regressor=True, classifier=False) + _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) + _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) + ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=True) + self._learner = {'ml_l': ml_r, 'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} + + if ml_a is not None: + ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) + self._learner['ml_a'] = ml_a + else: + self._learner['ml_a'] = clone(ml_m) + ml_a_is_classifier = ml_m_is_classifier + + self._predict_method = {'ml_r': 'predict', 'ml_t': 'predict', 'ml_M': 'predict_proba'} + + if ml_m_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method['ml_m'] = 'predict_proba' + else: + raise ValueError(f'The ml_m learner {str(ml_m)} was identified as classifier ' + 'but at least one treatment variable is not binary with values 0 and 1.') + else: + self._predict_method['ml_m'] = 'predict' + + if ml_a_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method['ml_a'] = 'predict_proba' + else: + raise ValueError(f'The ml_a learner {str(ml_a)} was identified as classifier ' + 'but at least one treatment variable is not binary with values 0 and 1.') + else: + self._predict_method['ml_a'] = 'predict' + + self._initialize_ml_nuisance_params() + self._sensitivity_implemented = True + self._external_predictions_implemented = True + + def _initialize_ml_nuisance_params(self): + self._params = {learner: {key: [None] * 
self.n_rep for key in self._dml_data.d_cols} + for learner in self._learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLData): + raise TypeError('The data must be of DoubleMLData type. ' + f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') + return + + def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, + n_jobs=None, est_params=None, method='predict'): + res = {} + res['preds'] = np.zeros_like(y) + res['preds_inner'] = np.zeros_like(y) + for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, + est_params=est_params, method=method, + return_models=True) + _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) + + res['preds_inner'] += res_inner['preds'] + for model in res_inner['models']: + res['models'].append(model) + res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + + res["preds"] /= len(smpls) + res['targets'] = np.copy(y) + + + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, + force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, + force_all_finite=False) + x_d_concat = np.hstack([[d, np.newaxis], x]) + r_external = external_predictions['ml_r'] is not None + m_external = external_predictions['ml_m'] is not None + M_external = external_predictions['ml_M'] is not None + t_external = external_predictions['ml_t'] is not None + if 'ml_a' in self._learner: + a_external = external_predictions['ml_a'] is not None + else: + a_external = False + + # nuisance m + if m_external: + m_hat = {'preds': external_predictions['ml_m'], + 'targets': None, + 'models': None} + else: + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), 
method=self._predict_method['ml_m'], + return_models=return_models) + _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) + + if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): + _check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12) + + if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: + binary_preds = (type_of_target(m_hat['preds']) == 'binary') + zero_one_preds = np.all((np.power(m_hat['preds'], 2) - m_hat['preds']) == 0) + if binary_preds & zero_one_preds: + raise ValueError(f'For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, ' + f'predictions obtained with the ml_m learner {str(self._learner["ml_m"])} are also ' + 'observed to be binary with values 0 and 1. Make sure that for classifiers ' + 'probabilities and not labels are predicted.') + + + if M_external: + M_hat = {'preds': external_predictions['ml_M'], + 'targets': None, + 'models': None} + else: + M_hat = (self.double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + n_jobs=n_jobs_cv, + est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) + + if a_external: + a_hat = {'preds': external_predictions['ml_a'], + 'targets': None, + 'models': None} + else: + a_hat = (self.double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + n_jobs=n_jobs_cv, + est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) + + + W = scipy.special.logit(M_hat['preds']) + d_tilde_full = d - a_hat['preds'] + + beta_notFold = np.zeros_like(d) + + for _, test in smpls: + beta_notFold[test] = np.sum(d_tilde_full[test] * W[test]) / np.sum(d_tilde_full[test] ** 2) + + # nuisance t + if t_external: + t_hat = {'preds': external_predictions['ml_t'], + 'targets': None, + 'models': None} + else: + t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, 
n_jobs=n_jobs_cv, + est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], + return_models=return_models) + _check_finite_predictions(t_hat['preds'], self._learner['ml_l'], 'ml_l', smpls) + + W = scipy.special.expit(M_hat['preds']) + + # nuisance W + if t_external: + t_hat = {'preds': external_predictions['ml_t'], + 'targets': None, + 'models': None} + else: + t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], + return_models=return_models) + _check_finite_predictions(t_hat['preds'], self._learner['ml_t'], 'ml_t', smpls) + + r_hat = {} + r_hat['preds'] = t_hat['preds'] - beta_notFold * a_hat['preds'] + + + psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) + + preds = {'predictions': {'ml_r': r_hat['preds'], + 'ml_m': m_hat['preds'], + 'ml_a': a_hat['preds'], + 'ml_t': t_hat['preds'], + 'ml_M': M_hat['preds']}, + 'targets': {'ml_r': r_hat['targets'], + 'ml_m': m_hat['targets'], + 'ml_a': a_hat['targets'], + 'ml_t': t_hat['targets'], + 'ml_M': M_hat['targets']}, + 'models': {'ml_r': None, + 'ml_m': m_hat['models'], + 'ml_a': a_hat['models'], + 'ml_t': t_hat['models'], + 'ml_M': M_hat['models']}} + + return psi_elements, preds + + def _score_elements(self, y, d, r_hat, m_hat): + # compute residual + d_tilde = d - m_hat + psi_hat = scipy.special.expit(-r) + score_const = d_tilde * (1 - y) * np.exp(r) + psi_elements = {"y": y, "d": d, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} + + return psi_elements + + def _sensitivity_element_est(self, preds): + pass + + def _nuisance_tuning(self): + pass + + @property + def __smpls__inner(self): + return self._smpls[self._i_rep] + + def draw_sample_splitting(self): + """ + Draw sample splitting for DoubleML models. + + The samples are drawn according to the attributes + ``n_folds`` and ``n_rep``. 
+ + Returns + ------- + self : object + """ + + obj_dml_resampling = DoubleMLDoubleResampling(n_folds=self.n_folds, + n_folds_inner=self.n_folds_inner, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._strata) + self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() + + return self + + def set_sample_splitting(self): + raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLogit.') + + def _compute_score(self, psi_elements, coef): + + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d_tilde"] + + + return psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + + def _compute_score_deriv(self, psi_elements, coef, inds=None): + deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d"] + + return psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + + + def cate(self, basis, is_gate=False): + """ + Calculate conditional average treatment effects (CATE) for a given basis. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, + where ``n_obs`` is the number of observations and ``d`` is the number of predictors. + is_gate : bool + Indicates whether the basis is constructed for GATEs (dummy-basis). + Default is ``False``. + + Returns + ------- + model : :class:`doubleML.DoubleMLBLP` + Best linear Predictor model. + """ + if self._dml_data.n_treat > 1: + raise NotImplementedError('Only implemented for single treatment. ' + + f'Number of treatments is {str(self._dml_data.n_treat)}.') + if self.n_rep != 1: + raise NotImplementedError('Only implemented for one repetition. 
' +
+                                      f'Number of repetitions is {str(self.n_rep)}.')
+
+        Y_tilde, D_tilde = self._partial_out()
+
+        D_basis = basis * D_tilde
+        model = DoubleMLBLP(
+            orth_signal=Y_tilde.reshape(-1),
+            basis=D_basis,
+            is_gate=is_gate,
+        )
+        model.fit()
+
+        ## TODO: Solve score
+
+
+        return model
+
+    def gate(self, groups):
+        """
+        Calculate group average treatment effects (GATE) for groups.
+
+        Parameters
+        ----------
+        groups : :class:`pandas.DataFrame`
+            The group indicator for estimating the best linear predictor. Groups should be mutually exclusive.
+            Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations
+            and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str).
+
+        Returns
+        -------
+        model : :class:`doubleML.DoubleMLBLP`
+            Best linear Predictor model for Group Effects.
+        """
+
+        if not isinstance(groups, pd.DataFrame):
+            raise TypeError('Groups must be of DataFrame type. '
+                            f'Groups of type {str(type(groups))} was passed.')
+        if not all(groups.dtypes == bool) or all(groups.dtypes == int):
+            if groups.shape[1] == 1:
+                groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_')
+            else:
+                raise TypeError('Columns of groups must be of bool type or int type (dummy coded). '
+                                'Alternatively, groups should only contain one column.')
+
+        if any(groups.sum(0) <= 5):
+            warnings.warn('At least one group effect is estimated with less than 6 observations.')
+
+        model = self.cate(groups, is_gate=True)
+        return model
+
+    def _partial_out(self):
+        """
+        Helper function. Returns the partialled out quantities of Y and D.
+        Works with multiple repetitions.
+
+        Returns
+        -------
+        Y_tilde : :class:`numpy.ndarray`
+            The residual of the regression of Y on X.
+        D_tilde : :class:`numpy.ndarray`
+            The residual of the regression of D on X.
+        """
+        if self.predictions is None:
+            raise ValueError('predictions are None. 
Call .fit(store_predictions=True) to store the predictions.') + + y = self._dml_data.y.reshape(-1, 1) + d = self._dml_data.d.reshape(-1, 1) + ml_m = self.predictions["ml_m"].squeeze(axis=2) + + if self.score == "partialling out": + ml_l = self.predictions["ml_l"].squeeze(axis=2) + Y_tilde = y - ml_l + D_tilde = d - ml_m + else: + assert self.score == "IV-type" + ml_g = self.predictions["ml_g"].squeeze(axis=2) + Y_tilde = y - (self.coef * ml_m) - ml_g + D_tilde = d - ml_m + + return Y_tilde, D_tilde \ No newline at end of file diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index 188d2f248..18153944c 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -25,6 +25,51 @@ def split_samples(self): return smpls +class DoubleMLDoubleResampling: + def __init__(self, + n_folds, + n_folds_inner, + n_rep, + n_obs, + stratify=None): + self.n_folds = n_folds + self.n_rep = n_rep + self.n_obs = n_obs + self.stratify = stratify + + if n_folds < 2: + raise ValueError('n_folds must be greater than 1. ' + 'You can use set_sample_splitting with a tuple to only use one fold.') + if n_folds_inner < 2: + raise ValueError('n_folds_inner must be greater than 1. 
' + 'You can use set_sample_splitting with a tuple to only use one fold.') + + + if self.stratify is None: + self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep) + self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner) + else: + self.resampling = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_rep) + self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner) + + def split_samples(self): + all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] + smpls = [all_smpls[(i_repeat * self.n_folds):((i_repeat + 1) * self.n_folds)] + for i_repeat in range(self.n_rep)] + smpls_inner = [] + for _ in range(self.n_rep): + smpls_inner_rep = [] + for _, test in all_smpls: + if self.stratify is None: + smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in self.resampling_inner.split(X=test)]) + else: + smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in + self.resampling_inner.split(X=np.zeros(len(test)), y=self.stratify[test])]) + smpls_inner.append(smpls_inner_rep) + + return smpls, smpls_inner + + class DoubleMLClusterResampling: def __init__(self, n_folds, n_rep, n_obs, n_cluster_vars, cluster_vars): self.n_folds = n_folds From f5521f142de7ad22e754c7d1d8d7c5f4c18ffa3b Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Jan 2025 10:11:36 +0100 Subject: [PATCH 02/48] First WIP of implementation --- doubleml/double_ml_data.py | 4 +--- doubleml/logistic/logistic.py | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index 4f8d7cbc7..fdee739dd 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -413,9 +413,7 @@ def x_cols(self, value): if not isinstance(value, list): raise TypeError('The covariates x_cols must be of str or list type (or None). 
' f'{str(value)} of type {str(type(value))} was passed.') - if not len(se - - t(value)) == len(value): + if not len(set(value)) == len(value): raise ValueError('Invalid covariates x_cols: ' 'Contains duplicate values.') if not set(value).issubset(set(self.all_variables)): diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index 8915215bf..26c14a80d 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -1,5 +1,5 @@ import numpy as np -from doubleml.utils._estimation import ( +from ..utils._estimation import ( _dml_cv_predict, _trimm, _predict_zero_one_propensity, @@ -15,12 +15,15 @@ import scipy from sklearn.utils.multiclass import type_of_target -from doubleml import DoubleMLData, DoubleMLBLP -from doubleml.double_ml import DoubleML -from doubleml.double_ml_score_mixins import NonLinearScoreMixin -from doubleml.utils import DoubleMLClusterResampling -from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity -from doubleml.utils.resampling import DoubleMLDoubleResampling +from .. import DoubleMLData +from ..double_ml import DoubleML +from ..double_ml_score_mixins import NonLinearScoreMixin +from ..utils import DoubleMLClusterResampling +from ..utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from ..utils.resampling import DoubleMLDoubleResampling + + + class DoubleMLLogit(NonLinearScoreMixin, DoubleML): @@ -110,6 +113,7 @@ def __init__(self, n_rep=1, score='logistic', draw_sample_splitting=True): + self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, n_folds, n_rep, @@ -165,6 +169,8 @@ def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): raise TypeError('The data must be of DoubleMLData type. 
' f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + raise TypeError('The outcome variable y must be binary with values 0 and 1.') return def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, @@ -311,6 +317,10 @@ def _score_elements(self, y, d, r_hat, m_hat): return psi_elements + @property + def _score_element_names(self): + return ['y', 'd', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] + def _sensitivity_element_est(self, preds): pass From bfa756c58797a36943741c2a5d03a9ae57e4e82a Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 21 Feb 2025 15:07:11 +0100 Subject: [PATCH 03/48] Working implementation. Started on test set-up. --- doubleml/logistic/logistic.py | 189 +++++++--- .../logistic/tests/_utils_logistic_manual.py | 346 +++++++++++++++++ doubleml/logistic/tests/tests_logistic.py | 352 ++++++++++++++++++ doubleml/utils/_estimation.py | 6 +- doubleml/utils/resampling.py | 13 +- 5 files changed, 853 insertions(+), 53 deletions(-) create mode 100644 doubleml/logistic/tests/_utils_logistic_manual.py create mode 100644 doubleml/logistic/tests/tests_logistic.py diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index 26c14a80d..25ba37634 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -103,7 +103,6 @@ class DoubleMLLogit(NonLinearScoreMixin, DoubleML): def __init__(self, obj_dml_data, - ml_r, ml_m, ml_M, ml_t, @@ -119,16 +118,17 @@ def __init__(self, n_rep, score, draw_sample_splitting) + self._coef_bounds = (-1e-2, 1e2) + self._coef_start_val = 1.0 self._check_data(self._dml_data) valid_scores = ['logistic'] _check_score(self.score, valid_scores, allow_callable=True) - _ = self._check_learner(ml_r, 'ml_r', regressor=True, classifier=False) _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) 
ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=True) - self._learner = {'ml_l': ml_r, 'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} + self._learner = {'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) @@ -137,7 +137,7 @@ def __init__(self, self._learner['ml_a'] = clone(ml_m) ml_a_is_classifier = ml_m_is_classifier - self._predict_method = {'ml_r': 'predict', 'ml_t': 'predict', 'ml_M': 'predict_proba'} + self._predict_method = {'ml_t': 'predict', 'ml_M': 'predict_proba'} if ml_m_is_classifier: if self._dml_data.binary_treats.all(): @@ -158,7 +158,6 @@ def __init__(self, self._predict_method['ml_a'] = 'predict' self._initialize_ml_nuisance_params() - self._sensitivity_implemented = True self._external_predictions_implemented = True def _initialize_ml_nuisance_params(self): @@ -173,34 +172,40 @@ def _check_data(self, obj_dml_data): raise TypeError('The outcome variable y must be binary with values 0 and 1.') return + def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, n_jobs=None, est_params=None, method='predict'): res = {} res['preds'] = np.zeros_like(y) - res['preds_inner'] = np.zeros_like(y) + res['preds_inner'] = [] + res['models'] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, est_params=est_params, method=method, - return_models=True) + return_models=True, smpls_is_partition=True) _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) - res['preds_inner'] += res_inner['preds'] + res['preds_inner'].append(res_inner['preds']) for model in res_inner['models']: res['models'].append(model) - res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) - + if method == 'predict_proba': + res['preds'][smpls_single_split[1]] += 
model.predict_proba(x[smpls_single_split[1]])[:, 1] + else: + res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + res["preds_inner"] res["preds"] /= len(smpls) res['targets'] = np.copy(y) + return res def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + # TODO: How to deal with smpls_inner? x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) - x_d_concat = np.hstack([[d, np.newaxis], x]) - r_external = external_predictions['ml_r'] is not None + x_d_concat = np.hstack((d.reshape(-1,1), x)) m_external = external_predictions['ml_m'] is not None M_external = external_predictions['ml_M'] is not None t_external = external_predictions['ml_t'] is not None @@ -215,7 +220,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + filtered_smpls.append((train_filtered, test)) + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models) _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) @@ -238,7 +247,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - M_hat = (self.double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, n_jobs=n_jobs_cv, est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) @@ -247,18 +256,49 @@ def _nuisance_est(self, smpls, n_jobs_cv, 
external_predictions, return_models=Fa 'targets': None, 'models': None} else: - a_hat = (self.double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + a_hat = (self._double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x, d, smpls=smpls, smpls_inner=self.__smpls__inner, n_jobs=n_jobs_cv, est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) + # r_legacy = np.zeros_like(y) + # smpls_inner = self.__smpls__inner + # M_hat = {} + # a_hat = {} + # M_hat['preds_inner'] = [] + # M_hat['preds'] = np.full_like(y, np.nan) + # a_hat['preds_inner'] = [] + # a_hat['preds'] = np.full_like(y, np.nan) + # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + # test = smpls_single_split[1] + # train = smpls_single_split[0] + # # r_legacy[test] = + # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], + # self._learner['ml_m'], self._learner['ml_M'], + # smpls_single_split, smpls_double_split, y, x, d, + # x_d_concat, n_jobs_cv) + # Mtemp = np.full_like(y, np.nan) + # Mtemp[train] = Mleg + # Atemp = np.full_like(y, np.nan) + # Atemp[train] = aleg + # M_hat['preds_inner'].append(Mtemp) + # a_hat['preds_inner'].append(Atemp) + # a_hat['preds'][test] = a_nf_leg + # + # #r_hat['preds'] = r_legacy + + + + W_inner = [] + beta = np.zeros_like(d) + + for i, (train, test) in enumerate(smpls): + M_iteration = M_hat['preds_inner'][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + d_tilde = (d - a_hat['preds_inner'][i])[train] + beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde ** 2) - W = scipy.special.logit(M_hat['preds']) - d_tilde_full = d - a_hat['preds'] - - beta_notFold = np.zeros_like(d) - - for _, test in smpls: - beta_notFold[test] = np.sum(d_tilde_full[test] * W[test]) / np.sum(d_tilde_full[test] ** 2) # nuisance t if t_external: @@ -266,26 +306,17 @@ def _nuisance_est(self, 
smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], - return_models=return_models) - _check_finite_predictions(t_hat['preds'], self._learner['ml_l'], 'ml_l', smpls) - - W = scipy.special.expit(M_hat['preds']) - - # nuisance W - if t_external: - t_hat = {'preds': external_predictions['ml_t'], - 'targets': None, - 'models': None} - else: - t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, + t_hat = _dml_cv_predict(self._learner['ml_t'], x, W_inner, smpls=smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], return_models=return_models) _check_finite_predictions(t_hat['preds'], self._learner['ml_t'], 'ml_t', smpls) + r_hat = {} - r_hat['preds'] = t_hat['preds'] - beta_notFold * a_hat['preds'] + r_hat['preds'] = t_hat['preds'] - beta * a_hat['preds'] + + + psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) @@ -295,7 +326,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'ml_a': a_hat['preds'], 'ml_t': t_hat['preds'], 'ml_M': M_hat['preds']}, - 'targets': {'ml_r': r_hat['targets'], + 'targets': {'ml_r': None, 'ml_m': m_hat['targets'], 'ml_a': a_hat['targets'], 'ml_t': t_hat['targets'], @@ -308,18 +339,86 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds + + def legacy_implementation(self, Yfold: np.ndarray, Xfold: np.ndarray, Afold: np.ndarray, XnotFold: np.ndarray, AnotFold: np.ndarray, + learner, learnerClassifier, smpls_single_split, smpls_double_split, yfull, xfull, afull, x_d_concat, n_jobs_cv, noFolds: int = 5, seed=None, )-> (np.ndarray, np.ndarray, np.ndarray): + + def learn_predict(X, Y, Xpredict, learner, learnerClassifier, fit_args={}): + results = [] + if 
len(np.unique(Y)) == 2: + learnerClassifier.fit(X, Y, **fit_args) + for x in Xpredict: + results.append(learnerClassifier.predict_proba(x)[:, 1]) + else: + learner.fit(X, Y, **fit_args) + for x in Xpredict: + results.append(learner.predict(x)) + return (*results,) + + nFold = len(Yfold) + i = np.remainder(np.arange(nFold), noFolds) + np.random.default_rng(seed).shuffle(i) + + M = np.zeros((nFold)) + a_hat = np.zeros((nFold)) + a_hat_notFold = np.zeros((len(XnotFold))) + M_notFold = np.zeros((len(XnotFold))) + loss = {} + + a_hat_inner = _dml_cv_predict(self._learner['ml_a'], xfull, afull, smpls=smpls_double_split, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'], + return_models=True, smpls_is_partition=True) + _check_finite_predictions(a_hat_inner['preds'], self._learner['ml_a'], 'ml_a', smpls_double_split) + a_hat_notFold = np.full_like(yfull, 0.) + for model in a_hat_inner['models']: + if self._predict_method['ml_a'] == 'predict_proba': + a_hat_notFold[smpls_single_split[1]] += model.predict_proba(xfull[smpls_single_split[1]])[:, 1] + else: + a_hat_notFold[smpls_single_split[1]] += model.predict(xfull[smpls_single_split[1]]) + + M_hat = _dml_cv_predict(self._learner['ml_M'], x_d_concat, yfull, smpls=smpls_double_split, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'], + return_models=True, smpls_is_partition=True) + _check_finite_predictions(M_hat['preds'], self._learner['ml_M'], 'ml_M', smpls_double_split) + + M = M_hat['preds'][~np.isnan(M_hat['preds'])] + a_hat = a_hat_inner['preds'][~np.isnan(a_hat_inner['preds'])] + a_hat_notFold = a_hat_notFold[smpls_single_split[1]] + + np.clip(M, 1e-8, 1 - 1e-8, out=M) +# loss["M"] = compute_loss(Yfold, M) +# loss["a_hat"] = compute_loss(Afold, a_hat) + a_hat_notFold /= noFolds + # M_notFold /= noFolds + np.clip(M_notFold, 1e-8, 1 - 1e-8, out=M_notFold) + + # Obtain preliminary estimate of beta based on M and residual of a + W = 
scipy.special.logit(M) + A_resid = Afold - a_hat + beta_notFold = sum(A_resid * W) / sum(A_resid ** 2) + # print(beta_notFold) + t_notFold, = learn_predict(Xfold, W, [XnotFold], learner, learnerClassifier) + W_notFold = scipy.special.expit(M_notFold) +# loss["t"] = compute_loss(W_notFold, t_notFold) + + + # Compute r based on estimates for W=logit(M), beta and residual of A + r_notFold = t_notFold - beta_notFold * a_hat_notFold + + return M, a_hat, a_hat_notFold #r_notFold #, a_hat_notFold, M_notFold, t_notFold + def _score_elements(self, y, d, r_hat, m_hat): # compute residual d_tilde = d - m_hat - psi_hat = scipy.special.expit(-r) - score_const = d_tilde * (1 - y) * np.exp(r) - psi_elements = {"y": y, "d": d, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} + psi_hat = scipy.special.expit(-r_hat) + score_const = d_tilde * (1 - y) * np.exp(r_hat) + psi_elements = {"y": y, "d": d, "d_tilde": d_tilde, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} return psi_elements @property def _score_element_names(self): - return ['y', 'd', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] + return ['y', 'd', 'd_tilde', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] def _sensitivity_element_est(self, preds): pass @@ -329,7 +428,7 @@ def _nuisance_tuning(self): @property def __smpls__inner(self): - return self._smpls[self._i_rep] + return self._smpls_inner[self._i_rep] def draw_sample_splitting(self): """ @@ -357,13 +456,13 @@ def set_sample_splitting(self): def _compute_score(self, psi_elements, coef): - score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d_tilde"] + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] return psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) def _compute_score_deriv(self, psi_elements, coef, inds=None): - deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d"] + deriv_1 = - 
psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] return psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 diff --git a/doubleml/logistic/tests/_utils_logistic_manual.py b/doubleml/logistic/tests/_utils_logistic_manual.py new file mode 100644 index 000000000..ae53992a6 --- /dev/null +++ b/doubleml/logistic/tests/_utils_logistic_manual.py @@ -0,0 +1,346 @@ +import numpy as np +import scipy +from sklearn.base import clone, is_classifier + +from ...tests._utils_boot import boot_manual, draw_weights +from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search + + +def fit_logistic_multitreat(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, + n_rep=1, l_params=None, m_params=None, g_params=None, + use_other_treat_as_covariate=True): + n_obs = len(y) + n_d = d.shape[1] + + thetas = list() + ses = list() + all_l_hat = list() + all_m_hat = list() + all_g_hat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + thetas_this_rep = np.full(n_d, np.nan) + ses_this_rep = np.full(n_d, np.nan) + all_l_hat_this_rep = list() + all_m_hat_this_rep = list() + all_g_hat_this_rep = list() + + for i_d in range(n_d): + if use_other_treat_as_covariate: + xd = np.hstack((x, np.delete(d, i_d, axis=1))) + else: + xd = x + + l_hat, m_hat, g_hat, thetas_this_rep[i_d], ses_this_rep[i_d] = fit_plr_single_split( + y, xd, d[:, i_d], + learner_l, learner_m, learner_g, + smpls, score, + l_params, m_params, g_params) + all_l_hat_this_rep.append(l_hat) + all_m_hat_this_rep.append(m_hat) + all_g_hat_this_rep.append(g_hat) + + thetas.append(thetas_this_rep) + ses.append(ses_this_rep) + all_l_hat.append(all_l_hat_this_rep) + all_m_hat.append(all_m_hat_this_rep) + all_g_hat.append(all_g_hat_this_rep) + + theta = np.full(n_d, np.nan) + se = np.full(n_d, np.nan) + for i_d in range(n_d): + theta_vec = np.array([xx[i_d] for xx in thetas]) + se_vec = np.array([xx[i_d] for xx in ses]) + theta[i_d] = np.median(theta_vec) + se[i_d] = 
np.sqrt(np.median(np.power(se_vec, 2) * n_obs + np.power(theta_vec - theta[i_d], 2)) / n_obs) + + res = {'theta': theta, 'se': se, + 'thetas': thetas, 'ses': ses, + 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} + + return res + + +def fit_logistic(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, + n_rep=1, l_params=None, m_params=None, g_params=None): + n_obs = len(y) + + thetas = np.zeros(n_rep) + ses = np.zeros(n_rep) + all_l_hat = list() + all_m_hat = list() + all_g_hat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + l_hat, m_hat, g_hat, thetas[i_rep], ses[i_rep] = fit_plr_single_split( + y, x, d, + learner_l, learner_m, learner_g, + smpls, score, + l_params, m_params, g_params) + all_l_hat.append(l_hat) + all_m_hat.append(m_hat) + all_g_hat.append(g_hat) + + theta = np.median(thetas) + se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) + + res = {'theta': theta, 'se': se, + 'thetas': thetas, 'ses': ses, + 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} + + return res + + +def fit_plr_logistic_split(y, x, d, learner_l, learner_m, learner_g, smpls, score, + l_params=None, m_params=None, g_params=None): + fit_g = (score == 'IV-type') | callable(score) + if is_classifier(learner_m): + l_hat, m_hat, g_hat = fit_nuisance_plr_classifier(y, x, d, + learner_l, learner_m, learner_g, + smpls, fit_g, + l_params, m_params, g_params) + else: + l_hat, m_hat, g_hat = fit_nuisance_plr(y, x, d, + learner_l, learner_m, learner_g, + smpls, fit_g, + l_params, m_params, g_params) + + theta, se = plr_dml2(y, x, d, l_hat, m_hat, g_hat, + smpls, score) + + return l_hat, m_hat, g_hat, theta, se + + +def fit_nuisance_logistic(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, + l_params=None, m_params=None, g_params=None): + ml_l = clone(learner_l) + l_hat = fit_predict(y, x, ml_l, l_params, smpls) + + ml_m = clone(learner_m) + m_hat = fit_predict(d, x, ml_m, 
m_params, smpls) + + if fit_g: + y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) + psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) + psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + + ml_g = clone(learner_g) + g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) + else: + g_hat = [] + + return l_hat, m_hat, g_hat + + +def fit_nuisance_logistic_classifier(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, + l_params=None, m_params=None, g_params=None): + ml_l = clone(learner_l) + l_hat = fit_predict(y, x, ml_l, l_params, smpls) + + ml_m = clone(learner_m) + m_hat = fit_predict_proba(d, x, ml_m, m_params, smpls) + + if fit_g: + y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) + psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) + psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) + theta_initial = -np.mean(psi_b) / np.mean(psi_a) + + ml_g = clone(learner_g) + g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) + else: + g_hat = [] + + return l_hat, m_hat, g_hat + + +def tune_nuisance_plr(y, x, d, ml_l, ml_m, ml_g, smpls, n_folds_tune, param_grid_l, param_grid_m, param_grid_g, tune_g=True): + l_tune_res = tune_grid_search(y, x, ml_l, smpls, param_grid_l, n_folds_tune) + + m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) + + if tune_g: + l_hat = np.full_like(y, np.nan) + m_hat = np.full_like(d, np.nan) + for idx, (train_index, _) in enumerate(smpls): + l_hat[train_index] = l_tune_res[idx].predict(x[train_index, :]) + m_hat[train_index] = m_tune_res[idx].predict(x[train_index, :]) + psi_a = -np.multiply(d - m_hat, d - m_hat) + psi_b = np.multiply(d - m_hat, y - l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + + g_tune_res = tune_grid_search(y - theta_initial*d, x, ml_g, smpls, param_grid_g, n_folds_tune) + g_best_params = [xx.best_params_ for xx in 
g_tune_res] + else: + g_best_params = [] + + l_best_params = [xx.best_params_ for xx in l_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + return l_best_params, m_best_params, g_best_params + + +def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): + y_minus_l_hat = np.full_like(y, np.nan, dtype='float64') + d_minus_m_hat = np.full_like(d, np.nan, dtype='float64') + y_minus_g_hat = np.full_like(y, np.nan, dtype='float64') + for idx, (_, test_index) in enumerate(smpls): + y_minus_l_hat[test_index] = y[test_index] - l_hat[idx] + if len(g_hat) > 0: + y_minus_g_hat[test_index] = y[test_index] - g_hat[idx] + d_minus_m_hat[test_index] = d[test_index] - m_hat[idx] + return y_minus_l_hat, d_minus_m_hat, y_minus_g_hat + + +def plr_dml2(y, x, d, l_hat, m_hat, g_hat, smpls, score): + n_obs = len(y) + y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) + theta_hat = plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score) + se = np.sqrt(var_plr(theta_hat, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs)) + + return theta_hat, se + + +def var_plr(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs): + if score == 'partialling out': + var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d_minus_m_hat)), 2) * \ + np.mean(np.power(np.multiply(y_minus_l_hat - d_minus_m_hat*theta, d_minus_m_hat), 2)) + else: + assert score == 'IV-type' + var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d)), 2) * \ + np.mean(np.power(np.multiply(y_minus_g_hat - d*theta, d_minus_m_hat), 2)) + + return var + + +def plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score): + if score == 'IV-type': + res = np.mean(np.multiply(d_minus_m_hat, y_minus_g_hat))/np.mean(np.multiply(d_minus_m_hat, d)) + else: + assert score == 'partialling out' + res = scipy.linalg.lstsq(d_minus_m_hat.reshape(-1, 1), y_minus_l_hat)[0] + + return res + + +def boot_plr(y, d, thetas, ses, 
all_l_hat, all_m_hat, all_g_hat, + all_smpls, score, bootstrap, n_rep_boot, + n_rep=1, apply_cross_fitting=True): + all_boot_t_stat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + if apply_cross_fitting: + n_obs = len(y) + else: + test_index = smpls[0][1] + n_obs = len(test_index) + weights = draw_weights(bootstrap, n_rep_boot, n_obs) + + boot_t_stat = boot_plr_single_split( + thetas[i_rep], y, d, all_l_hat[i_rep], all_m_hat[i_rep], all_g_hat[i_rep], smpls, + score, ses[i_rep], + weights, n_rep_boot, apply_cross_fitting) + all_boot_t_stat.append(boot_t_stat) + + # differently for plr because of n_rep_boot and multiple treatmentsa + boot_t_stat = np.transpose(np.vstack(all_boot_t_stat)) + + return boot_t_stat + + +def boot_plr_multitreat(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, + all_smpls, score, bootstrap, n_rep_boot, + n_rep=1, apply_cross_fitting=True): + n_d = d.shape[1] + all_boot_t_stat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + if apply_cross_fitting: + n_obs = len(y) + else: + test_index = smpls[0][1] + n_obs = len(test_index) + weights = draw_weights(bootstrap, n_rep_boot, n_obs) + + boot_t_stat = np.full((n_d, n_rep_boot), np.nan) + for i_d in range(n_d): + boot_t_stat[i_d, :] = boot_plr_single_split( + thetas[i_rep][i_d], y, d[:, i_d], + all_l_hat[i_rep][i_d], all_m_hat[i_rep][i_d], all_g_hat[i_rep][i_d], + smpls, score, ses[i_rep][i_d], + weights, n_rep_boot, apply_cross_fitting) + + # transpose for shape (n_rep_boot, n_d) + boot_t_stat = np.transpose(boot_t_stat) + all_boot_t_stat.append(boot_t_stat) + + # stack repetitions along the last axis + boot_t_stat = np.stack(all_boot_t_stat, axis=2) + + return boot_t_stat + + +def boot_plr_single_split(theta, y, d, l_hat, m_hat, g_hat, + smpls, score, se, weights, n_rep, apply_cross_fitting): + y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) + + if apply_cross_fitting: + if score == 'partialling out': + 
J = np.mean(-np.multiply(d_minus_m_hat, d_minus_m_hat)) + else: + assert score == 'IV-type' + J = np.mean(-np.multiply(d_minus_m_hat, d)) + else: + test_index = smpls[0][1] + if score == 'partialling out': + J = np.mean(-np.multiply(d_minus_m_hat[test_index], d_minus_m_hat[test_index])) + else: + assert score == 'IV-type' + J = np.mean(-np.multiply(d_minus_m_hat[test_index], d[test_index])) + + if score == 'partialling out': + psi = np.multiply(y_minus_l_hat - d_minus_m_hat * theta, d_minus_m_hat) + else: + assert score == 'IV-type' + psi = np.multiply(y_minus_g_hat - d * theta, d_minus_m_hat) + + boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep, apply_cross_fitting) + + return boot_t_stat + + +def fit_sensitivity_elements_plr(y, d, all_coef, predictions, score, n_rep): + n_treat = d.shape[1] + n_obs = len(y) + + sigma2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) + nu2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) + psi_sigma2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) + psi_nu2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) + + for i_rep in range(n_rep): + for i_treat in range(n_treat): + d_tilde = d[:, i_treat] + m_hat = predictions['ml_m'][:, i_rep, i_treat] + theta = all_coef[i_treat, i_rep] + if score == 'partialling out': + l_hat = predictions['ml_l'][:, i_rep, i_treat] + sigma2_score_element = np.square(y - l_hat - np.multiply(theta, d_tilde-m_hat)) + else: + assert score == 'IV-type' + g_hat = predictions['ml_g'][:, i_rep, i_treat] + sigma2_score_element = np.square(y - g_hat - np.multiply(theta, d_tilde)) + + sigma2[0, i_rep, i_treat] = np.mean(sigma2_score_element) + psi_sigma2[:, i_rep, i_treat] = sigma2_score_element - sigma2[0, i_rep, i_treat] + + nu2[0, i_rep, i_treat] = np.divide(1.0, np.mean(np.square(d_tilde-m_hat))) + psi_nu2[:, i_rep, i_treat] = nu2[0, i_rep, i_treat] - \ + np.multiply(np.square(d_tilde-m_hat), np.square(nu2[0, i_rep, i_treat])) + + element_dict = {'sigma2': sigma2, + 
'nu2': nu2, + 'psi_sigma2': psi_sigma2, + 'psi_nu2': psi_nu2} + return element_dict diff --git a/doubleml/logistic/tests/tests_logistic.py b/doubleml/logistic/tests/tests_logistic.py new file mode 100644 index 000000000..2b97bf76b --- /dev/null +++ b/doubleml/logistic/tests/tests_logistic.py @@ -0,0 +1,352 @@ +import pytest +import math +import scipy +import numpy as np +import pandas as pd + +from sklearn.base import clone + +from sklearn.linear_model import LinearRegression, Lasso +from sklearn.ensemble import RandomForestRegressor + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_logistic_manual import fit_logistic, , boot_plr + + +@pytest.fixture(scope='module', + params=[RandomForestRegressor(max_depth=2, n_estimators=10), + LinearRegression(), + Lasso(alpha=0.1)]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=['IV-type', 'partialling out']) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_plr_fixture(generate_data1, learner, score): + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 502 + + # collect data + data = generate_data1 + x_cols = data.columns[data.columns.str.startswith('X')].tolist() + + # Set machine learning methods for m & g + ml_l = clone(learner) + ml_m = clone(learner) + ml_g = clone(learner) + + np.random.seed(3141) + obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) + if score == 'partialling out': + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, + n_folds=n_folds, + score=score) + else: + assert score == 'IV-type' + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, ml_g, + n_folds, + score=score) + + dml_plr_obj.fit() + + np.random.seed(3141) + y = data['y'].values + x = data.loc[:, x_cols].values + d = data['d'].values + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds) + + res_manual = fit_plr(y, x, d, clone(learner), clone(learner), clone(learner), + all_smpls, score) + + 
np.random.seed(3141) + # test with external nuisance predictions + if score == 'partialling out': + dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, + n_folds, + score=score) + else: + assert score == 'IV-type' + dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, ml_g, + n_folds, + score=score) + + # synchronize the sample splitting + dml_plr_obj_ext.set_sample_splitting(all_smpls=all_smpls) + + if score == 'partialling out': + prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), + 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1)}} + else: + assert score == 'IV-type' + prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), + 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1), + 'ml_g': dml_plr_obj.predictions['ml_g'].reshape(-1, 1)}} + + dml_plr_obj_ext.fit(external_predictions=prediction_dict) + + res_dict = {'coef': dml_plr_obj.coef, + 'coef_manual': res_manual['theta'], + 'coef_ext': dml_plr_obj_ext.coef, + 'se': dml_plr_obj.se, + 'se_manual': res_manual['se'], + 'se_ext': dml_plr_obj_ext.se, + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'], + res_manual['all_l_hat'], res_manual['all_m_hat'], res_manual['all_g_hat'], + all_smpls, score, bootstrap, n_rep_boot) + + np.random.seed(3141) + dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_plr_obj_ext.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + res_dict['boot_t_stat' + bootstrap + '_ext'] = dml_plr_obj_ext.boot_t_stat + + # sensitivity tests + res_dict['sensitivity_elements'] = dml_plr_obj.sensitivity_elements + res_dict['sensitivity_elements_manual'] = fit_sensitivity_elements_plr(y, d.reshape(-1, 1), + 
all_coef=dml_plr_obj.all_coef, + predictions=dml_plr_obj.predictions, + score=score, + n_rep=1) + # check if sensitivity score with rho=0 gives equal asymptotic standard deviation + dml_plr_obj.sensitivity_analysis(rho=0.0) + res_dict['sensitivity_ses'] = dml_plr_obj.sensitivity_params['se'] + return res_dict + + +@pytest.mark.ci +def test_dml_plr_coef(dml_plr_fixture): + assert math.isclose(dml_plr_fixture['coef'], + dml_plr_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_plr_fixture['coef'], + dml_plr_fixture['coef_ext'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_se(dml_plr_fixture): + assert math.isclose(dml_plr_fixture['se'], + dml_plr_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_plr_fixture['se'], + dml_plr_fixture['se_ext'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_boot(dml_plr_fixture): + for bootstrap in dml_plr_fixture['boot_methods']: + assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], + dml_plr_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], + dml_plr_fixture['boot_t_stat' + bootstrap + '_ext'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_sensitivity(dml_plr_fixture): + sensitivity_element_names = ['sigma2', 'nu2', 'psi_sigma2', 'psi_nu2'] + for sensitivity_element in sensitivity_element_names: + assert np.allclose(dml_plr_fixture['sensitivity_elements'][sensitivity_element], + dml_plr_fixture['sensitivity_elements_manual'][sensitivity_element]) + + +@pytest.mark.ci +def test_dml_plr_sensitivity_rho0(dml_plr_fixture): + assert np.allclose(dml_plr_fixture['se'], + dml_plr_fixture['sensitivity_ses']['lower'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_plr_fixture['se'], + dml_plr_fixture['sensitivity_ses']['upper'], + rtol=1e-9, atol=1e-4) + + +@pytest.fixture(scope="module") +def 
dml_plr_ols_manual_fixture(generate_data1, score): + learner = LinearRegression() + boot_methods = ['Bayes', 'normal', 'wild'] + n_folds = 2 + n_rep_boot = 501 + + # collect data + data = generate_data1 + x_cols = data.columns[data.columns.str.startswith('X')].tolist() + + # Set machine learning methods for m & g + ml_l = clone(learner) + ml_g = clone(learner) + ml_m = clone(learner) + + obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) + if score == 'partialling out': + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, + n_folds=n_folds, + score=score) + else: + assert score == 'IV-type' + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, ml_g, + n_folds, + score=score) + + n = data.shape[0] + this_smpl = list() + xx = int(n/2) + this_smpl.append((np.arange(xx, n), np.arange(0, xx))) + this_smpl.append((np.arange(0, xx), np.arange(xx, n))) + smpls = [this_smpl] + dml_plr_obj.set_sample_splitting(smpls) + + dml_plr_obj.fit() + + y = data['y'].values + x = data.loc[:, x_cols].values + d = data['d'].values + + # add column of ones for intercept + o = np.ones((n, 1)) + x = np.append(x, o, axis=1) + + smpls = dml_plr_obj.smpls[0] + + l_hat = [] + l_hat_vec = np.full_like(y, np.nan) + for (train_index, test_index) in smpls: + ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0] + preds = np.dot(x[test_index], ols_est) + l_hat.append(preds) + l_hat_vec[test_index] = preds + + m_hat = [] + m_hat_vec = np.full_like(d, np.nan) + for (train_index, test_index) in smpls: + ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0] + preds = np.dot(x[test_index], ols_est) + m_hat.append(preds) + m_hat_vec[test_index] = preds + + g_hat = [] + if score == 'IV-type': + theta_initial = scipy.linalg.lstsq((d - m_hat_vec).reshape(-1, 1), y - l_hat_vec)[0] + for (train_index, test_index) in smpls: + ols_est = scipy.linalg.lstsq(x[train_index], + y[train_index] - d[train_index] * theta_initial)[0] + g_hat.append(np.dot(x[test_index], ols_est)) + 
+ res_manual, se_manual = plr_dml2(y, x, d, + l_hat, m_hat, g_hat, + smpls, score) + + res_dict = {'coef': dml_plr_obj.coef, + 'coef_manual': res_manual, + 'se': dml_plr_obj.se, + 'se_manual': se_manual, + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_plr(y, d, [res_manual], [se_manual], + [l_hat], [m_hat], [g_hat], + [smpls], score, bootstrap, n_rep_boot) + + np.random.seed(3141) + dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_plr_ols_manual_coef(dml_plr_ols_manual_fixture): + assert math.isclose(dml_plr_ols_manual_fixture['coef'], + dml_plr_ols_manual_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_ols_manual_se(dml_plr_ols_manual_fixture): + assert math.isclose(dml_plr_ols_manual_fixture['se'], + dml_plr_ols_manual_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture): + for bootstrap in dml_plr_ols_manual_fixture['boot_methods']: + assert np.allclose(dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap], + dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + + +@pytest.fixture(scope='module', + params=["nonrobust", "HC0", "HC1", "HC2", "HC3"]) +def cov_type(request): + return request.param + + +@pytest.mark.ci +def test_dml_plr_cate_gate(score, cov_type): + n = 9 + + # collect data + np.random.seed(42) + obj_dml_data = dml.datasets.make_plr_CCDDHNR2018(n_obs=n) + ml_l = LinearRegression() + ml_g = LinearRegression() + ml_m = LinearRegression() + + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_g, ml_m, ml_l, + n_folds=2, + score=score) + dml_plr_obj.fit() + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) 
+ cate = dml_plr_obj.cate(random_basis, cov_type=cov_type) + assert isinstance(cate, dml.DoubleMLBLP) + assert isinstance(cate.confint(), pd.DataFrame) + assert cate.blp_model.cov_type == cov_type + + groups_1 = pd.DataFrame( + np.column_stack([obj_dml_data.data['X1'] <= 0, + obj_dml_data.data['X1'] > 0.2]), + columns=['Group 1', 'Group 2']) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gate_1 = dml_plr_obj.gate(groups_1, cov_type=cov_type) + assert isinstance(gate_1, dml.utils.blp.DoubleMLBLP) + assert isinstance(gate_1.confint(), pd.DataFrame) + assert all(gate_1.confint().index == groups_1.columns.tolist()) + assert gate_1.blp_model.cov_type == cov_type + + np.random.seed(42) + groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gate_2 = dml_plr_obj.gate(groups_2, cov_type=cov_type) + assert isinstance(gate_2, dml.utils.blp.DoubleMLBLP) + assert isinstance(gate_2.confint(), pd.DataFrame) + assert all(gate_2.confint().index == ["Group_1", "Group_2"]) + assert gate_2.blp_model.cov_type == cov_type diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 3d99d93a5..3ed110f3c 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -45,10 +45,12 @@ def _fit(estimator, x, y, train_index, idx=None): def _dml_cv_predict( estimator, x, y, smpls=None, n_jobs=None, est_params=None, method="predict", return_train_preds=False, return_models=False -): +, smpls_is_partition=None): n_obs = x.shape[0] - smpls_is_partition = _check_is_partition(smpls, n_obs) + # TODO: Better name for smples_is_partition + if smpls_is_partition is None: + smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) 
manual_cv_predict = ( diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index 18153944c..d10145176 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -33,6 +33,7 @@ def __init__(self, n_obs, stratify=None): self.n_folds = n_folds + self.n_folds_inner = n_folds_inner self.n_rep = n_rep self.n_obs = n_obs self.stratify = stratify @@ -47,10 +48,10 @@ def __init__(self, if self.stratify is None: self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep) - self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner) + self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner, n_repeats=1) else: self.resampling = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_rep) - self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner) + self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner, n_repeats=1) def split_samples(self): all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] @@ -59,12 +60,12 @@ def split_samples(self): smpls_inner = [] for _ in range(self.n_rep): smpls_inner_rep = [] - for _, test in all_smpls: + for train, test in all_smpls: if self.stratify is None: - smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in self.resampling_inner.split(X=test)]) + smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in self.resampling_inner.split(X=train)]) else: - smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in - self.resampling_inner.split(X=np.zeros(len(test)), y=self.stratify[test])]) + smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in + self.resampling_inner.split(X=np.zeros(len(train)), y=self.stratify[train])]) smpls_inner.append(smpls_inner_rep) return smpls, smpls_inner From d729d0a2f1ea0752ddeb9c762452e46d3ad43f14 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Thu, 27 Feb 
2025 15:24:40 +0100 Subject: [PATCH 04/48] Changed data type of arrays --- doubleml/logistic/logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index 25ba37634..cfb9926ee 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -176,7 +176,7 @@ def _check_data(self, obj_dml_data): def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, n_jobs=None, est_params=None, method='predict'): res = {} - res['preds'] = np.zeros_like(y) + res['preds'] = np.zeros(d.shape, dtype=float) res['preds_inner'] = [] res['models'] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): @@ -289,7 +289,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa W_inner = [] - beta = np.zeros_like(d) + beta = np.zeros(d.shape, dtype=float) for i, (train, test) in enumerate(smpls): M_iteration = M_hat['preds_inner'][i][train] From 8fe7ca667519d79507fb0a8621bb51e54e2983a5 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Thu, 27 Feb 2025 15:30:40 +0100 Subject: [PATCH 05/48] Fix variable name --- doubleml/logistic/logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index cfb9926ee..ab10ceb87 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -176,7 +176,7 @@ def _check_data(self, obj_dml_data): def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, n_jobs=None, est_params=None, method='predict'): res = {} - res['preds'] = np.zeros(d.shape, dtype=float) + res['preds'] = np.zeros(y.shape, dtype=float) res['preds_inner'] = [] res['models'] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): From 18bac23cbc95b0e6d25af918f1924202cea231b5 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 27 Aug 2025 10:22:19 +0200 
Subject: [PATCH 06/48] Moved into plm folder, started testing setup --- doubleml/__init__.py | 2 + doubleml/datasets.py | 1753 +++++++++++++++++ doubleml/double_ml_data.py | 55 +- doubleml/plm/__init__.py | 1 + doubleml/{logistic => plm}/logistic.py | 103 +- .../tests/_utils_logistic_manual.py | 37 +- .../{logistic => plm}/tests/tests_logistic.py | 51 +- doubleml/utils/_estimation.py | 30 +- 8 files changed, 1906 insertions(+), 126 deletions(-) create mode 100644 doubleml/datasets.py rename doubleml/{logistic => plm}/logistic.py (87%) rename doubleml/{logistic => plm}/tests/_utils_logistic_manual.py (87%) rename doubleml/{logistic => plm}/tests/tests_logistic.py (85%) diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 935491167..ba59a07e0 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,6 +13,8 @@ from .irm.pq import DoubleMLPQ from .irm.qte import DoubleMLQTE from .irm.ssm import DoubleMLSSM +from doubleml.plm.logistic import DoubleMLLogit + from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR from .logistic.logistic import DoubleMLLogit diff --git a/doubleml/datasets.py b/doubleml/datasets.py new file mode 100644 index 000000000..629a033aa --- /dev/null +++ b/doubleml/datasets.py @@ -0,0 +1,1753 @@ +import pandas as pd +import numpy as np +import warnings + +from scipy.linalg import toeplitz +from scipy.optimize import minimize_scalar +from scipy.special import expit + +from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder +from sklearn.datasets import make_spd_matrix + +from .double_ml_data import DoubleMLData, DoubleMLClusterData + +_array_alias = ['array', 'np.ndarray', 'np.array', np.ndarray] +_data_frame_alias = ['DataFrame', 'pd.DataFrame', pd.DataFrame] +_dml_data_alias = ['DoubleMLData', DoubleMLData] +_dml_cluster_data_alias = ['DoubleMLClusterData', DoubleMLClusterData] + + +def fetch_401K(return_type='DoubleMLData', polynomial_features=False): + """ + Data set on financial wealth and 401(k) 
plan participation. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of + Econometrics, 113(2): 231-263. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + url = 'https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta' + raw_data = pd.read_stata(url) + + y_col = 'net_tfa' + d_cols = ['e401'] + x_cols = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown'] + + data = raw_data.copy() + + if polynomial_features: + raise NotImplementedError('polynomial_features os not implemented yet for fetch_401K.') + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError('Invalid return_type.') + + +def fetch_bonus(return_type='DoubleMLData', polynomial_features=False): + """ + Data set on the Pennsylvania Reemployment Bonus experiment. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Bilias Y. 
(2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. + Journal of Applied Econometrics, 15(6): 575-594. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + url = 'https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat' + raw_data = pd.read_csv(url, sep='\s+') + + ind = (raw_data['tg'] == 0) | (raw_data['tg'] == 4) + data = raw_data.copy()[ind] + data.reset_index(inplace=True) + data['tg'] = data['tg'].replace(4, 1) + data['inuidur1'] = np.log(data['inuidur1']) + + # variable dep as factor (dummy encoding) + dummy_enc = OneHotEncoder(drop='first', categories='auto').fit(data.loc[:, ['dep']]) + xx = dummy_enc.transform(data.loc[:, ['dep']]).toarray() + data['dep1'] = xx[:, 0] + data['dep2'] = xx[:, 1] + + y_col = 'inuidur1' + d_cols = ['tg'] + x_cols = ['female', 'black', 'othrace', + 'dep1', 'dep2', + 'q2', 'q3', 'q4', 'q5', 'q6', + 'agelt35', 'agegt54', 'durable', 'lusd', 'husd'] + + if polynomial_features: + poly = PolynomialFeatures(2, include_bias=False) + data_transf = poly.fit_transform(data[x_cols]) + x_cols = list(poly.get_feature_names_out(x_cols)) + + data_transf = pd.DataFrame(data_transf, columns=x_cols) + data = pd.concat((data[[y_col] + d_cols], data_transf), + axis=1, sort=False) + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError('Invalid return_type.') + + +def _g(x): + return np.power(np.sin(x), 2) + + +def _m(x, nu=0., gamma=1.): + return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) + + +def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): + """ + Generates data from a 
partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. + The data generating process is defined as + + .. math:: + + d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), + + y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. + The nuisance functions are given by + + .. math:: + + m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, + + g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + alpha : + The value of the causal parameter. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. + + References + ---------- + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + a_0 = kwargs.get('a_0', 1.) + a_1 = kwargs.get('a_1', 0.25) + s_1 = kwargs.get('s_1', 1.) + + b_0 = kwargs.get('b_0', 1.) + b_1 = kwargs.get('b_1', 0.25) + s_2 = kwargs.get('s_2', 1.) 
+ + cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + d = a_0 * x[:, 0] + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) \ + + s_1 * np.random.standard_normal(size=[n_obs, ]) + y = alpha * d + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) \ + + b_1 * x[:, 2] + s_2 * np.random.standard_normal(size=[n_obs, ]) + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type='DoubleMLData', **kwargs): + """ + Generates data from a partially linear regression model used in a blog article by Turrell (2018). + The data generating process is defined as + + .. math:: + + d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), + + y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), + + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, + positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. + :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by + + .. math:: + + m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, + + g_0(x_i) &= \\sin(x_i)^2. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
+ + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`\\nu=0`, or :math:`\\gamma=1`. + + References + ---------- + Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, + science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ + `_. + """ + nu = kwargs.get('nu', 0.) + gamma = kwargs.get('gamma', 1.) + + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) + G = _g(np.dot(x, b)) + M = _m(np.dot(x, b), nu=nu, gamma=gamma) + d = M + np.random.standard_normal(size=[n_obs, ]) + y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type='DoubleMLData'): + """ + Generates data from a interactive regression (IRM) model. + The data generating process is defined as + + .. 
math:: + + d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i + \\sim \\mathcal{U}(0,1), + + y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and + :math:`c_d` are given by + + .. math:: + + c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = + \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. + + The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni + et al. (2017). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + R2_d : + The value of the parameter :math:`R_d^2`. + R2_y : + The value of the parameter :math:`R_y^2`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + + References + ---------- + Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With + High‐Dimensional Data. Econometrica, 85: 233-298. 
+ """ + # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement + v = np.random.uniform(size=[n_obs, ]) + zeta = np.random.standard_normal(size=[n_obs, ]) + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] + b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) + c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) + c_d = np.sqrt(np.pi ** 2 / 3. * R2_d / ((1 - R2_d) * b_sigma_b)) + + xx = np.exp(np.dot(x, np.multiply(beta, c_d))) + d = 1. * ((xx / (1 + xx)) > v) + + y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_iivm_data(n_obs=500, dim_x=20, theta=1., alpha_x=0.2, return_type='DoubleMLData'): + """ + Generates data from a interactive IV regression (IIVM) model. + The data generating process is defined as + + .. math:: + + d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, + + y_i &= \\theta d_i + x_i' \\beta + u_i, + + with :math:`Z \\sim \\text{Bernoulli}(0.5)` and + + .. math:: + + \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). + + The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries + :math:`\\beta_j=\\frac{1}{j^2}`. 
+ + The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and + Klaassen (2020). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + alpha_x : + The value of the parameter :math:`\\alpha_x`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion + Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. + """ + # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 + xx = np.random.multivariate_normal(np.zeros(2), + np.array([[1., 0.3], [0.3, 1.]]), + size=[n_obs, ]) + u = xx[:, 0] + v = xx[:, 1] + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] + + z = np.random.binomial(p=0.5, n=1, size=[n_obs, ]) + d = 1. 
* (alpha_x * z + v > 0) + + y = d * theta + np.dot(x, beta) + u + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=x_cols + ['y', 'd', 'z']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, 'z') + else: + raise ValueError('Invalid return_type.') + + +def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type='DoubleMLData'): + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) + G = _g(np.dot(x, b)) + # instrument + z = _m(np.dot(x, b)) + np.random.standard_normal(size=[n_obs, ]) + # treatment + M = _m(gamma_z * z + np.dot(x, b)) + d = M + np.random.standard_normal(size=[n_obs, ]) + y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=x_cols + ['y', 'd', 'z']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, 'z') + else: + raise ValueError('Invalid return_type.') + + +def make_pliv_CHS2015(n_obs, alpha=1., dim_x=200, dim_z=150, return_type='DoubleMLData'): + """ + Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). + The data generating process is defined as + + .. math:: + + z_i &= \\Pi x_i + \\zeta_i, + + d_i &= x_i' \\gamma + z_i' \\delta + u_i, + + y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, + + with + + .. 
math:: + + \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ + 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) + + where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. + :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, + :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` + and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + alpha : + The value of the causal parameter. + dim_x : + The number of covariates. + dim_z : + The number of instruments. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear + Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. 
+ """ + assert dim_x >= dim_z + # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf + xx = np.random.multivariate_normal(np.zeros(2), + np.array([[1., 0.6], [0.6, 1.]]), + size=[n_obs, ]) + epsilon = xx[:, 0] + u = xx[:, 1] + + sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), + sigma, + size=[n_obs, ]) + + I_z = np.eye(dim_z) + xi = np.random.multivariate_normal(np.zeros(dim_z), + 0.25 * I_z, + size=[n_obs, ]) + + beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] + gamma = beta + delta = [1 / (k ** 2) for k in range(1, dim_z + 1)] + Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) + + z = np.dot(x, np.transpose(Pi)) + xi + d = np.dot(x, gamma) + np.dot(z, delta) + u + y = alpha * d + np.dot(x, beta) + epsilon + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + z_cols = [f'Z{i + 1}' for i in np.arange(dim_z)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=x_cols + ['y', 'd'] + z_cols) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, z_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1., return_type='DoubleMLClusterData', **kwargs): + """ + Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. + (2021). The data generating process is defined as + + .. math:: + + Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, + + D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, + + Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, + + with + + .. 
math:: + + X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X + + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, + + \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon + + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, + + v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v + + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, + + V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V + + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, + + and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` + where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries + :math:`\\Sigma_{kj} = s_X^{|j-k|}`. + Further + + .. math:: + + \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), + \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), + \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) + \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ + s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) + + + and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. + + Parameters + ---------- + N : + The number of observations (first dimension). + M : + The number of observations (second dimension). + dim_X : + The number of covariates. + theta : + The value of the causal parameter. + return_type : + If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where + ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s + ``(x, y, d, cluster_vars, z)``. 
+ **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, + :math:`s_X = s_{\\varepsilon v} = 0.25`, + or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries + :math:`(\\zeta_{0})_j = 0.5^j`. + + References + ---------- + Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, + Journal of Business & Economic Statistics, + doi: `10.1080/07350015.2021.1895815 `_, + arXiv:`1909.03489 `_. + """ + # additional parameters specifiable via kwargs + pi_10 = kwargs.get('pi_10', 1.0) + + xx = np.arange(1, dim_X + 1) + zeta_0 = kwargs.get('zeta_0', np.power(0.5, xx)) + pi_20 = kwargs.get('pi_20', np.power(0.5, xx)) + xi_0 = kwargs.get('xi_0', np.power(0.5, xx)) + + omega_X = kwargs.get('omega_X', np.array([0.25, 0.25])) + omega_epsilon = kwargs.get('omega_epsilon', np.array([0.25, 0.25])) + omega_v = kwargs.get('omega_v', np.array([0.25, 0.25])) + omega_V = kwargs.get('omega_V', np.array([0.25, 0.25])) + + s_X = kwargs.get('s_X', 0.25) + s_epsilon_v = kwargs.get('s_epsilon_v', 0.25) + + # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., + # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] + # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] + + alpha_V = np.random.normal(size=(N * M)) + alpha_V_i = np.repeat(np.random.normal(size=N), M) + alpha_V_j = np.tile(np.random.normal(size=M), N) + + cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) + alpha_eps_v = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N * M, ]) + alpha_eps = alpha_eps_v[:, 0] + alpha_v = alpha_eps_v[:, 1] + + alpha_eps_v_i = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N, ]) + alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) + alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) + + alpha_eps_v_j = 
np.random.multivariate_normal(np.zeros(2), cov_mat, size=[M, ]) + alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) + alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) + + cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) + alpha_X = np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N * M, ]) + alpha_X_i = np.repeat(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N, ]), + M, axis=0) + alpha_X_j = np.tile(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[M, ]), + (N, 1)) + + # generate variables + x = (1 - omega_X[0] - omega_X[1]) * alpha_X \ + + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j + + eps = (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps \ + + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j + + v = (1 - omega_v[0] - omega_v[1]) * alpha_v \ + + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j + + V = (1 - omega_V[0] - omega_V[1]) * alpha_V \ + + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j + + z = np.matmul(x, xi_0) + V + d = z * pi_10 + np.matmul(x, pi_20) + v + y = d * theta + np.matmul(x, zeta_0) + eps + + cluster_cols = ['cluster_var_i', 'cluster_var_j'] + cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) + + if return_type in _array_alias: + return x, y, d, cluster_vars.values, z + elif return_type in _data_frame_alias + _dml_cluster_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_X)] + data = pd.concat((cluster_vars, + pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ['Y', 'D', 'Z'])), + axis=1) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLClusterData(data, 'Y', 'D', cluster_cols, x_cols, 'Z') + else: + raise ValueError('Invalid return_type.') + + +def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type='DoubleMLData', **kwargs): + """ + Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). 
+ The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let + + .. math:: + + f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4), + + f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4). + + + Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. + At first define + + .. math:: + + Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0, + + Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d), + + p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))}, + + D &= 1\\{p(W_{ps}) \\ge U\\}, + + where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables, + :math:`U \\sim \\mathcal{U}[0, 1]` is a independent standard uniform + and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`. + The different data generating processes are defined via + + .. math:: + + DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z + + DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X + + DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z + + DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X + + DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 + + DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0, + + such that the last two settings correspond to an experimental setting with treatment probability + of :math:`P(D=1) = \\frac{1}{2}.` + For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`. 
+ For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``. + Then the outcome will be defined to be + + .. math:: + + Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0), + + where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`. + The true average treatment effect on the treated is zero for all data generating processes. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dgp_type : + The DGP to be used. Default value is ``1`` (integer). + cross_sectional_data : + Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)`` + or ``(x, y, d, t)``. + **kwargs + Additional keyword arguments to set non-default values for the parameter + :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`. + + References + ---------- + Sant’Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_. 
+ """ + xi = kwargs.get('xi', 0.75) + c = kwargs.get('c', 0.0) + lambda_t = kwargs.get('lambda_t', 0.5) + + def f_reg(w): + res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) + return res + + def f_ps(w, xi): + res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + dim_x = 4 + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs) + epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2]) + + if dgp_type == 1: + features_ps = z + features_reg = z + elif dgp_type == 2: + features_ps = x + features_reg = z + elif dgp_type == 3: + features_ps = z + features_reg = x + elif dgp_type == 4: + features_ps = x + features_reg = x + elif dgp_type == 5: + features_ps = None + features_reg = z + elif dgp_type == 6: + features_ps = None + features_reg = x + else: + raise ValueError('The dgp_type is not valid.') + + # treatment and propensities + is_experimental = (dgp_type == 5) or (dgp_type == 6) + if is_experimental: + # Set D to be experimental + p = 0.5 * np.ones(n_obs) + else: + p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) + u = np.random.uniform(low=0, high=1, size=n_obs) + d = 1.0 * (p >= u) + + # potential outcomes + nu = np.random.normal(loc=d * f_reg(features_reg), scale=1, size=n_obs) + y0 = f_reg(features_reg) + nu + epsilon_0 + y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0] + y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1] + y1 = d * y1_d1 + (1 - d) * y1_d0 + + if not cross_sectional_data: + y = 
y1 - y0 + + if return_type in _array_alias: + return z, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((z, y, d)), + columns=z_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', z_cols) + else: + raise ValueError('Invalid return_type.') + + else: + u_t = np.random.uniform(low=0, high=1, size=n_obs) + t = 1.0 * (u_t <= lambda_t) + y = t * y1 + (1 - t) * y0 + + if return_type in _array_alias: + return z, y, d, t + elif return_type in _data_frame_alias + _dml_data_alias: + z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((z, y, d, t)), + columns=z_cols + ['y', 'd', 't']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', z_cols, t_col='t') + else: + raise ValueError('Invalid return_type.') + + +def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): + """ + Generates counfounded data from an interactive regression model. + + The data generating process is defined as follows (inspired by the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). + + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + At first, define the propensity score as + + .. math:: + + m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A + + where + + .. 
math:: + + p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, + + f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). + + and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. + Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as + + .. math:: + + P(D=1|X) = p(Z). + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. + This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) + \\cdot D (Z_5 + 1) + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be + set via the parameters ``gamma_a`` and ``beta_a``. + + The observed data is given as :math:`W = (Y, D, Z)`. + Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`, + the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and + in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) + are returned in a dictionary. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``0.0``. + gamma_a : float + Coefficient of the unobserved confounder in the propensity score. + Default is ``0.127``.
+ beta_a : float + Coefficient of the unobserved confounder in the outcome regression. + Default is ``0.58``. + linear : bool + If ``True``, the Z will be set to X, such that the underlying (short) models are linear/logistic. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant’Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_. + """ + c = 0.0 # the confounding strength is only valid for c=0 + xi = 0.75 + dim_x = kwargs.get('dim_x', 5) + trimming_threshold = kwargs.get('trimming_threshold', 0.01) + var_eps_y = kwargs.get('var_eps_y', 1.0) + + # Specification of main regression function + def f_reg(w): + res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) + return res + + # Specification of prop score function + def f_ps(w, xi): + res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + z_tilde_5 = x[:, 4] + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + # error terms and unobserved confounder + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # Choose the features used in the models + if linear: + features_ps = x + features_reg = x + else: + 
features_ps = z + features_reg = z + + p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) + # compute short and long form of propensity score + m_long = p + gamma_a * a + m_short = p + # check propensity score bounds + if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): + m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) + m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) + warnings.warn(f'Propensity score is close to 0 or 1. ' + f'Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied') + # generate treatment based on long form + u = np.random.uniform(low=0, high=1, size=n_obs) + d = 1.0 * (m_long >= u) + # add treatment heterogeneity + d1x = z[:, 4] + 1 + var_dx = np.var(d * (d1x)) + cov_adx = gamma_a * var_a + # Outcome regression + g_partial_reg = f_reg(features_reg) + # short model + g_short_d0 = g_partial_reg + g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg + g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 + # long model + g_long_d0 = g_partial_reg + beta_a * a + g_long_d1 = theta * d1x + g_partial_reg + beta_a * a + g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 + # Potential outcomes + y_0 = g_long_d0 + eps_y + y_1 = g_long_d1 + eps_y + # Realized outcome + y = d * y_1 + (1.0 - d) * y_0 + # In-sample values for confounding strength + explained_residual_variance = np.square(g_long - g_short) + residual_variance = np.square(y - g_short) + cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) + # compute the Riesz representation + treated_weight = d / np.mean(d) + untreated_weight = (1.0 - d) / np.mean(d) + # Odds ratios + propensity_ratio_long = m_long / (1.0 - m_long) + rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) + rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) + propensity_ratio_short = m_short / (1.0 - m_short) + rr_short_ate = d / m_short - (1.0 
- d) / (1.0 - m_short) + rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) + cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( + 1 / (m_long * (1 - m_long))) + cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) + if (beta_a == 0) | (gamma_a == 0): + rho_ate = 0.0 + rho_atte = 0.0 + else: + rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] + rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] + oracle_values = { + 'g_long': g_long, + 'g_short': g_short, + 'm_long': m_long, + 'm_short': m_short, + 'gamma_a': gamma_a, + 'beta_a': beta_a, + 'a': a, + 'y_0': y_0, + 'y_1': y_1, + 'z': z, + 'cf_y': cf_y, + 'cf_d_ate': cf_d_ate, + 'cf_d_atte': cf_d_atte, + 'rho_ate': rho_ate, + 'rho_atte': rho_atte, + } + res_dict = { + 'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values + } + return res_dict + + +def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): + """ + Generates confounded data from a partially linear regression model. + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, + where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + At first, define the treatment as + + ..
math:: + + D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D + + and with :math:`\\varepsilon \\sim \\mathcal{N}(0,1)`. + Since :math:`A` is independent of :math:`X`, the long and short forms of the treatment regression are given as + + .. math:: + + E[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + + E[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. + This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. + Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample). + + The observed data is given as :math:`W = (Y, D, X)`. + Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`, the effect :math:`\\theta`, + the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and + the propensity score are returned in a dictionary. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``5.0``. + cf_y : float + Percentage of the residual variation of the outcome explained by latent/confounding variable. + Default is ``0.04``.
+ cf_d : float + Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable. + Default is ``0.04``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant’Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_. + """ + c = kwargs.get('c', 0.0) + dim_x = kwargs.get('dim_x', 4) + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # get the required impact of the confounder on the propensity score + m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] + + def f_m(gamma_a): + rr_long = eps_d / var_eps_d + rr_short = (gamma_a * a + eps_d) / (gamma_a ** 2 * var_a + var_eps_d) + C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) + return np.square(C2_D / (1 + C2_D) - cf_d) + + gamma_a = minimize_scalar(f_m).x + m_long = m_short + gamma_a * a + d = m_long + eps_d + + # short and long version of g + g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] 
+ z[:, 3]) + + var_d = np.var(d) + + def f_g(beta_a): + g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) + y_diff = eps_y + g_diff + return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) + + beta_a = minimize_scalar(f_g).x + + g_long = theta * d + g_partial_reg + beta_a * a + g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg + + y = g_long + eps_y + + oracle_values = {'g_long': g_long, + 'g_short': g_short, + 'm_long': m_long, + 'm_short': m_short, + 'theta': theta, + 'gamma_a': gamma_a, + 'beta_a': beta_a, + 'a': a, + 'z': z} + + res_dict = {'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values} + + return res_dict + + +def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): + """ + Creates a simple synthetic example for heterogeneous treatment effects. + The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). + + The data is generated as + + .. math:: + + Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i + + D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, + + where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i + \\sim\\mathcal{U}[-1,1]`. + If the treatment is set to be binary, the treatment is generated as + + .. math:: + D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. + + The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support + which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. + Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending + on the dimension of :math:`x`. + + If the heterogeneity is univariate the conditional treatment effect takes the following form + + .. math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), + + whereas for the two-dimensional case the conditional treatment effect is defined as + + .. 
math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + Default is ``200``. + + p : int + Dimension of covariates. + Default is ``30``. + + support_size : int + Number of relevant (confounding) covariates. + Default is ``5``. + + n_x : int + Dimension of the heterogeneity. Can be either ``1`` or ``2``. + Default is ``1``. + + binary_treatment : bool + Indicates whether the treatment is binary. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``data``, ``effects``, ``treatment_effect``. + + """ + # simple input checks + assert n_x in [1, 2], 'n_x must be either 1 or 2.' + assert support_size <= p, 'support_size must be smaller than p.' + assert isinstance(binary_treatment, bool), 'binary_treatment must be a boolean.' + + # define treatment effects + if n_x == 1: + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) + else: + assert n_x == 2 + + # redefine treatment effect + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) + + # Outcome support and coefficients + support_y = np.random.choice(np.arange(p), size=support_size, replace=False) + coefs_y = np.random.uniform(0, 1, size=support_size) + # treatment support and coefficients + support_d = support_y + coefs_d = np.random.uniform(0, 0.3, size=support_size) + + # noise + epsilon = np.random.uniform(-1, 1, size=n_obs) + eta = np.random.uniform(-1, 1, size=n_obs) + + # Generate controls, covariates, treatments and outcomes + x = np.random.uniform(0, 1, size=(n_obs, p)) + # Heterogeneous treatment effects + te = treatment_effect(x) + if binary_treatment: + d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) + else: + d = np.dot(x[:, support_d], coefs_d) + eta + y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon + + # Now we build the dataset + y_df = pd.DataFrame({'y': y}) + d_df = pd.DataFrame({'d': d}) + x_df = pd.DataFrame( + 
data=x, + index=np.arange(x.shape[0]), + columns=[f'X_{i}' for i in range(x.shape[1])] + ) + + data = pd.concat([y_df, d_df, x_df], axis=1) + res_dict = { + 'data': data, + 'effects': te, + 'treatment_effect': treatment_effect} + return res_dict + + +def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleMLData'): + """ + Generates data from a sample selection model (SSM). + The data generating process is defined as + + .. math:: + + y_i &= \\theta d_i + x_i' \\beta d_i + u_i, + + s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, + + d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, + + with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where + :math:`\\Sigma^2_x` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` + :math:`z_i \\sim \\mathcal{N}(0, 1)`, + :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, + :math:`w_i \\sim \\mathcal{N}(0, 1)`. + + + The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, + Huber and Lafférs (2023). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + mar: + Boolean. Indicates whether missingness at random holds. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. 
+ + References + ---------- + Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, + Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 + """ + if mar: + sigma = np.array([[1, 0], [0, 1]]) + gamma = 0 + else: + sigma = np.array([[1, 0.8], [0.8, 1]]) + gamma = 1 + + e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + beta = [0.4 / (k ** 2) for k in range(1, dim_x + 1)] + + d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) + z = np.random.randn(n_obs) + s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) + + y = np.dot(x, beta) + theta * d + e[1] + y[s == 0] = 0 + + if return_type in _array_alias: + return x, y, d, z, s + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + if mar: + data = pd.DataFrame(np.column_stack((x, y, d, s)), + columns=x_cols + ['y', 'd', 's']) + else: + data = pd.DataFrame(np.column_stack((x, y, d, z, s)), + columns=x_cols + ['y', 'd', 'z', 's']) + if return_type in _data_frame_alias: + return data + else: + if mar: + return DoubleMLData(data, 'y', 'd', x_cols, None, None, 's') + return DoubleMLData(data, 'y', 'd', x_cols, 'z', None, 's') + else: + raise ValueError('Invalid return_type.') + + +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): + """ + Generates data from an interactive regression (IRM) model with multiple treatment levels (based on an + underlying continuous treatment). + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). + + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix.
+ Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + A continuous treatment :math:`D_{\\text{cont}}` is generated as + + .. math:: + + D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, + + where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment + effect is defined as + + .. math:: + + \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. + + Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of + :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels + is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. + + The potential outcomes are defined as + + .. math:: + + Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y + + Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), + + where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as + + .. math:: + + Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. + + The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``200``. + + n_levels : int + The number of treatment levels. + Default is ``3``. + + linear : bool + Indicates whether the true underlying regression is linear. + Default is ``False``. + + random_state : int + Random seed for reproducibility. + Default is ``42``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. 
+ The oracle values contain the continuous treatment, the level bounds, the potential level, ITE + and the potential outcome without treatment. + + """ + if random_state is not None: + np.random.seed(random_state) + xi = kwargs.get('xi', 0.3) + c = kwargs.get('c', 0.0) + dim_x = kwargs.get('dim_x', 5) + + if not isinstance(n_levels, int): + raise ValueError('n_levels must be an integer.') + if n_levels < 2: + raise ValueError('n_levels must be at least 2.') + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + def f_reg(w): + res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) + return res + + def f_treatment(w, xi): + res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + def treatment_effect(d, scale=15): + return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + if linear: + g = f_reg(x) + m = f_treatment(x, xi) + else: + assert not linear + g = f_reg(z) + m = f_treatment(z, xi) + + cont_d = m + eps_d + level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) + potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 + eta = np.random.uniform(0, 1, size=n_obs) + d = 1.0 * (eta >= 1 / n_levels) * potential_level + + ite = treatment_effect(cont_d) + y0 = g + eps_y + # only treated for d > 0 compared to the baseline + y = ite * (d > 0) + 
y0 + + oracle_values = { + 'cont_d': cont_d, + 'level_bounds': level_bounds, + 'potential_level': potential_level, + 'ite': ite, + 'y0': y0, + } + + resul_dict = { + 'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values + } + + return resul_dict + + +def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): + """ + Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), + designed for use in double/debiased machine learning applications. + + The data generating process is defined as follows: + + - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). + - Treatment \( d_i = a_0(x_i) \). + - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. + - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). + + The nuisance functions are defined as: + + .. math:: + + a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ + &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ + + r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + dim_x : int + Number of covariates. + alpha : float + Value of the causal parameter. + return_type : str + Determines the return format. One of: + + - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. + - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. + - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). 
+ + **kwargs + Optional keyword arguments (currently unused in this implementation). + + Returns + ------- + Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] + The generated data in the specified format. + + References + ---------- + Liu, Molei, Yi Zhang, and Doudou Zhou. 2021. + "Double/Debiased Machine Learning for Logistic Partially Linear Model." + The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019. + + """ + + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 1 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + + def a_0(X): + return 2 / (1 + np.exp(X[:, 0])) + \ + -2 / (1 + np.exp(X[:, 1])) + \ + 1 * np.sin(X[:, 2]) + \ + 1 * np.cos(X[:, 3]) + \ + 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ + -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ + 0.2 * X[:, 6] * X[:, 7] + \ + -0.2 * X[:, 8] * X[:, 9] + + + sigma = np.full((dim_x, dim_x), 0.2) + np.fill_diagonal(sigma, 1) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) + np.clip(x, -2, 2, out=x) + + d = a_0(x) + + p = expit(alpha * d[:] + r_0(x)) + + y = np.random.binomial(1, p) + + if return_type in _array_alias: + return x, y, d, p + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, p)), + columns=x_cols + ['y', 'd', 'p']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, p_cols='p') + else: + raise ValueError('Invalid return_type.') diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index fdee739dd..35c9af651 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -113,6 +113,10 @@ class DoubleMLData(DoubleMLBaseData): The 
score or selection variable (only relevant/used for RDD or SSM Estimatiors). Default is ``None``. + p_cols : None, str or list, optional + The column(s) containing the probabilities of the outcome (only for simulated, binary data). + Default is ``None``. + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. @@ -145,6 +149,7 @@ def __init__(self, z_cols=None, t_col=None, s_col=None, + p_cols=None, use_other_treat_as_covariate=True, force_all_x_finite=True): DoubleMLBaseData.__init__(self, data) @@ -155,6 +160,7 @@ def __init__(self, self.t_col = t_col self.s_col = s_col self.x_cols = x_cols + self.p_cols = p_cols self._check_disjoint_sets_y_d_x_z_t_s() self.use_other_treat_as_covariate = use_other_treat_as_covariate self.force_all_x_finite = force_all_x_finite @@ -187,7 +193,7 @@ def _data_summary_str(self): return data_summary @classmethod - def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covariate=True, + def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as_covariate=True, force_all_x_finite=True): """ Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. @@ -215,6 +221,10 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria Array of the score or selection variable (only relevant/used for RDD and SSM models). Default is ``None``. + p : None or :class:`numpy.ndarray` + Array of the probabilities of the outcome (only for simulated, binary data). + Default is ``None``. + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. 
@@ -299,7 +309,13 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria if s is not None: data[s_col] = s - return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite) + if p is not None: + if p.shape[1] == 1: + d_cols = ['p'] + else: + d_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] + + return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, p_cols, use_other_treat_as_covariate, force_all_x_finite) @property def x(self): @@ -358,6 +374,41 @@ def s(self): else: return None + @property + def p_cols(self): + """ + The column(s) containing the probabilities of the outcome (only for simulated data). + """ + return self._p_cols + + @p_cols.setter + def p_cols(self, value): + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The probability column(s) p_cols must be of str or list type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid probability column(s) p_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid probability column(s) p_cols. ' + 'At least one probability column is not a data column.') + self._p_cols = value + else: + self._p_cols = None + + @property + def p(self): + """ + Array of probabilities of the outcome (only for simulated data). 
+ """ + if self.p_cols is not None: + return self._p.values + else: + return None + @property def n_treat(self): """ diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index e81f00c52..88ff26a8a 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -8,4 +8,5 @@ __all__ = [ "DoubleMLPLR", "DoubleMLPLIV", + "DoubleMLLogit" ] diff --git a/doubleml/logistic/logistic.py b/doubleml/plm/logistic.py similarity index 87% rename from doubleml/logistic/logistic.py rename to doubleml/plm/logistic.py index ab10ceb87..d48fb29d3 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/plm/logistic.py @@ -1,5 +1,9 @@ +import inspect + import numpy as np -from ..utils._estimation import ( +from torch.sparse import sampled_addmm + +from doubleml.utils._estimation import ( _dml_cv_predict, _trimm, _predict_zero_one_propensity, @@ -15,12 +19,12 @@ import scipy from sklearn.utils.multiclass import type_of_target -from .. import DoubleMLData -from ..double_ml import DoubleML -from ..double_ml_score_mixins import NonLinearScoreMixin -from ..utils import DoubleMLClusterResampling -from ..utils._checks import _check_score, _check_finite_predictions, _check_is_propensity -from ..utils.resampling import DoubleMLDoubleResampling +from doubleml import DoubleMLData +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import NonLinearScoreMixin +from doubleml.utils import DoubleMLClusterResampling +from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from doubleml.utils.resampling import DoubleMLDoubleResampling @@ -61,7 +65,7 @@ class DoubleMLLogit(NonLinearScoreMixin, DoubleML): Default is ``1``. score : str or callable - A str (``'partialling out'`` or ``'IV-type'``) specifying the score function + A str (``'nuisance_space'`` or ``'instrument'``) specifying the score function or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. 
Default is ``'partialling out'``. @@ -103,14 +107,14 @@ class DoubleMLLogit(NonLinearScoreMixin, DoubleML): def __init__(self, obj_dml_data, - ml_m, ml_M, ml_t, + ml_m, ml_a=None, n_folds=5, n_folds_inner=5, n_rep=1, - score='logistic', + score='nuisance_space', draw_sample_splitting=True): self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, @@ -122,12 +126,16 @@ def __init__(self, self._coef_start_val = 1.0 self._check_data(self._dml_data) - valid_scores = ['logistic'] + valid_scores = ['nuisance_space', 'instrument'] _check_score(self.score, valid_scores, allow_callable=True) _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) - ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=True) + + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) + else: + ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=False) self._learner = {'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} if ml_a is not None: @@ -157,6 +165,11 @@ def __init__(self, else: self._predict_method['ml_a'] = 'predict' + if score == 'instrument': + sig = inspect.signature(self.learner['ml_a'].fit) + if not 'sample_weight' in sig.parameters: + raise ValueError('Learner \"ml_a\" who supports sample_weight is required for score type \"instrument\"') + self._initialize_ml_nuisance_params() self._external_predictions_implemented = True @@ -174,7 +187,7 @@ def _check_data(self, obj_dml_data): def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, - n_jobs=None, est_params=None, method='predict'): + n_jobs=None, est_params=None, method='predict', sample_weights=None): res = {} res['preds'] = np.zeros(y.shape, dtype=float) res['preds_inner'] = [] @@ -182,7 +195,7 @@ def _double_dml_cv_predict(self, estimator, 
estimator_name, x, y, smpls=None, s for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, est_params=est_params, method=method, - return_models=True, smpls_is_partition=True) + return_models=True, smpls_is_partition=True, sample_weights=sample_weights) _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) res['preds_inner'].append(res_inner['preds']) @@ -214,19 +227,41 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa else: a_external = False + if M_external: + M_hat = {'preds': external_predictions['ml_M'], + 'targets': None, + 'models': None} + else: + M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) + + # TODO + #if self._score_type == "instrument": + + # nuisance m if m_external: m_hat = {'preds': external_predictions['ml_m'], 'targets': None, 'models': None} else: - filtered_smpls = [] - for train, test in smpls: - train_filtered = train[y[train] == 0] - filtered_smpls.append((train_filtered, test)) - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models) + if self.score == 'instrument': + weights = [] + for i, (train, test) in enumerate(smpls): + weights.append( M_hat['preds_inner'][i][train] * (1-M_hat['preds_inner'][i][train])) + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], + return_models=return_models, weights=weights) + + else: + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + 
filtered_smpls.append((train_filtered, test)) + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], + return_models=return_models) _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): @@ -242,14 +277,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'probabilities and not labels are predicted.') - if M_external: - M_hat = {'preds': external_predictions['ml_M'], - 'targets': None, - 'models': None} - else: - M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) + if a_external: a_hat = {'preds': external_predictions['ml_a'], @@ -456,15 +484,22 @@ def set_sample_splitting(self): def _compute_score(self, psi_elements, coef): - score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] - + if self._score_type == 'nuisance_space': + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] + score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + else: + score = (psi_elements["y"] - np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] - return psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + return score def _compute_score_deriv(self, psi_elements, coef, inds=None): - deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + if self._score_type == 'nuisance_space': + deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + else: + deriv = - psi_elements["d"] * np.exp(coef * 
psi_elements["d"]+ psi_elements["r_hat"]) * psi_elements["d_tilde"] - return psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + return deriv def cate(self, basis, is_gate=False): diff --git a/doubleml/logistic/tests/_utils_logistic_manual.py b/doubleml/plm/tests/_utils_logistic_manual.py similarity index 87% rename from doubleml/logistic/tests/_utils_logistic_manual.py rename to doubleml/plm/tests/_utils_logistic_manual.py index ae53992a6..af4d034eb 100644 --- a/doubleml/logistic/tests/_utils_logistic_manual.py +++ b/doubleml/plm/tests/_utils_logistic_manual.py @@ -2,8 +2,8 @@ import scipy from sklearn.base import clone, is_classifier -from ...tests._utils_boot import boot_manual, draw_weights -from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search +from doubleml.tests._utils_boot import boot_manual, draw_weights +from doubleml.tests._utils import fit_predict, fit_predict_proba, tune_grid_search def fit_logistic_multitreat(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, @@ -155,32 +155,6 @@ def fit_nuisance_logistic_classifier(y, x, d, learner_l, learner_m, learner_g, s return l_hat, m_hat, g_hat -def tune_nuisance_plr(y, x, d, ml_l, ml_m, ml_g, smpls, n_folds_tune, param_grid_l, param_grid_m, param_grid_g, tune_g=True): - l_tune_res = tune_grid_search(y, x, ml_l, smpls, param_grid_l, n_folds_tune) - - m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) - - if tune_g: - l_hat = np.full_like(y, np.nan) - m_hat = np.full_like(d, np.nan) - for idx, (train_index, _) in enumerate(smpls): - l_hat[train_index] = l_tune_res[idx].predict(x[train_index, :]) - m_hat[train_index] = m_tune_res[idx].predict(x[train_index, :]) - psi_a = -np.multiply(d - m_hat, d - m_hat) - psi_b = np.multiply(d - m_hat, y - l_hat) - theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) - - g_tune_res = tune_grid_search(y - theta_initial*d, x, ml_g, smpls, param_grid_g, n_folds_tune) - g_best_params = [xx.best_params_ for xx in 
g_tune_res] - else: - g_best_params = [] - - l_best_params = [xx.best_params_ for xx in l_tune_res] - m_best_params = [xx.best_params_ for xx in m_tune_res] - - return l_best_params, m_best_params, g_best_params - - def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): y_minus_l_hat = np.full_like(y, np.nan, dtype='float64') d_minus_m_hat = np.full_like(d, np.nan, dtype='float64') @@ -193,13 +167,6 @@ def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): return y_minus_l_hat, d_minus_m_hat, y_minus_g_hat -def plr_dml2(y, x, d, l_hat, m_hat, g_hat, smpls, score): - n_obs = len(y) - y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) - theta_hat = plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score) - se = np.sqrt(var_plr(theta_hat, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs)) - - return theta_hat, se def var_plr(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs): diff --git a/doubleml/logistic/tests/tests_logistic.py b/doubleml/plm/tests/tests_logistic.py similarity index 85% rename from doubleml/logistic/tests/tests_logistic.py rename to doubleml/plm/tests/tests_logistic.py index 2b97bf76b..a77db7a67 100644 --- a/doubleml/logistic/tests/tests_logistic.py +++ b/doubleml/plm/tests/tests_logistic.py @@ -11,8 +11,8 @@ import doubleml as dml -from ...tests._utils import draw_smpls -from ._utils_logistic_manual import fit_logistic, , boot_plr +from doubleml.tests._utils import draw_smpls +from ._utils_logistic_manual import fit_logistic, boot_plr @pytest.fixture(scope='module', @@ -304,49 +304,4 @@ def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture): @pytest.fixture(scope='module', params=["nonrobust", "HC0", "HC1", "HC2", "HC3"]) def cov_type(request): - return request.param - - -@pytest.mark.ci -def test_dml_plr_cate_gate(score, cov_type): - n = 9 - - # collect data - np.random.seed(42) - obj_dml_data = dml.datasets.make_plr_CCDDHNR2018(n_obs=n) - 
ml_l = LinearRegression() - ml_g = LinearRegression() - ml_m = LinearRegression() - - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_g, ml_m, ml_l, - n_folds=2, - score=score) - dml_plr_obj.fit() - random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) - cate = dml_plr_obj.cate(random_basis, cov_type=cov_type) - assert isinstance(cate, dml.DoubleMLBLP) - assert isinstance(cate.confint(), pd.DataFrame) - assert cate.blp_model.cov_type == cov_type - - groups_1 = pd.DataFrame( - np.column_stack([obj_dml_data.data['X1'] <= 0, - obj_dml_data.data['X1'] > 0.2]), - columns=['Group 1', 'Group 2']) - msg = ('At least one group effect is estimated with less than 6 observations.') - with pytest.warns(UserWarning, match=msg): - gate_1 = dml_plr_obj.gate(groups_1, cov_type=cov_type) - assert isinstance(gate_1, dml.utils.blp.DoubleMLBLP) - assert isinstance(gate_1.confint(), pd.DataFrame) - assert all(gate_1.confint().index == groups_1.columns.tolist()) - assert gate_1.blp_model.cov_type == cov_type - - np.random.seed(42) - groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) - msg = ('At least one group effect is estimated with less than 6 observations.') - with pytest.warns(UserWarning, match=msg): - gate_2 = dml_plr_obj.gate(groups_2, cov_type=cov_type) - assert isinstance(gate_2, dml.utils.blp.DoubleMLBLP) - assert isinstance(gate_2.confint(), pd.DataFrame) - assert all(gate_2.confint().index == ["Group_1", "Group_2"]) - assert gate_2.blp_model.cov_type == cov_type + return request.param \ No newline at end of file diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 3ed110f3c..6029dfd97 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -43,9 +43,9 @@ def _fit(estimator, x, y, train_index, idx=None): return estimator, idx -def _dml_cv_predict( - estimator, x, y, smpls=None, n_jobs=None, est_params=None, method="predict", return_train_preds=False, return_models=False -, smpls_is_partition=None): +def 
_dml_cv_predict(estimator, x, y, smpls=None, + n_jobs=None, est_params=None, method='predict', return_train_preds=False, return_models=False, + smpls_is_partition=None, sample_weights=None): n_obs = x.shape[0] # TODO: Better name for smples_is_partition @@ -53,9 +53,9 @@ def _dml_cv_predict( smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) - manual_cv_predict = ( - (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target | return_models - ) + manual_cv_predict = (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target \ + | return_models | bool(sample_weights) + #TODO: Check if cross_val_predict supports weights res = {"models": None} if not manual_cv_predict: @@ -187,6 +187,22 @@ def _draw_weights(method, n_rep_boot, n_obs): return weights +def _trimm(preds, trimming_rule, trimming_threshold): + if trimming_rule == 'truncate': + preds[preds < trimming_threshold] = trimming_threshold + preds[preds > 1 - trimming_threshold] = 1 - trimming_threshold + return preds + + +def _normalize_ipw(propensity, treatment): + mean_treat1 = np.mean(np.divide(treatment, propensity)) + mean_treat0 = np.mean(np.divide(1.0 - treatment, 1.0 - propensity)) + normalized_weights = np.multiply(treatment, np.multiply(propensity, mean_treat1)) \ + + np.multiply(1.0 - treatment, 1.0 - np.multiply(1.0 - propensity, mean_treat0)) + + return normalized_weights + + def _rmse(y_true, y_pred): subset = np.logical_not(np.isnan(y_true)) rmse = root_mean_squared_error(y_true[subset], y_pred[subset]) @@ -302,7 +318,7 @@ def _var_est(psi, psi_deriv, smpls, is_cluster_data, cluster_vars=None, smpls_cl J_l = test_cluster_inds[1] const = np.divide(min(len(I_k), len(J_l)), (np.square(len(I_k) * len(J_l)))) for cluster_value in I_k: - ind_cluster = (first_cluster_var == cluster_value) & 
np.isin(second_cluster_var, J_l) + ind_cluster = (first_cluster_var == cluster_value) & np.in1d(second_cluster_var, J_l) gamma_hat += const * np.sum(np.outer(psi[ind_cluster], psi[ind_cluster])) for cluster_value in J_l: ind_cluster = (second_cluster_var == cluster_value) & np.isin(first_cluster_var, I_k) From c6e600d2f67abf33aa59d8f074453c49ebd60c77 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 27 Aug 2025 19:18:16 +0200 Subject: [PATCH 07/48] Fixed bug in score computation --- doubleml/double_ml_data.py | 14 ++- doubleml/plm/logistic.py | 183 ++++++++----------------------------- 2 files changed, 49 insertions(+), 148 deletions(-) diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index 35c9af651..612e6b7f0 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -288,6 +288,15 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as check_consistent_length(x, y, d, s) s_col = 's' + + if p is None: + p_cols = None + else: + if p.shape[1] == 1: + p_cols = ['p'] + else: + p_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] + if d.shape[1] == 1: d_cols = ['d'] else: @@ -310,10 +319,7 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as data[s_col] = s if p is not None: - if p.shape[1] == 1: - d_cols = ['p'] - else: - d_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] + data[p_cols] = p return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, p_cols, use_other_treat_as_covariate, force_all_x_finite) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index d48fb29d3..3e04d15d5 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -215,9 +215,9 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # TODO: How to deal with smpls_inner? 
x, y = check_X_y(self._dml_data.x, self._dml_data.y, - force_all_finite=False) + ensure_all_finite=False) x, d = check_X_y(x, self._dml_data.d, - force_all_finite=False) + ensure_all_finite=False) x_d_concat = np.hstack((d.reshape(-1,1), x)) m_external = external_predictions['ml_m'] is not None M_external = external_predictions['ml_M'] is not None @@ -236,9 +236,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa n_jobs=n_jobs_cv, est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) - # TODO - #if self._score_type == "instrument": - # nuisance m if m_external: @@ -254,7 +251,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models, weights=weights) - else: + elif self.score == 'nuisance_space': filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] @@ -262,6 +259,8 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models) + else: + raise NotImplementedError _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): @@ -288,31 +287,32 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa n_jobs=n_jobs_cv, est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) - # r_legacy = np.zeros_like(y) - # smpls_inner = self.__smpls__inner - # M_hat = {} - # a_hat = {} - # M_hat['preds_inner'] = [] - # M_hat['preds'] = np.full_like(y, np.nan) - # a_hat['preds_inner'] = [] - # a_hat['preds'] = np.full_like(y, np.nan) - # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - 
# test = smpls_single_split[1] - # train = smpls_single_split[0] - # # r_legacy[test] = - # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], - # self._learner['ml_m'], self._learner['ml_M'], - # smpls_single_split, smpls_double_split, y, x, d, - # x_d_concat, n_jobs_cv) - # Mtemp = np.full_like(y, np.nan) - # Mtemp[train] = Mleg - # Atemp = np.full_like(y, np.nan) - # Atemp[train] = aleg - # M_hat['preds_inner'].append(Mtemp) - # a_hat['preds_inner'].append(Atemp) - # a_hat['preds'][test] = a_nf_leg - # - # #r_hat['preds'] = r_legacy + + r_legacy = np.zeros_like(y) + smpls_inner = self.__smpls__inner + M_hat_l = {} + a_hat_l = {} + M_hat_l['preds_inner'] = [] + M_hat_l['preds'] = np.full_like(y, np.nan) + a_hat_l['preds_inner'] = [] + a_hat_l['preds'] = np.full_like(y, np.nan) + for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + test = smpls_single_split[1] + train = smpls_single_split[0] + # r_legacy[test] = + Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], + self._learner['ml_m'], self._learner['ml_M'], + smpls_single_split, smpls_double_split, y, x, d, + x_d_concat, n_jobs_cv) + Mtemp = np.full_like(y, np.nan) + Mtemp[train] = Mleg + Atemp = np.full_like(y, np.nan) + Atemp[train] = aleg + M_hat_l['preds_inner'].append(Mtemp) + a_hat_l['preds_inner'].append(Atemp) + a_hat_l['preds'][test] = a_nf_leg + + #r_hat['preds'] = r_legacy @@ -343,10 +343,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa r_hat = {} r_hat['preds'] = t_hat['preds'] - beta * a_hat['preds'] - - - - psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) preds = {'predictions': {'ml_r': r_hat['preds'], @@ -484,124 +480,23 @@ def set_sample_splitting(self): def _compute_score(self, psi_elements, coef): - if self._score_type == 'nuisance_space': + if self.score == 'nuisance_space': score_1 = psi_elements["y"] * np.exp(-coef 
* psi_elements["d"]) * psi_elements["d_tilde"] score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) - else: + elif self.score == 'instrument': score = (psi_elements["y"] - np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] + else: + raise NotImplementedError return score def _compute_score_deriv(self, psi_elements, coef, inds=None): - if self._score_type == 'nuisance_space': + if self.score == 'nuisance_space': deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 - else: + elif self.score == 'instrument': deriv = - psi_elements["d"] * np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"]) * psi_elements["d_tilde"] - - return deriv - - - def cate(self, basis, is_gate=False): - """ - Calculate conditional average treatment effects (CATE) for a given basis. - - Parameters - ---------- - basis : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, - where ``n_obs`` is the number of observations and ``d`` is the number of predictors. - is_gate : bool - Indicates whether the basis is constructed for GATEs (dummy-basis). - Default is ``False``. - - Returns - ------- - model : :class:`doubleML.DoubleMLBLP` - Best linear Predictor model. - """ - if self._dml_data.n_treat > 1: - raise NotImplementedError('Only implemented for single treatment. ' + - f'Number of treatments is {str(self._dml_data.n_treat)}.') - if self.n_rep != 1: - raise NotImplementedError('Only implemented for one repetition. 
' + - f'Number of repetitions is {str(self.n_rep)}.') - - Y_tilde, D_tilde = self._partial_out() - - D_basis = basis * D_tilde - model = DoublelMLBLP( - orth_signal=Y_tilde.reshape(-1), - basis=D_basis, - is_gate=is_gate, - ) - model.fit() - - ## TODO: Solve score - - - return model - - def gate(self, groups): - """ - Calculate group average treatment effects (GATE) for groups. - - Parameters - ---------- - groups : :class:`pandas.DataFrame` - The group indicator for estimating the best linear predictor. Groups should be mutually exclusive. - Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations - and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str). - - Returns - ------- - model : :class:`doubleML.DoubleMLBLP` - Best linear Predictor model for Group Effects. - """ - - if not isinstance(groups, pd.DataFrame): - raise TypeError('Groups must be of DataFrame type. ' - f'Groups of type {str(type(groups))} was passed.') - if not all(groups.dtypes == bool) or all(groups.dtypes == int): - if groups.shape[1] == 1: - groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') - else: - raise TypeError('Columns of groups must be of bool type or int type (dummy coded). ' - 'Alternatively, groups should only contain one column.') - - if any(groups.sum(0) <= 5): - warnings.warn('At least one group effect is estimated with less than 6 observations.') - - model = self.cate(groups, is_gate=True) - return model - - def _partial_out(self): - """ - Helper function. Returns the partialled out quantities of Y and D. - Works with multiple repetitions. - - Returns - ------- - Y_tilde : :class:`numpy.ndarray` - The residual of the regression of Y on X. - D_tilde : :class:`numpy.ndarray` - The residual of the regression of D on X. - """ - if self.predictions is None: - raise ValueError('predictions are None. 
Call .fit(store_predictions=True) to store the predictions.') - - y = self._dml_data.y.reshape(-1, 1) - d = self._dml_data.d.reshape(-1, 1) - ml_m = self.predictions["ml_m"].squeeze(axis=2) - - if self.score == "partialling out": - ml_l = self.predictions["ml_l"].squeeze(axis=2) - Y_tilde = y - ml_l - D_tilde = d - ml_m else: - assert self.score == "IV-type" - ml_g = self.predictions["ml_g"].squeeze(axis=2) - Y_tilde = y - (self.coef * ml_m) - ml_g - D_tilde = d - ml_m + raise NotImplementedError - return Y_tilde, D_tilde \ No newline at end of file + return deriv \ No newline at end of file From 6f556e02caaf3e39e8b11e2655361178305ca183 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 27 Aug 2025 22:02:40 +0200 Subject: [PATCH 08/48] Reverted from ensure_all_finite to force_all_finite --- doubleml/plm/logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index 3e04d15d5..a716497d2 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -215,9 +215,9 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # TODO: How to deal with smpls_inner? 
x, y = check_X_y(self._dml_data.x, self._dml_data.y, - ensure_all_finite=False) + force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, - ensure_all_finite=False) + force_all_finite=False) x_d_concat = np.hstack((d.reshape(-1,1), x)) m_external = external_predictions['ml_m'] is not None M_external = external_predictions['ml_M'] is not None From 3a332bf91e97af94780805130f21b7688238d29d Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 28 Aug 2025 15:59:29 +0200 Subject: [PATCH 09/48] Fixes to instrument score --- doubleml/plm/logistic.py | 53 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index a716497d2..e19fc1e40 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -249,7 +249,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa weights.append( M_hat['preds_inner'][i][train] * (1-M_hat['preds_inner'][i][train])) m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models, weights=weights) + return_models=return_models, sample_weights=weights) elif self.score == 'nuisance_space': filtered_smpls = [] @@ -288,29 +288,29 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) - r_legacy = np.zeros_like(y) - smpls_inner = self.__smpls__inner - M_hat_l = {} - a_hat_l = {} - M_hat_l['preds_inner'] = [] - M_hat_l['preds'] = np.full_like(y, np.nan) - a_hat_l['preds_inner'] = [] - a_hat_l['preds'] = np.full_like(y, np.nan) - for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - test = smpls_single_split[1] - train = smpls_single_split[0] - # r_legacy[test] = - Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], 
- self._learner['ml_m'], self._learner['ml_M'], - smpls_single_split, smpls_double_split, y, x, d, - x_d_concat, n_jobs_cv) - Mtemp = np.full_like(y, np.nan) - Mtemp[train] = Mleg - Atemp = np.full_like(y, np.nan) - Atemp[train] = aleg - M_hat_l['preds_inner'].append(Mtemp) - a_hat_l['preds_inner'].append(Atemp) - a_hat_l['preds'][test] = a_nf_leg + # r_legacy = np.zeros_like(y) + # smpls_inner = self.__smpls__inner + # M_hat_l = {} + # a_hat_l = {} + # M_hat_l['preds_inner'] = [] + # M_hat_l['preds'] = np.full_like(y, np.nan) + # a_hat_l['preds_inner'] = [] + # a_hat_l['preds'] = np.full_like(y, np.nan) + # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + # test = smpls_single_split[1] + # train = smpls_single_split[0] + # # r_legacy[test] = + # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], + # self._learner['ml_m'], self._learner['ml_M'], + # smpls_single_split, smpls_double_split, y, x, d, + # x_d_concat, n_jobs_cv) + # Mtemp = np.full_like(y, np.nan) + # Mtemp[train] = Mleg + # Atemp = np.full_like(y, np.nan) + # Atemp[train] = aleg + # M_hat_l['preds_inner'].append(Mtemp) + # a_hat_l['preds_inner'].append(Atemp) + # a_hat_l['preds'][test] = a_nf_leg #r_hat['preds'] = r_legacy @@ -484,7 +484,7 @@ def _compute_score(self, psi_elements, coef): score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) elif self.score == 'instrument': - score = (psi_elements["y"] - np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] + score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] else: raise NotImplementedError @@ -495,7 +495,8 @@ def _compute_score_deriv(self, psi_elements, coef, inds=None): deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] deriv = 
psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 elif self.score == 'instrument': - deriv = - psi_elements["d"] * np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"]) * psi_elements["d_tilde"] + expit = scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"]) + deriv = - psi_elements["d"] * expit * (1-expit) * psi_elements["d_tilde"] else: raise NotImplementedError From b41a773c92a3d0aab04e76bfdb7d1343ff129122 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 3 Sep 2025 14:52:48 +0200 Subject: [PATCH 10/48] Added option for exception on convergence failure --- doubleml/double_ml_score_mixins.py | 44 ++++++++++++++++++------------ doubleml/plm/logistic.py | 4 ++- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doubleml/double_ml_score_mixins.py b/doubleml/double_ml_score_mixins.py index 57dd6e623..b0c69c25e 100644 --- a/doubleml/double_ml_score_mixins.py +++ b/doubleml/double_ml_score_mixins.py @@ -86,6 +86,7 @@ class NonLinearScoreMixin: _score_type = "nonlinear" _coef_start_val = np.nan _coef_bounds = None + _error_on_convergence_failure = False @property @abstractmethod @@ -149,12 +150,14 @@ def score_deriv(theta): theta_hat = root_res.root if not root_res.converged: score_val = score(theta_hat) - warnings.warn( - "Could not find a root of the score function.\n " - f"Flag: {root_res.flag}.\n" - f"Score value found is {score_val} " - f"for parameter theta equal to {theta_hat}." 
- ) + msg = ('Could not find a root of the score function.\n ' + f'Flag: {root_res.flag}.\n' + f'Score value found is {score_val} ' + f'for parameter theta equal to {theta_hat}.') + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds) @@ -182,16 +185,19 @@ def score_squared(theta): else: score_val_sign = np.sign(score(alt_coef_start)) if score_val_sign > 0: + theta_hat_array, score_val, _ = fmin_l_bfgs_b( score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( - "Could not find a root of the score function.\n " - f"Minimum score value found is {score_val} " - f"for parameter theta equal to {theta_hat}.\n " - "No theta found such that the score function evaluates to a negative value." - ) + msg = ('Could not find a root of the score function.\n ' + f'Minimum score value found is {score_val} ' + f'for parameter theta equal to {theta_hat}.\n ' + 'No theta found such that the score function evaluates to a negative value.') + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: def neg_score(theta): @@ -202,11 +208,13 @@ def neg_score(theta): neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( - "Could not find a root of the score function. " - f"Maximum score value found is {-1 * neg_score_val} " - f"for parameter theta equal to {theta_hat}. " - "No theta found such that the score function evaluates to a positive value." - ) + msg = ('Could not find a root of the score function. ' + f'Maximum score value found is {-1*neg_score_val} ' + f'for parameter theta equal to {theta_hat}. 
' + 'No theta found such that the score function evaluates to a positive value.') + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) return theta_hat diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index e19fc1e40..9e1bb8750 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -115,13 +115,15 @@ def __init__(self, n_folds_inner=5, n_rep=1, score='nuisance_space', - draw_sample_splitting=True): + draw_sample_splitting=True, + error_on_convergence_failure=False,): self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) + self._error_on_convergence_failure = error_on_convergence_failure self._coef_bounds = (-1e-2, 1e2) self._coef_start_val = 1.0 From c434667ec8a668ca271d6639194807fda1ca26f6 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Mon, 29 Sep 2025 10:38:13 -0700 Subject: [PATCH 11/48] Added unbalanced dataset option, bug fixes --- doubleml/datasets.py | 34 ++++++++++----- doubleml/plm/logistic.py | 80 ++++++++++++++++++++++++++++++++++- doubleml/utils/_estimation.py | 28 ++++++------ 3 files changed, 115 insertions(+), 27 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index 629a033aa..dad8b9f79 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1651,7 +1651,7 @@ def treatment_effect(d, scale=15): return resul_dict -def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): +def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, **kwargs): """ Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), designed for use in double/debiased machine learning applications. 
@@ -1705,16 +1705,28 @@ def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLD """ - def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 1 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) + if balanced_r0: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 1 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + else: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 3 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.5 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) def a_0(X): return 2 / (1 + np.exp(X[:, 0])) + \ diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index 9e1bb8750..7314debd7 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -143,9 +143,11 @@ def __init__(self, if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) self._learner['ml_a'] = ml_a + self._ml_a_provided = True else: self._learner['ml_a'] = clone(ml_m) ml_a_is_classifier = ml_m_is_classifier + self._ml_a_provided = False self._predict_method = {'ml_t': 'predict', 'ml_M': 'predict_proba'} @@ -449,8 +451,82 @@ def _score_element_names(self): def _sensitivity_element_est(self, preds): pass - def _nuisance_tuning(self): - pass + def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, + search_mode, n_iter_randomized_search): + # TODO: test + x, y = 
check_X_y(self._dml_data.x, self._dml_data.y, + force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, + force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + + if scoring_methods is None: + scoring_methods = {'ml_m': None, + 'ml_M': None, + 'ml_a': None, + 'ml_t': None} + + train_inds = [train_index for (train_index, _) in smpls] + M_tune_res = _dml_tune(y, x_d_concat, train_inds, + self._learner['ml_M'], param_grids['ml_M'], scoring_methods['ml_M'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + if self.score == 'nuisance_space': + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + filtered_smpls.append(train_filtered) + filtered_train_inds = [train_index for (train_index, _) in smpls] + elif self.score == 'instrument': + filtered_train_inds = train_inds + else: + raise NotImplementedError + m_tune_res = _dml_tune(d, x, filtered_train_inds, + self._learner['ml_m'], param_grids['ml_m'], scoring_methods['ml_m'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + a_tune_res = _dml_tune(d, x, train_inds, + self._learner['ml_a'], param_grids['ml_a'], scoring_methods['ml_a'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + M_best_params = [xx.best_params_ for xx in M_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + a_best_params = [xx.best_params_ for xx in a_tune_res] + + # Create targets for tuning ml_t + M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=M_best_params, method=self._predict_method['ml_M'])) + + W_inner = [] + for i, (train, test) in enumerate(smpls): + M_iteration = M_hat['preds_inner'][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + + t_tune_res = _dml_tune(W_inner, x, train_inds, + self._learner['ml_t'], 
param_grids['ml_t'], scoring_methods['ml_t'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + t_best_params = [xx.best_params_ for xx in t_tune_res] + + + + # Update params and tune_res to include ml_a and ml_t + params = {'ml_M': M_best_params, + 'ml_m': m_best_params, + 'ml_a': a_best_params, + 'ml_t': t_best_params} + tune_res = {'M_tune': M_tune_res, + 'm_tune': m_tune_res, + 'a_tune': a_tune_res, + 't_tune': t_tune_res} + + res = {'params': params, + 'tune_res': tune_res} + + return res @property def __smpls__inner(self): diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 6029dfd97..8086322a8 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -149,25 +149,25 @@ def _dml_cv_predict(estimator, x, y, smpls=None, return res -def _dml_tune( - y, x, train_inds, learner, param_grid, scoring_method, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search -): +def _dml_tune(y, x, train_inds, + learner, param_grid, scoring_method, + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search, fold_specific_target=False): tune_res = list() - for train_index in train_inds: + for i, train_index in enumerate(train_inds): tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True) if search_mode == "grid_search": g_grid_search = GridSearchCV(learner, param_grid, scoring=scoring_method, cv=tune_resampling, n_jobs=n_jobs_cv) else: - assert search_mode == "randomized_search" - g_grid_search = RandomizedSearchCV( - learner, - param_grid, - scoring=scoring_method, - cv=tune_resampling, - n_jobs=n_jobs_cv, - n_iter=n_iter_randomized_search, - ) - tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index])) + assert search_mode == 'randomized_search' + g_grid_search = RandomizedSearchCV(learner, param_grid, + scoring=scoring_method, + cv=tune_resampling, + n_jobs=n_jobs_cv, + n_iter=n_iter_randomized_search) + if fold_specific_target: + 
tune_res.append(g_grid_search.fit(x[train_index, :], y[i])) + else: + tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index])) return tune_res From 443d82ddcfa530f8151e47ad467bb17cddb2b0ed Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Tue, 7 Oct 2025 15:42:38 -0700 Subject: [PATCH 12/48] Added binary treatment dataset, fixed bug for model check --- doubleml/datasets.py | 11 +++++++++-- doubleml/plm/logistic.py | 3 +-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index dad8b9f79..b555b3bca 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1651,7 +1651,7 @@ def treatment_effect(d, scale=15): return resul_dict -def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, **kwargs): +def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): """ Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), designed for use in double/debiased machine learning applications. 
@@ -1745,7 +1745,14 @@ def a_0(X): x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) np.clip(x, -2, 2, out=x) - d = a_0(x) + if treatment == "continuous": + d = a_0(x) + elif treatment == "binary": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont - d_cont.mean())) + elif treatment == "binary_unbalanced": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont)) p = expit(alpha * d[:] + r_0(x)) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index 7314debd7..3e21cbf0c 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -1,7 +1,6 @@ import inspect import numpy as np -from torch.sparse import sampled_addmm from doubleml.utils._estimation import ( _dml_cv_predict, @@ -134,7 +133,7 @@ def __init__(self, _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) - if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + if np.array_equal(np.unique(obj_dml_data.d), [0, 1]): ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) else: ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=False) From 774c74dfb98d7cb3b461bd962a0f37b74fce3257 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Tue, 7 Oct 2025 15:45:10 -0700 Subject: [PATCH 13/48] Adjusted dataset balancing --- doubleml/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index b555b3bca..6d9acfc88 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1723,9 +1723,9 @@ def r_0(X): 0.1 * X[:, 5] ** 3 + \ -0.5 * np.sin(X[:, 6]) ** 2 + \ 0.5 * np.cos(X[:, 7]) + \ - 3 / (1 + X[:, 8] ** 2) + \ + 4 / (1 + X[:, 8] ** 2) + \ -1 / (1 + np.exp(X[:, 9])) + \ - 0.5 * np.where(X[:, 10] > 0, 1, 0) + \ + 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ -0.25 * np.where(X[:, 12] > 0, 1, 0) def a_0(X): From 
9695820f2cefa6bd1b63659fcca96e9f6f6a805a Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 13:54:16 -0700 Subject: [PATCH 14/48] Renamed Logistic to LPLR Added test set-up --- doubleml/__init__.py | 4 +- doubleml/plm/__init__.py | 3 +- doubleml/plm/datasets/__init__.py | 2 + doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 139 +++++++++ doubleml/plm/{logistic.py => lplr.py} | 213 +++---------- doubleml/plm/tests/_utils_lplr_manual.py | 335 +++++++++++++++++++++ doubleml/plm/tests/test_lplr.py | 105 +++++++ doubleml/plm/tests/test_lplr_exceptions.py | 293 ++++++++++++++++++ doubleml/plm/tests/test_lplr_tune.py | 227 ++++++++++++++ 9 files changed, 1155 insertions(+), 166 deletions(-) create mode 100644 doubleml/plm/datasets/dgp_lplr_LZZ2020.py rename doubleml/plm/{logistic.py => lplr.py} (69%) create mode 100644 doubleml/plm/tests/_utils_lplr_manual.py create mode 100644 doubleml/plm/tests/test_lplr.py create mode 100644 doubleml/plm/tests/test_lplr_exceptions.py create mode 100644 doubleml/plm/tests/test_lplr_tune.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index ba59a07e0..7c8ead970 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,7 +13,7 @@ from .irm.pq import DoubleMLPQ from .irm.qte import DoubleMLQTE from .irm.ssm import DoubleMLSSM -from doubleml.plm.logistic import DoubleMLLogit +from doubleml.plm.lplr import DoubleMLLPLR from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR @@ -45,7 +45,7 @@ "DoubleMLBLP", "DoubleMLPolicyTree", "DoubleMLSSM", - "DoubleMLLogit", + "DoubleMLLPLR", ] __version__ = importlib.metadata.version("doubleml") diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index 88ff26a8a..37262ed93 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -4,9 +4,10 @@ from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR +from .lplr import DoubleMLLPLR __all__ = [ "DoubleMLPLR", "DoubleMLPLIV", - "DoubleMLLogit" + "DoubleMLLPLR" ] diff --git 
a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index b2bb7df0e..5f433ae79 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -8,6 +8,7 @@ from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 from .dgp_plr_turrell2018 import make_plr_turrell2018 +from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 __all__ = [ "make_plr_CCDDHNR2018", @@ -15,5 +16,6 @@ "make_confounded_plr_data", "make_pliv_CHS2015", "make_pliv_multiway_cluster_CKMS2021", + "make_lplr_LZZ2020", "_make_pliv_data", ] diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py new file mode 100644 index 000000000..007e2b918 --- /dev/null +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -0,0 +1,139 @@ +import numpy as np +import pandas as pd +from scipy.special import expit + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + +def make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): + """ + Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), + designed for use in double/debiased machine learning applications. + + The data generating process is defined as follows: + + - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). + - Treatment \( d_i = a_0(x_i) \). + - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. + - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). + + The nuisance functions are defined as: + + .. 
math:: + + a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ + &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ + + r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + dim_x : int + Number of covariates. + alpha : float + Value of the causal parameter. + return_type : str + Determines the return format. One of: + + - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. + - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. + - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). + balanced_r0 : bool, default True + If True, uses the "balanced" r_0 specification (smaller magnitude / more balanced + heterogeneity). If False, uses an "unbalanced" r_0 specification with larger + share of Y=0. + treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous' + Determines how the treatment d is generated from a_0(x): + - 'continuous': d = a_0(x) (continuous treatment). + - 'binary': d ~ Bernoulli( sigmoid(a_0(x) - mean(a_0(x))) ) . + - 'binary_unbalanced': d ~ Bernoulli( sigmoid(a_0(x)) ). + + **kwargs + Optional keyword arguments (currently unused in this implementation). + + Returns + ------- + Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] + The generated data in the specified format. + + References + ---------- + Liu, Molei, Yi Zhang, and Doudou Zhou. 2021. + "Double/Debiased Machine Learning for Logistic Partially Linear Model." + The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019. 
+ + """ + + if balanced_r0: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 1 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + else: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 4 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + + def a_0(X): + return 2 / (1 + np.exp(X[:, 0])) + \ + -2 / (1 + np.exp(X[:, 1])) + \ + 1 * np.sin(X[:, 2]) + \ + 1 * np.cos(X[:, 3]) + \ + 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ + -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ + 0.2 * X[:, 6] * X[:, 7] + \ + -0.2 * X[:, 8] * X[:, 9] + + + sigma = np.full((dim_x, dim_x), 0.2) + np.fill_diagonal(sigma, 1) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) + np.clip(x, -2, 2, out=x) + + if treatment == "continuous": + d = a_0(x) + elif treatment == "binary": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont - d_cont.mean())) + elif treatment == "binary_unbalanced": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont)) + + p = expit(alpha * d[:] + r_0(x)) + + y = np.random.binomial(1, p) + + if return_type in _array_alias: + return x, y, d, p + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, p)), + columns=x_cols + ['y', 'd', 'p']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') \ No newline at end of file diff --git a/doubleml/plm/logistic.py b/doubleml/plm/lplr.py similarity index 69% rename from 
doubleml/plm/logistic.py rename to doubleml/plm/lplr.py index 3e21cbf0c..1ed00810a 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/lplr.py @@ -29,79 +29,64 @@ -class DoubleMLLogit(NonLinearScoreMixin, DoubleML): - """Double machine learning for partially linear regression models +class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): + """Double machine learning for partially logistic models (binary outcomes) Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. - - ml_r : estimator implementing ``fit()`` and ``predict()`` - A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. - :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`\\ell_0(X) = E[Y|X]`. - - ml_m : estimator implementing ``fit()`` and ``predict()`` - A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. - :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`m_0(X) = E[D|X]`. - For binary treatment variables :math:`D` (with values 0 and 1), a classifier implementing ``fit()`` and - ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, - ``predict_proba()`` is used otherwise ``predict()``. - - ml_g : estimator implementing ``fit()`` and ``predict()`` - A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. - :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function - :math:`g_0(X) = E[Y - D \\theta_0|X]`. - Note: The learner `ml_g` is only required for the score ``'IV-type'``. Optionally, it can be specified and - estimated for callable scores. - - n_folds : int - Number of folds. - Default is ``5``. - - n_rep : int - Number of repetitons for the sample splitting. - Default is ``1``. 
- - score : str or callable - A str (``'nuisance_space'`` or ``'instrument'``) specifying the score function - or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. - Default is ``'partialling out'``. - - draw_sample_splitting : bool - Indicates whether the sample splitting should be drawn during initialization of the object. - Default is ``True``. + obj_dml_data : DoubleMLData + The DoubleMLData object providing the data and variable specification. + The outcome variable y must be binary with values {0, 1}. + ml_M : estimator + Classifier for M_0(D, X) = P[Y = 1 | D, X]. Must implement fit() and predict_proba(). + ml_t : estimator + Regressor for the auxiliary regression used to predict log-odds. Must implement fit() and predict(). + ml_m : estimator + Learner for m_0(X) = E[D | X]. For binary treatments a classifier with predict_proba() is expected; + for continuous treatments a regressor with predict() is expected. + ml_a : estimator, optional + Optional alternative learner for E[D | X]. If not provided, a clone of ml_m is used. + Must support the same prediction interface as ml_m. + n_folds : int, default=5 + Number of outer cross-fitting folds. + n_folds_inner : int, default=5 + Number of inner folds for nested resampling used internally. + n_rep : int, default=1 + Number of repetitions for sample splitting. + score : {'nuisance_space', 'instrument'} or callable, default='nuisance_space' + Score to use. 'nuisance_space' estimates m on subsamples with y=0; 'instrument' uses an instrument-type score. + draw_sample_splitting : bool, default=True + Whether to draw sample splitting during initialization. + error_on_convergence_failure : bool, default=False + If True, raise an error on convergence failure of score. 
Examples -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 - >>> from sklearn.ensemble import RandomForestRegressor + >>> from doubleml.plm.datasets import make_lplr_LZZ2020 + >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> from sklearn.base import clone >>> np.random.seed(3141) - >>> learner = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) - >>> ml_g = learner - >>> ml_m = learner - >>> obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20) - >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> dml_plr_obj.fit().summary - coef std err t P>|t| 2.5 % 97.5 % - d 0.462321 0.04107 11.256983 2.139582e-29 0.381826 0.542816 + >>> ml_t = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20) + >>> dml_lplr_obj = dml.DoubleMLPLR(obj_dml_data, ml_M, ml_t, ml_m) + >>> dml_lplr_obj.fit().summary + coef std err t P>|t| 2.5 % 97.5 % + d 0.480691 0.040533 11.859129 1.929729e-32 0.401247 0.560135 Notes ----- - **Partially linear regression (PLR)** models take the form + **Partially logistic regression (PLR)** models take the form .. math:: - Y = D \\theta_0 + g_0(X) + \\zeta, & &\\mathbb{E}(\\zeta | D,X) = 0, - - D = m_0(X) + V, & &\\mathbb{E}(V | X) = 0, + Y = \\text{expit} ( D \\theta_0 + r_0(X)) where :math:`Y` is the outcome variable and :math:`D` is the policy variable of interest. - The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates, - and :math:`\\zeta` and :math:`V` are stochastic errors. 
+ The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates. """ def __init__(self, @@ -122,13 +107,18 @@ def __init__(self, n_rep, score, draw_sample_splitting) + + # Ensure outcome only contains 0 and 1 (validate early in constructor) + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + raise TypeError("The outcome variable y must be binary with values 0 and 1.") + self._error_on_convergence_failure = error_on_convergence_failure self._coef_bounds = (-1e-2, 1e2) self._coef_start_val = 1.0 self._check_data(self._dml_data) valid_scores = ['nuisance_space', 'instrument'] - _check_score(self.score, valid_scores, allow_callable=True) + _check_score(self.score, valid_scores, allow_callable=False) _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) @@ -208,7 +198,6 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s res['preds'][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] else: res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) - res["preds_inner"] res["preds"] /= len(smpls) res['targets'] = np.copy(y) return res @@ -216,7 +205,6 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): - # TODO: How to deal with smpls_inner? x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, @@ -278,9 +266,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' 'probabilities and not labels are predicted.') - - - if a_external: a_hat = {'preds': external_predictions['ml_a'], 'targets': None, @@ -290,35 +275,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa n_jobs=n_jobs_cv, est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) - - # r_legacy = np.zeros_like(y) - # smpls_inner = self.__smpls__inner - # M_hat_l = {} - # a_hat_l = {} - # M_hat_l['preds_inner'] = [] - # M_hat_l['preds'] = np.full_like(y, np.nan) - # a_hat_l['preds_inner'] = [] - # a_hat_l['preds'] = np.full_like(y, np.nan) - # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - # test = smpls_single_split[1] - # train = smpls_single_split[0] - # # r_legacy[test] = - # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], - # self._learner['ml_m'], self._learner['ml_M'], - # smpls_single_split, smpls_double_split, y, x, d, - # x_d_concat, n_jobs_cv) - # Mtemp = np.full_like(y, np.nan) - # Mtemp[train] = Mleg - # Atemp = np.full_like(y, np.nan) - # Atemp[train] = aleg - # M_hat_l['preds_inner'].append(Mtemp) - # a_hat_l['preds_inner'].append(Atemp) - # a_hat_l['preds'][test] = a_nf_leg - - #r_hat['preds'] = r_legacy - - - W_inner = [] beta = np.zeros(d.shape, dtype=float) @@ -366,74 +322,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds - - def legacy_implementation(self, Yfold: np.ndarray, Xfold: np.ndarray, Afold: np.ndarray, XnotFold: np.ndarray, AnotFold: np.ndarray, - learner, learnerClassifier, smpls_single_split, smpls_double_split, yfull, xfull, afull, x_d_concat, n_jobs_cv, noFolds: int = 5, seed=None, )-> (np.ndarray, np.ndarray, np.ndarray): - - def learn_predict(X, Y, Xpredict, learner, learnerClassifier, fit_args={}): - results = [] - if len(np.unique(Y)) == 2: - learnerClassifier.fit(X, Y, **fit_args) - for x in Xpredict: - 
results.append(learnerClassifier.predict_proba(x)[:, 1]) - else: - learner.fit(X, Y, **fit_args) - for x in Xpredict: - results.append(learner.predict(x)) - return (*results,) - - nFold = len(Yfold) - i = np.remainder(np.arange(nFold), noFolds) - np.random.default_rng(seed).shuffle(i) - - M = np.zeros((nFold)) - a_hat = np.zeros((nFold)) - a_hat_notFold = np.zeros((len(XnotFold))) - M_notFold = np.zeros((len(XnotFold))) - loss = {} - - a_hat_inner = _dml_cv_predict(self._learner['ml_a'], xfull, afull, smpls=smpls_double_split, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'], - return_models=True, smpls_is_partition=True) - _check_finite_predictions(a_hat_inner['preds'], self._learner['ml_a'], 'ml_a', smpls_double_split) - a_hat_notFold = np.full_like(yfull, 0.) - for model in a_hat_inner['models']: - if self._predict_method['ml_a'] == 'predict_proba': - a_hat_notFold[smpls_single_split[1]] += model.predict_proba(xfull[smpls_single_split[1]])[:, 1] - else: - a_hat_notFold[smpls_single_split[1]] += model.predict(xfull[smpls_single_split[1]]) - - M_hat = _dml_cv_predict(self._learner['ml_M'], x_d_concat, yfull, smpls=smpls_double_split, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'], - return_models=True, smpls_is_partition=True) - _check_finite_predictions(M_hat['preds'], self._learner['ml_M'], 'ml_M', smpls_double_split) - - M = M_hat['preds'][~np.isnan(M_hat['preds'])] - a_hat = a_hat_inner['preds'][~np.isnan(a_hat_inner['preds'])] - a_hat_notFold = a_hat_notFold[smpls_single_split[1]] - - np.clip(M, 1e-8, 1 - 1e-8, out=M) -# loss["M"] = compute_loss(Yfold, M) -# loss["a_hat"] = compute_loss(Afold, a_hat) - a_hat_notFold /= noFolds - # M_notFold /= noFolds - np.clip(M_notFold, 1e-8, 1 - 1e-8, out=M_notFold) - - # Obtain preliminary estimate of beta based on M and residual of a - W = scipy.special.logit(M) - A_resid = Afold - a_hat - beta_notFold = sum(A_resid * W) / 
sum(A_resid ** 2) - # print(beta_notFold) - t_notFold, = learn_predict(Xfold, W, [XnotFold], learner, learnerClassifier) - W_notFold = scipy.special.expit(M_notFold) -# loss["t"] = compute_loss(W_notFold, t_notFold) - - - # Compute r based on estimates for W=logit(M), beta and residual of A - r_notFold = t_notFold - beta_notFold * a_hat_notFold - - return M, a_hat, a_hat_notFold #r_notFold #, a_hat_notFold, M_notFold, t_notFold - def _score_elements(self, y, d, r_hat, m_hat): # compute residual d_tilde = d - m_hat @@ -470,12 +358,11 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ self._learner['ml_M'], param_grids['ml_M'], scoring_methods['ml_M'], n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + filtered_train_inds = [] if self.score == 'nuisance_space': - filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] - filtered_smpls.append(train_filtered) - filtered_train_inds = [train_index for (train_index, _) in smpls] + filtered_train_inds.append(train_filtered) elif self.score == 'instrument': filtered_train_inds = train_inds else: @@ -553,7 +440,7 @@ def draw_sample_splitting(self): return self def set_sample_splitting(self): - raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLogit.') + raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLPLR.') def _compute_score(self, psi_elements, coef): @@ -577,4 +464,4 @@ def _compute_score_deriv(self, psi_elements, coef, inds=None): else: raise NotImplementedError - return deriv \ No newline at end of file + return deriv diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py new file mode 100644 index 000000000..f14a1f66c --- /dev/null +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -0,0 +1,335 @@ +import numpy as np +from sklearn.base import clone +from sklearn.model_selection import train_test_split + +from ...tests._utils import fit_predict, 
fit_predict_proba, tune_grid_search +from ...utils._estimation import _predict_zero_one_propensity +from ...utils._propensity_score import _trimm + + +def fit_selection( + y, + x, + d, + z, + s, + learner_g, + learner_pi, + learner_m, + all_smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + normalize_ipw=True, + n_rep=1, + g_d0_params=None, + g_d1_params=None, + pi_params=None, + m_params=None, +): + n_obs = len(y) + + thetas = np.zeros(n_rep) + ses = np.zeros(n_rep) + + all_g_d1_hat = list() + all_g_d0_hat = list() + all_pi_hat = list() + all_m_hat = list() + + all_psi_a = list() + all_psi_b = list() + + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + + g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list = fit_nuisance_selection( + y, + x, + d, + z, + s, + learner_g, + learner_pi, + learner_m, + smpls, + score, + trimming_rule=trimming_rule, + trimming_threshold=trimming_threshold, + g_d0_params=g_d0_params, + g_d1_params=g_d1_params, + pi_params=pi_params, + m_params=m_params, + ) + all_g_d1_hat.append(g_hat_d1_list) + all_g_d0_hat.append(g_hat_d0_list) + all_pi_hat.append(pi_hat_list) + all_m_hat.append(m_hat_list) + + g_hat_d1, g_hat_d0, pi_hat, m_hat = compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls) + + dtreat = d == 1 + dcontrol = d == 0 + psi_a, psi_b = selection_score_elements(dtreat, dcontrol, g_hat_d1, g_hat_d0, pi_hat, m_hat, s, y, normalize_ipw) + + all_psi_a.append(psi_a) + all_psi_b.append(psi_b) + + thetas[i_rep], ses[i_rep] = selection_dml2(psi_a, psi_b) + + theta = np.median(thetas) + se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) + + res = { + "theta": theta, + "se": se, + "thetas": thetas, + "ses": ses, + "all_g_d1_hat": all_g_d1_hat, + "all_g_d0_hat": all_g_d0_hat, + "all_pi_hat": all_pi_hat, + "all_m_hat": all_m_hat, + "all_psi_a": all_psi_a, + "all_psi_b": all_psi_b, + } + + return res + + +def fit_nuisance_selection( + y, + x, + d, + z, 
+ s, + learner_g, + learner_pi, + learner_m, + smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + g_d0_params=None, + g_d1_params=None, + pi_params=None, + m_params=None, +): + ml_g_d1 = clone(learner_g) + ml_g_d0 = clone(learner_g) + ml_pi = clone(learner_pi) + ml_m = clone(learner_m) + + if z is None: + dx = np.column_stack((d, x)) + else: + dx = np.column_stack((d, x, z)) + + if score == "missing-at-random": + pi_hat_list = fit_predict_proba(s, dx, ml_pi, pi_params, smpls, trimming_threshold=trimming_threshold) + + m_hat_list = fit_predict_proba(d, x, ml_m, m_params, smpls) + + train_cond_d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) + g_hat_d1_list = fit_predict(y, x, ml_g_d1, g_d1_params, smpls, train_cond=train_cond_d1_s1) + + train_cond_d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) + g_hat_d0_list = fit_predict(y, x, ml_g_d0, g_d0_params, smpls, train_cond=train_cond_d0_s1) + else: + # initialize empty lists + g_hat_d1_list = [] + g_hat_d0_list = [] + pi_hat_list = [] + m_hat_list = [] + + # create strata for splitting + strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) + + # POTENTIAL OUTCOME Y(1) + for i_fold, _ in enumerate(smpls): + ml_g_d1 = clone(learner_g) + ml_pi = clone(learner_pi) + ml_m = clone(learner_m) + + # set the params for the nuisance learners + if g_d1_params is not None: + ml_g_d1.set_params(**g_d1_params[i_fold]) + if g_d0_params is not None: + ml_g_d0.set_params(**g_d0_params[i_fold]) + if pi_params is not None: + ml_pi.set_params(**pi_params[i_fold]) + if m_params is not None: + ml_m.set_params(**m_params[i_fold]) + + train_inds = smpls[i_fold][0] + test_inds = smpls[i_fold][1] + + # start nested crossfitting + train_inds_1, train_inds_2 = train_test_split( + train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] + ) + + s_train_1 = s[train_inds_1] + dx_train_1 = dx[train_inds_1, :] + + # preliminary propensity score for selection + ml_pi_prelim = clone(ml_pi) 
+ # fit on first part of training set + ml_pi_prelim.fit(dx_train_1, s_train_1) + pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) + + # predictions for small pi in denominator + pi_hat = pi_hat_prelim[test_inds] + + # add selection indicator to covariates + xpi = np.column_stack((x, pi_hat_prelim)) + + # estimate propensity score p using the second training sample + xpi_train_2 = xpi[train_inds_2, :] + d_train_2 = d[train_inds_2] + xpi_test = xpi[test_inds, :] + + ml_m.fit(xpi_train_2, d_train_2) + + m_hat = _predict_zero_one_propensity(ml_m, xpi_test) + + # estimate conditional outcome on second training sample -- treatment + s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] + y_s1_d1_train_2 = y[s1_d1_train_2_indices] + + ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) + + # predict conditional outcome + g_hat_d1 = ml_g_d1.predict(xpi_test) + + # estimate conditional outcome on second training sample -- control + s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] + y_s1_d0_train_2 = y[s1_d0_train_2_indices] + + ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) + + # predict conditional outcome + g_hat_d0 = ml_g_d0.predict(xpi_test) + + m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) + + # append predictions on test sample to final list of predictions + g_hat_d1_list.append(g_hat_d1) + g_hat_d0_list.append(g_hat_d0) + pi_hat_list.append(pi_hat) + m_hat_list.append(m_hat) + + return g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list + + +def compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls): + g_hat_d1 = np.full_like(y, np.nan, dtype="float64") + g_hat_d0 = np.full_like(y, np.nan, dtype="float64") + pi_hat = np.full_like(y, np.nan, dtype="float64") + m_hat = np.full_like(y, np.nan, 
dtype="float64") + + for idx, (_, test_index) in enumerate(smpls): + g_hat_d1[test_index] = g_hat_d1_list[idx] + g_hat_d0[test_index] = g_hat_d0_list[idx] + pi_hat[test_index] = pi_hat_list[idx] + m_hat[test_index] = m_hat_list[idx] + + return g_hat_d1, g_hat_d0, pi_hat, m_hat + + +def selection_score_elements(dtreat, dcontrol, g_d1, g_d0, pi, m, s, y, normalize_ipw): + # psi_a + psi_a = -1 * np.ones_like(y) + + # psi_b + if normalize_ipw: + weight_treat = sum(dtreat) / sum((dtreat * s) / (m * pi)) + weight_control = sum(dcontrol) / sum((dcontrol * s) / ((1 - m) * pi)) + + psi_b1 = weight_treat * ((dtreat * s * (y - g_d1)) / (m * pi)) + g_d1 + psi_b0 = weight_control * ((dcontrol * s * (y - g_d0)) / ((1 - m) * pi)) + g_d0 + + else: + psi_b1 = (dtreat * s * (y - g_d1)) / (m * pi) + g_d1 + psi_b0 = (dcontrol * s * (y - g_d0)) / ((1 - m) * pi) + g_d0 + + psi_b = psi_b1 - psi_b0 + + return psi_a, psi_b + + +def selection_dml2(psi_a, psi_b): + n_obs = len(psi_a) + theta_hat = -np.mean(psi_b) / np.mean(psi_a) + se = np.sqrt(var_selection(theta_hat, psi_a, psi_b, n_obs)) + + return theta_hat, se + + +def var_selection(theta, psi_a, psi_b, n_obs): + J = np.mean(psi_a) + var = 1 / n_obs * np.mean(np.power(np.multiply(psi_a, theta) + psi_b, 2)) / np.power(J, 2) + return var + + +def tune_nuisance_ssm_mar(y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m): + d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) + d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) + + g0_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d0_s1) + g1_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d1_s1) + + dx = np.column_stack((x, d)) + + pi_tune_res = tune_grid_search(s, dx, ml_pi, smpls, param_grid_pi, n_folds_tune) + + m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) + + g0_best_params = [xx.best_params_ for xx in 
g0_tune_res] + g1_best_params = [xx.best_params_ for xx in g1_tune_res] + pi_best_params = [xx.best_params_ for xx in pi_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + return g0_best_params, g1_best_params, pi_best_params, m_best_params + + +def tune_nuisance_ssm_nonignorable( + y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m +): + + train_inds = [tr for (tr, _) in smpls] + + inner0_list, inner1_list = [], [] + for tr in train_inds: + i0, i1 = train_test_split(tr, test_size=0.5, stratify=d[tr] + 2 * s[tr], random_state=42) + inner0_list.append(i0) + inner1_list.append(i1) + + X_dz = np.c_[x, d.reshape(-1, 1), z.reshape(-1, 1)] + pi_tune_res = tune_grid_search(s, X_dz, ml_pi, [(i0, np.array([])) for i0 in inner0_list], param_grid_pi, n_folds_tune) + pi_best_params = [gs.best_params_ for gs in pi_tune_res] + + pi_hat_full = np.full_like(s, np.nan, dtype=float) + for i0, i1, gs in zip(inner0_list, inner1_list, pi_tune_res): + ml_pi_temp = clone(ml_pi) + ml_pi_temp.set_params(**gs.best_params_) + ml_pi_temp.fit(X_dz[i0], s[i0]) + ph = _predict_zero_one_propensity(ml_pi_temp, X_dz) + pi_hat_full[i1] = ph[i1] + + X_pi = np.c_[x, pi_hat_full] + m_tune_res = tune_grid_search(d, X_pi, ml_m, [(i1, np.array([])) for i1 in inner1_list], param_grid_m, n_folds_tune) + m_best_params = [gs.best_params_ for gs in m_tune_res] + + X_pi_d = np.c_[x, d.reshape(-1, 1), pi_hat_full.reshape(-1, 1)] + inner1_d0_s1 = [i1[(d[i1] == 0) & (s[i1] == 1)] for i1 in inner1_list] + inner1_d1_s1 = [i1[(d[i1] == 1) & (s[i1] == 1)] for i1 in inner1_list] + + g0_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d0_s1], param_grid_g, n_folds_tune) + g1_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d1_s1], param_grid_g, n_folds_tune) + + g0_best_params = [gs.best_params_ for gs in g0_tune_res] + g1_best_params = [gs.best_params_ for gs in g1_tune_res] + + return 
g0_best_params, g1_best_params, pi_best_params, m_best_params diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py new file mode 100644 index 000000000..c561d9fe8 --- /dev/null +++ b/doubleml/plm/tests/test_lplr.py @@ -0,0 +1,105 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import LassoCV, LogisticRegressionCV + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_ssm_manual import fit_selection + + +@pytest.fixture(scope="module", params=[[LassoCV(), LogisticRegressionCV(penalty="l1", solver="liblinear")]]) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.01]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_selection_fixture( + generate_data_selection_mar, generate_data_selection_nonignorable, learner, score, trimming_threshold, normalize_ipw +): + n_folds = 3 + + # collect data + np.random.seed(42) + if score == "missing-at-random": + (x, y, d, z, s) = generate_data_selection_mar + else: + (x, y, d, z, s) = generate_data_selection_nonignorable + + ml_g = clone(learner[0]) + ml_pi = clone(learner[1]) + ml_m = clone(learner[1]) + + np.random.seed(42) + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds) + + np.random.seed(42) + if score == "missing-at-random": + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) + dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) + else: + assert score == "nonignorable" + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) + dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, 
score=score) + + np.random.seed(42) + dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) + dml_sel_obj.fit() + + np.random.seed(42) + res_manual = fit_selection( + y, + x, + d, + z, + s, + clone(learner[0]), + clone(learner[1]), + clone(learner[1]), + all_smpls, + score, + trimming_rule="truncate", + trimming_threshold=trimming_threshold, + normalize_ipw=normalize_ipw, + ) + + res_dict = { + "coef": dml_sel_obj.coef[0], + "coef_manual": res_manual["theta"], + "se": dml_sel_obj.se[0], + "se_manual": res_manual["se"], + } + + # sensitivity tests + # TODO + + return res_dict + + +@pytest.mark.ci +def test_dml_selection_coef(dml_selection_fixture): + assert math.isclose(dml_selection_fixture["coef"], dml_selection_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-2) + + +@pytest.mark.ci +def test_dml_selection_se(dml_selection_fixture): + assert math.isclose(dml_selection_fixture["se"], dml_selection_fixture["se_manual"], rel_tol=1e-9, abs_tol=5e-2) diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py new file mode 100644 index 000000000..4361e7c7b --- /dev/null +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -0,0 +1,293 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.base import BaseEstimator +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml import DoubleMLLPLR +from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData +from doubleml.plm.datasets import make_lplr_LZZ2020 + +np.random.seed(3141) +n = 100 +# create test data and basic learners +dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=10) +ml_M = RandomForestClassifier() +ml_t = RandomForestRegressor() +ml_m = RandomForestRegressor() +dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) +dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument") + +@pytest.mark.ci +def test_lplr_exception_data(): 
+ msg = ( + r"The data must be of DoubleMLData type\. .* of type " + r" was passed\." + ) + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m) + + dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=5) + dml_data_nb.data[dml_data_nb.y_col] = dml_data_nb.data[dml_data_nb.y_col] + 1 + dml_data_nb._set_y_z() + with pytest.raises(TypeError, match="The outcome variable y must be binary with values 0 and 1."): + _ = DoubleMLLPLR(dml_data_nb, ml_M, ml_t, ml_m) + + +@pytest.mark.ci +def test_lplr_exception_scores(): + # LPLR valid scores are 'nuisance_space' and 'instrument' + msg = "Invalid score MAR" + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="MAR") + msg = "score should be string. 0 was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) + +@pytest.mark.ci +def test_ssm_exception_resampling(): + msg = "The number of folds must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=1.5) + + msg = "The number of repetitions for the sample splitting must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=1.5) + + msg = "The number of folds must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=0) + + msg = "The number of repetitions for the sample splitting must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=0) + + msg = "draw_sample_splitting must be True or False. Got true." 
+ with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting="true") + + +@pytest.mark.ci +def test_lplr_exception_get_params(): + msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_M or ml_g_t or ml_m or ml_a." + with pytest.raises(ValueError, match=msg): + dml_lplr.get_params("ml_x") + +@pytest.mark.ci +def test_lplr_exception_smpls(): + msg = ( + "Sample splitting not specified. " + r"Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\)." + ) + dml_plr_no_smpls = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting=False) + with pytest.raises(ValueError, match=msg): + _ = dml_plr_no_smpls.smpls + +@pytest.mark.ci +def test_lplr_exception_fit(): + msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(n_jobs_cv="5") + msg = "store_predictions must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(store_predictions=1) + msg = "store_models must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(store_models=1) + +@pytest.mark.ci +def test_lplr_exception_bootstrap(): + dml_lplr_boot = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) + msg = r"Apply fit\(\) before bootstrap\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap() + + dml_lplr_boot.fit() + msg = 'Method must be "Bayes", "normal" or "wild". Got Gaussian.' + with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap(method="Gaussian") + msg = "The number of bootstrap replications must be of int type. 500 of type was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr_boot.bootstrap(n_rep_boot="500") + msg = "The number of bootstrap replications must be positive. 0 was passed." 
+ with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap(n_rep_boot=0) + + +@pytest.mark.ci +def test_lplr_exception_confint(): + dml_lplr_conf = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) + msg = r"Apply fit\(\) before confint\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_conf.confint() + dml_lplr_conf.fit() + + msg = "joint must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr_conf.confint(joint=1) + msg = "The confidence level must be of float type. 5% of type was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr_conf.confint(level="5%") + msg = r"The confidence level must be in \(0,1\). 0.0 was passed." + with pytest.raises(ValueError, match=msg): + dml_lplr_conf.confint(level=0.0) + + msg = r"Apply bootstrap\(\) before confint\(joint=True\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_conf.confint(joint=True) + dml_lplr_conf.bootstrap() + df_lplr_ci = dml_lplr_conf.confint(joint=True) + assert isinstance(df_lplr_ci, pd.DataFrame) + + +@pytest.mark.ci +def test_lplr_exception_set_ml_nuisance_params(): + # invalid learner name + msg = "Invalid nuisance learner g. Valid nuisance learner ml_M or ml_t or ml_m or ml_a." + with pytest.raises(ValueError, match=msg): + dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1}) + # invalid treatment variable + msg = "Invalid treatment variable y. Valid treatment variable d." 
+ with pytest.raises(ValueError, match=msg): + dml_lplr.set_ml_nuisance_params("ml_M", "y", {"alpha": 0.1}) + + +class _DummyNoSetParams: + def fit(self): + pass + + +class _DummyNoGetParams(_DummyNoSetParams): + def set_params(self): + pass + + +class _DummyNoClassifier(_DummyNoGetParams): + def get_params(self): + pass + + def predict_proba(self): + pass + + +class LogisticRegressionManipulatedType(LogisticRegression): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.estimator_type = None + return tags + + +@pytest.mark.ci +@pytest.mark.filterwarnings( + r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning", +) +def test_lplr_exception_learner(): + err_msg_prefix = "Invalid learner provided for ml_t: " + + msg = err_msg_prefix + "provide an instance of a learner instead of a class." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, Lasso, ml_m) + msg = err_msg_prefix + r"BaseEstimator\(\) has no method .fit\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, BaseEstimator(), ml_m) + msg = r"has no method .set_params\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoSetParams(), ml_m) + msg = r"has no method .get_params\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoGetParams(), ml_m) + + # ml_m may not be a classifier when treatment is not binary + msg = ( + r"The ml_m learner LogisticRegression\(\) was identified as classifier " + r"but at least one treatment variable is not binary with values 0 and 1\." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, LogisticRegression()) + + # construct a classifier which is not identifiable as classifier via is_classifier by sklearn + log_reg = LogisticRegressionManipulatedType() + # TODO(0.11) can be removed if the sklearn dependency is bumped to 1.6.0 + log_reg._estimator_type = None + msg = ( + r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) " + r"no classifier\." + ) + with pytest.warns(UserWarning, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, log_reg) + + +@pytest.mark.ci +@pytest.mark.filterwarnings( + r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning", + r"ignore: Learner provided for ml_m is probably invalid.*is \(probably\) no classifier.*:UserWarning", +) +def test_lplr_exception_and_warning_learner(): + # invalid ml_M (must be a classifier with predict_proba) + with pytest.raises(TypeError): + _ = DoubleMLLPLR(dml_data, _DummyNoClassifier(), ml_t, ml_m) + msg = "Invalid learner provided for ml_M: " + r"Lasso\(\) has no method .predict_proba\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, Lasso(), ml_t, ml_m) + + +class LassoWithNanPred(Lasso): + def predict(self, X): + preds = super().predict(X) + n_obs = len(preds) + preds[np.random.randint(0, n_obs, 1)] = np.nan + return preds + + +class LassoWithInfPred(Lasso): + def predict(self, X): + preds = super().predict(X) + n_obs = len(preds) + preds[np.random.randint(0, n_obs, 1)] = np.inf + return preds + + +@pytest.mark.ci +def test_lplr_nan_prediction(): + msg = r"Predictions from learner LassoWithNanPred\(\) for ml_t are not finite." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, LassoWithNanPred(), ml_m).fit() + msg = r"Predictions from learner LassoWithInfPred\(\) for ml_t are not finite." 
+ with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, LassoWithInfPred(), ml_m).fit() + + +@pytest.mark.ci +def test_double_ml_exception_evaluate_learner(): + dml_lplr_obj = DoubleMLLPLR( + dml_data, + ml_M=LogisticRegression(), + ml_t=Lasso(), + ml_m=RandomForestRegressor(), + n_folds=5, + score="nuisance_space", + ) + + msg = r"Apply fit\(\) before evaluate_learners\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_obj.evaluate_learners() + + dml_lplr_obj.fit() + + msg = "metric should be a callable. 'mse' was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr_obj.evaluate_learners(metric="mse") + + msg = ( + r"The learners have to be a subset of \['ml_M', 'ml_t', 'ml_m', 'ml_a'\]\. " + r"Learners \['ml_mu', 'ml_p'\] provided." + ) + with pytest.raises(ValueError, match=msg): + dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"]) + + def eval_fct(y_pred, y_true): + return np.nan + + with pytest.raises(ValueError): + dml_lplr_obj.evaluate_learners(metric=eval_fct) diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py new file mode 100644 index 000000000..0e0fa7bfd --- /dev/null +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -0,0 +1,227 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_lplr_manual import fit_selection, tune_nuisance_ssm_mar, tune_nuisance_ssm_nonignorable + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) +def score(request): + return request.param + 
+ +@pytest.fixture(scope="module", params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {"n_estimators": [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {"C": np.logspace(-2, 2, 10)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_ssm_fixture( + generate_data_selection_mar, + generate_data_selection_nonignorable, + learner_g, + learner_m, + score, + normalize_ipw, + tune_on_folds, +): + par_grid = {"ml_g": get_par_grid(learner_g), "ml_pi": get_par_grid(learner_m), "ml_m": get_par_grid(learner_m)} + n_folds_tune = 4 + n_folds = 2 + + # collect data + np.random.seed(42) + if score == "missing-at-random": + (x, y, d, z, s) = generate_data_selection_mar + else: + (x, y, d, z, s) = generate_data_selection_nonignorable + + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds) + + ml_g = clone(learner_g) + ml_pi = clone(learner_m) + ml_m = clone(learner_m) + + np.random.seed(42) + if score == "missing-at-random": + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) + dml_sel_obj = dml.DoubleMLSSM( + obj_dml_data, + ml_g, + ml_pi, + ml_m, + n_folds=n_folds, + score=score, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + ) + else: + assert score == "nonignorable" + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) + dml_sel_obj = dml.DoubleMLSSM( + obj_dml_data, + ml_g, + ml_pi, + ml_m, + n_folds=n_folds, + score=score, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + ) + + # synchronize the sample splitting + np.random.seed(42) + dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) + + np.random.seed(42) + # tune hyperparameters + tune_res = dml_sel_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, 
return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLSSM) + + dml_sel_obj.fit() + + np.random.seed(42) + smpls = all_smpls[0] + if tune_on_folds: + if score == "missing-at-random": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + smpls, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + elif score == "nonignorable": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + smpls, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + + else: + xx = [(np.arange(len(y)), np.array([]))] + if score == "missing-at-random": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + xx, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + elif score == "nonignorable": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + xx, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + + g0_best_params = g0_best_params * n_folds + g1_best_params = g1_best_params * n_folds + pi_best_params = pi_best_params * n_folds + m_best_params = m_best_params * n_folds + + np.random.seed(42) + res_manual = fit_selection( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + all_smpls, + score, + normalize_ipw=normalize_ipw, + g_d0_params=g0_best_params, + g_d1_params=g1_best_params, + pi_params=pi_best_params, + m_params=m_best_params, + ) + + res_dict = { + "coef": dml_sel_obj.coef[0], + "coef_manual": res_manual["theta"], + "se": 
dml_sel_obj.se[0], + "se_manual": res_manual["se"], + } + + return res_dict + + +@pytest.mark.ci +def test_dml_ssm_coef(dml_ssm_fixture): + assert math.isclose(dml_ssm_fixture["coef"], dml_ssm_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_ssm_se(dml_ssm_fixture): + assert math.isclose(dml_ssm_fixture["se"], dml_ssm_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) From dbfea737dc092c7f3c32531fdaf670b47892a5f6 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 14:19:32 -0700 Subject: [PATCH 15/48] Clean-up of branch --- doubleml/datasets.py | 1772 --------------------------------- doubleml/double_ml.py | 6 - doubleml/double_ml_data.py | 1104 -------------------- doubleml/utils/_estimation.py | 16 - 4 files changed, 2898 deletions(-) delete mode 100644 doubleml/datasets.py delete mode 100644 doubleml/double_ml_data.py diff --git a/doubleml/datasets.py b/doubleml/datasets.py deleted file mode 100644 index 6d9acfc88..000000000 --- a/doubleml/datasets.py +++ /dev/null @@ -1,1772 +0,0 @@ -import pandas as pd -import numpy as np -import warnings - -from scipy.linalg import toeplitz -from scipy.optimize import minimize_scalar -from scipy.special import expit - -from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder -from sklearn.datasets import make_spd_matrix - -from .double_ml_data import DoubleMLData, DoubleMLClusterData - -_array_alias = ['array', 'np.ndarray', 'np.array', np.ndarray] -_data_frame_alias = ['DataFrame', 'pd.DataFrame', pd.DataFrame] -_dml_data_alias = ['DoubleMLData', DoubleMLData] -_dml_cluster_data_alias = ['DoubleMLClusterData', DoubleMLClusterData] - - -def fetch_401K(return_type='DoubleMLData', polynomial_features=False): - """ - Data set on financial wealth and 401(k) plan participation. - - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of - Econometrics, 113(2): 231-263. - - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - url = 'https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta' - raw_data = pd.read_stata(url) - - y_col = 'net_tfa' - d_cols = ['e401'] - x_cols = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown'] - - data = raw_data.copy() - - if polynomial_features: - raise NotImplementedError('polynomial_features os not implemented yet for fetch_401K.') - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError('Invalid return_type.') - - -def fetch_bonus(return_type='DoubleMLData', polynomial_features=False): - """ - Data set on the Pennsylvania Reemployment Bonus experiment. - - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Bilias Y. (2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. - Journal of Applied Econometrics, 15(6): 575-594. 
- - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - url = 'https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat' - raw_data = pd.read_csv(url, sep='\s+') - - ind = (raw_data['tg'] == 0) | (raw_data['tg'] == 4) - data = raw_data.copy()[ind] - data.reset_index(inplace=True) - data['tg'] = data['tg'].replace(4, 1) - data['inuidur1'] = np.log(data['inuidur1']) - - # variable dep as factor (dummy encoding) - dummy_enc = OneHotEncoder(drop='first', categories='auto').fit(data.loc[:, ['dep']]) - xx = dummy_enc.transform(data.loc[:, ['dep']]).toarray() - data['dep1'] = xx[:, 0] - data['dep2'] = xx[:, 1] - - y_col = 'inuidur1' - d_cols = ['tg'] - x_cols = ['female', 'black', 'othrace', - 'dep1', 'dep2', - 'q2', 'q3', 'q4', 'q5', 'q6', - 'agelt35', 'agegt54', 'durable', 'lusd', 'husd'] - - if polynomial_features: - poly = PolynomialFeatures(2, include_bias=False) - data_transf = poly.fit_transform(data[x_cols]) - x_cols = list(poly.get_feature_names_out(x_cols)) - - data_transf = pd.DataFrame(data_transf, columns=x_cols) - data = pd.concat((data[[y_col] + d_cols], data_transf), - axis=1, sort=False) - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError('Invalid return_type.') - - -def _g(x): - return np.power(np.sin(x), 2) - - -def _m(x, nu=0., gamma=1.): - return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) - - -def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): - """ - Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. - The data generating process is defined as - - .. 
math:: - - d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. - The nuisance functions are given by - - .. math:: - - m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, - - g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - alpha : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. - - References - ---------- - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - a_0 = kwargs.get('a_0', 1.) - a_1 = kwargs.get('a_1', 0.25) - s_1 = kwargs.get('s_1', 1.) - - b_0 = kwargs.get('b_0', 1.) - b_1 = kwargs.get('b_1', 0.25) - s_2 = kwargs.get('s_2', 1.) 
- - cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - d = a_0 * x[:, 0] + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) \ - + s_1 * np.random.standard_normal(size=[n_obs, ]) - y = alpha * d + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) \ - + b_1 * x[:, 2] + s_2 * np.random.standard_normal(size=[n_obs, ]) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type='DoubleMLData', **kwargs): - """ - Generates data from a partially linear regression model used in a blog article by Turrell (2018). - The data generating process is defined as - - .. math:: - - d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, - positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. - :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by - - .. math:: - - m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, - - g_0(x_i) &= \\sin(x_i)^2. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\nu=0`, or :math:`\\gamma=1`. - - References - ---------- - Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, - science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ - `_. - """ - nu = kwargs.get('nu', 0.) - gamma = kwargs.get('gamma', 1.) - - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) - G = _g(np.dot(x, b)) - M = _m(np.dot(x, b), nu=nu, gamma=gamma) - d = M + np.random.standard_normal(size=[n_obs, ]) - y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type='DoubleMLData'): - """ - Generates data from a interactive regression (IRM) model. - The data generating process is defined as - - .. 
math:: - - d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i - \\sim \\mathcal{U}(0,1), - - y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and - :math:`c_d` are given by - - .. math:: - - c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = - \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. - - The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni - et al. (2017). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - R2_d : - The value of the parameter :math:`R_d^2`. - R2_y : - The value of the parameter :math:`R_y^2`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - - References - ---------- - Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With - High‐Dimensional Data. Econometrica, 85: 233-298. 
- """ - # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement - v = np.random.uniform(size=[n_obs, ]) - zeta = np.random.standard_normal(size=[n_obs, ]) - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] - b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) - c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) - c_d = np.sqrt(np.pi ** 2 / 3. * R2_d / ((1 - R2_d) * b_sigma_b)) - - xx = np.exp(np.dot(x, np.multiply(beta, c_d))) - d = 1. * ((xx / (1 + xx)) > v) - - y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_iivm_data(n_obs=500, dim_x=20, theta=1., alpha_x=0.2, return_type='DoubleMLData'): - """ - Generates data from a interactive IV regression (IIVM) model. - The data generating process is defined as - - .. math:: - - d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, - - y_i &= \\theta d_i + x_i' \\beta + u_i, - - with :math:`Z \\sim \\text{Bernoulli}(0.5)` and - - .. math:: - - \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). - - The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries - :math:`\\beta_j=\\frac{1}{j^2}`. 
- - The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and - Klaassen (2020). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - alpha_x : - The value of the parameter :math:`\\alpha_x`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion - Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. - """ - # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 - xx = np.random.multivariate_normal(np.zeros(2), - np.array([[1., 0.3], [0.3, 1.]]), - size=[n_obs, ]) - u = xx[:, 0] - v = xx[:, 1] - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] - - z = np.random.binomial(p=0.5, n=1, size=[n_obs, ]) - d = 1. 
* (alpha_x * z + v > 0) - - y = d * theta + np.dot(x, beta) + u - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=x_cols + ['y', 'd', 'z']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, 'z') - else: - raise ValueError('Invalid return_type.') - - -def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type='DoubleMLData'): - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) - G = _g(np.dot(x, b)) - # instrument - z = _m(np.dot(x, b)) + np.random.standard_normal(size=[n_obs, ]) - # treatment - M = _m(gamma_z * z + np.dot(x, b)) - d = M + np.random.standard_normal(size=[n_obs, ]) - y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=x_cols + ['y', 'd', 'z']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, 'z') - else: - raise ValueError('Invalid return_type.') - - -def make_pliv_CHS2015(n_obs, alpha=1., dim_x=200, dim_z=150, return_type='DoubleMLData'): - """ - Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). - The data generating process is defined as - - .. math:: - - z_i &= \\Pi x_i + \\zeta_i, - - d_i &= x_i' \\gamma + z_i' \\delta + u_i, - - y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, - - with - - .. 
math:: - - \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ - 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) - - where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. - :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, - :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` - and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - alpha : - The value of the causal parameter. - dim_x : - The number of covariates. - dim_z : - The number of instruments. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear - Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. 
- """ - assert dim_x >= dim_z - # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf - xx = np.random.multivariate_normal(np.zeros(2), - np.array([[1., 0.6], [0.6, 1.]]), - size=[n_obs, ]) - epsilon = xx[:, 0] - u = xx[:, 1] - - sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), - sigma, - size=[n_obs, ]) - - I_z = np.eye(dim_z) - xi = np.random.multivariate_normal(np.zeros(dim_z), - 0.25 * I_z, - size=[n_obs, ]) - - beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] - gamma = beta - delta = [1 / (k ** 2) for k in range(1, dim_z + 1)] - Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) - - z = np.dot(x, np.transpose(Pi)) + xi - d = np.dot(x, gamma) + np.dot(z, delta) + u - y = alpha * d + np.dot(x, beta) + epsilon - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - z_cols = [f'Z{i + 1}' for i in np.arange(dim_z)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=x_cols + ['y', 'd'] + z_cols) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, z_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1., return_type='DoubleMLClusterData', **kwargs): - """ - Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. - (2021). The data generating process is defined as - - .. math:: - - Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, - - D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, - - Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, - - with - - .. 
math:: - - X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X - + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, - - \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon - + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, - - v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v - + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, - - V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V - + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, - - and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` - where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries - :math:`\\Sigma_{kj} = s_X^{|j-k|}`. - Further - - .. math:: - - \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) - \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ - s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) - - - and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. - - Parameters - ---------- - N : - The number of observations (first dimension). - M : - The number of observations (second dimension). - dim_X : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where - ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s - ``(x, y, d, cluster_vars, z)``. 
- **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, - :math:`s_X = s_{\\varepsilon v} = 0.25`, - or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries - :math:`(\\zeta_{0})_j = 0.5^j`. - - References - ---------- - Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, - Journal of Business & Economic Statistics, - doi: `10.1080/07350015.2021.1895815 `_, - arXiv:`1909.03489 `_. - """ - # additional parameters specifiable via kwargs - pi_10 = kwargs.get('pi_10', 1.0) - - xx = np.arange(1, dim_X + 1) - zeta_0 = kwargs.get('zeta_0', np.power(0.5, xx)) - pi_20 = kwargs.get('pi_20', np.power(0.5, xx)) - xi_0 = kwargs.get('xi_0', np.power(0.5, xx)) - - omega_X = kwargs.get('omega_X', np.array([0.25, 0.25])) - omega_epsilon = kwargs.get('omega_epsilon', np.array([0.25, 0.25])) - omega_v = kwargs.get('omega_v', np.array([0.25, 0.25])) - omega_V = kwargs.get('omega_V', np.array([0.25, 0.25])) - - s_X = kwargs.get('s_X', 0.25) - s_epsilon_v = kwargs.get('s_epsilon_v', 0.25) - - # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., - # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] - # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] - - alpha_V = np.random.normal(size=(N * M)) - alpha_V_i = np.repeat(np.random.normal(size=N), M) - alpha_V_j = np.tile(np.random.normal(size=M), N) - - cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) - alpha_eps_v = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N * M, ]) - alpha_eps = alpha_eps_v[:, 0] - alpha_v = alpha_eps_v[:, 1] - - alpha_eps_v_i = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N, ]) - alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) - alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) - - alpha_eps_v_j = 
np.random.multivariate_normal(np.zeros(2), cov_mat, size=[M, ]) - alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) - alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) - - cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) - alpha_X = np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N * M, ]) - alpha_X_i = np.repeat(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N, ]), - M, axis=0) - alpha_X_j = np.tile(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[M, ]), - (N, 1)) - - # generate variables - x = (1 - omega_X[0] - omega_X[1]) * alpha_X \ - + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j - - eps = (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps \ - + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j - - v = (1 - omega_v[0] - omega_v[1]) * alpha_v \ - + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j - - V = (1 - omega_V[0] - omega_V[1]) * alpha_V \ - + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j - - z = np.matmul(x, xi_0) + V - d = z * pi_10 + np.matmul(x, pi_20) + v - y = d * theta + np.matmul(x, zeta_0) + eps - - cluster_cols = ['cluster_var_i', 'cluster_var_j'] - cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) - - if return_type in _array_alias: - return x, y, d, cluster_vars.values, z - elif return_type in _data_frame_alias + _dml_cluster_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_X)] - data = pd.concat((cluster_vars, - pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ['Y', 'D', 'Z'])), - axis=1) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLClusterData(data, 'Y', 'D', cluster_cols, x_cols, 'Z') - else: - raise ValueError('Invalid return_type.') - - -def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type='DoubleMLData', **kwargs): - """ - Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). 
- The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let - - .. math:: - - f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4), - - f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4). - - - Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, - :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. - At first define - - .. math:: - - Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0, - - Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d), - - p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))}, - - D &= 1\\{p(W_{ps}) \\ge U\\}, - - where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables, - :math:`U \\sim \\mathcal{U}[0, 1]` is a independent standard uniform - and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`. - The different data generating processes are defined via - - .. math:: - - DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z - - DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X - - DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z - - DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X - - DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 - - DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0, - - such that the last two settings correspond to an experimental setting with treatment probability - of :math:`P(D=1) = \\frac{1}{2}.` - For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`. 
- For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``. - Then the outcome will be defined to be - - .. math:: - - Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0), - - where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`. - The true average treatment effect on the treated is zero for all data generating processes. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dgp_type : - The DGP to be used. Default value is ``1`` (integer). - cross_sectional_data : - Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)`` - or ``(x, y, d, t)``. - **kwargs - Additional keyword arguments to set non-default values for the parameter - :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
- """ - xi = kwargs.get('xi', 0.75) - c = kwargs.get('c', 0.0) - lambda_t = kwargs.get('lambda_t', 0.5) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - dim_x = 4 - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs) - epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2]) - - if dgp_type == 1: - features_ps = z - features_reg = z - elif dgp_type == 2: - features_ps = x - features_reg = z - elif dgp_type == 3: - features_ps = z - features_reg = x - elif dgp_type == 4: - features_ps = x - features_reg = x - elif dgp_type == 5: - features_ps = None - features_reg = z - elif dgp_type == 6: - features_ps = None - features_reg = x - else: - raise ValueError('The dgp_type is not valid.') - - # treatment and propensities - is_experimental = (dgp_type == 5) or (dgp_type == 6) - if is_experimental: - # Set D to be experimental - p = 0.5 * np.ones(n_obs) - else: - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (p >= u) - - # potential outcomes - nu = np.random.normal(loc=d * f_reg(features_reg), scale=1, size=n_obs) - y0 = f_reg(features_reg) + nu + epsilon_0 - y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0] - y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1] - y1 = d * y1_d1 + (1 - d) * y1_d0 - - if not cross_sectional_data: - y = 
y1 - y0 - - if return_type in _array_alias: - return z, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d)), - columns=z_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', z_cols) - else: - raise ValueError('Invalid return_type.') - - else: - u_t = np.random.uniform(low=0, high=1, size=n_obs) - t = 1.0 * (u_t <= lambda_t) - y = t * y1 + (1 - t) * y0 - - if return_type in _array_alias: - return z, y, d, t - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d, t)), - columns=z_cols + ['y', 'd', 't']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', z_cols, t_col='t') - else: - raise ValueError('Invalid return_type.') - - -def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): - """ - Generates counfounded data from an interactive regression model. - - The data generating process is defined as follows (inspired by the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the propensity score as - - .. math:: - - m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A - - where - - .. 
math:: - - p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, - - f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). - - and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. - Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as - - .. math:: - - P(D=1|X) = p(Z). - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) - \\cdot D (Z_5 + 1) + g(Z). - - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be - set via the parameters ``gamma_a`` and ``beta_a``. - - The observed data is given as :math:`W = (Y, D, Z)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, - the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and - in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) - are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``0.0``. - gamma_a : float - Coefficient of the unobserved confounder in the propensity score. - Default is ``0.127``. 
- beta_a : float - Coefficient of the unobserved confounder in the outcome regression. - Default is ``0.58``. - linear : bool - If ``True``, the Z will be set to X, such that the underlying (short) models are linear/logistic. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. - """ - c = 0.0 # the confounding strength is only valid for c=0 - xi = 0.75 - dim_x = kwargs.get('dim_x', 5) - trimming_threshold = kwargs.get('trimming_threshold', 0.01) - var_eps_y = kwargs.get('var_eps_y', 1.0) - - # Specification of main regression function - def f_reg(w): - res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) - return res - - # Specification of prop score function - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - z_tilde_5 = x[:, 4] - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - # error terms and unobserved confounder - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # Choose the features used in the models - if linear: - features_ps = x - features_reg = x - else: - 
features_ps = z - features_reg = z - - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - # compute short and long form of propensity score - m_long = p + gamma_a * a - m_short = p - # check propensity score bounds - if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): - m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) - m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) - warnings.warn(f'Propensity score is close to 0 or 1. ' - f'Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied') - # generate treatment based on long form - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (m_long >= u) - # add treatment heterogeneity - d1x = z[:, 4] + 1 - var_dx = np.var(d * (d1x)) - cov_adx = gamma_a * var_a - # Outcome regression - g_partial_reg = f_reg(features_reg) - # short model - g_short_d0 = g_partial_reg - g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg - g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 - # long model - g_long_d0 = g_partial_reg + beta_a * a - g_long_d1 = theta * d1x + g_partial_reg + beta_a * a - g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 - # Potential outcomes - y_0 = g_long_d0 + eps_y - y_1 = g_long_d1 + eps_y - # Realized outcome - y = d * y_1 + (1.0 - d) * y_0 - # In-sample values for confounding strength - explained_residual_variance = np.square(g_long - g_short) - residual_variance = np.square(y - g_short) - cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) - # compute the Riesz representation - treated_weight = d / np.mean(d) - untreated_weight = (1.0 - d) / np.mean(d) - # Odds ratios - propensity_ratio_long = m_long / (1.0 - m_long) - rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) - rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) - propensity_ratio_short = m_short / (1.0 - m_short) - rr_short_ate = d / m_short - (1.0 
- d) / (1.0 - m_short) - rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) - cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( - 1 / (m_long * (1 - m_long))) - cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) - if (beta_a == 0) | (gamma_a == 0): - rho_ate = 0.0 - rho_atte = 0.0 - else: - rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] - rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] - oracle_values = { - 'g_long': g_long, - 'g_short': g_short, - 'm_long': m_long, - 'm_short': m_short, - 'gamma_a': gamma_a, - 'beta_a': beta_a, - 'a': a, - 'y_0': y_0, - 'y_1': y_1, - 'z': z, - 'cf_y': cf_y, - 'cf_d_ate': cf_d_ate, - 'cf_d_atte': cf_d_atte, - 'rho_ate': rho_ate, - 'rho_atte': rho_atte, - } - res_dict = { - 'x': x, - 'y': y, - 'd': d, - 'oracle_values': oracle_values - } - return res_dict - - -def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): - """ - Generates counfounded data from an partially linear regression model. - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, - where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the treatment as - - .. 
math:: - - D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D - - and with :math:`\\varepsilon \\sim \\mathcal{N}(0,1)`. - Since :math:`A` is independent of :math:`X`, the long and short form of the treatment regression are given as - - .. math:: - - E[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A - - E[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). - - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. - Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample). - - The observed data is given as :math:`W = (Y, D, X)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, the effect :math:`\\theta`, - the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and - the propensity score are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``5.0``. - cf_y : float - Percentage of the residual variation of the outcome explained by latent/confounding variable. - Default is ``0.04``. 
- cf_d : float - Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable. - Default is ``0.04``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. - """ - c = kwargs.get('c', 0.0) - dim_x = kwargs.get('dim_x', 4) - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # get the required impact of the confounder on the propensity score - m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] - - def f_m(gamma_a): - rr_long = eps_d / var_eps_d - rr_short = (gamma_a * a + eps_d) / (gamma_a ** 2 * var_a + var_eps_d) - C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) - return np.square(C2_D / (1 + C2_D) - cf_d) - - gamma_a = minimize_scalar(f_m).x - m_long = m_short + gamma_a * a - d = m_long + eps_d - - # short and long version of g - g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] 
+ z[:, 3]) - - var_d = np.var(d) - - def f_g(beta_a): - g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) - y_diff = eps_y + g_diff - return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) - - beta_a = minimize_scalar(f_g).x - - g_long = theta * d + g_partial_reg + beta_a * a - g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg - - y = g_long + eps_y - - oracle_values = {'g_long': g_long, - 'g_short': g_short, - 'm_long': m_long, - 'm_short': m_short, - 'theta': theta, - 'gamma_a': gamma_a, - 'beta_a': beta_a, - 'a': a, - 'z': z} - - res_dict = {'x': x, - 'y': y, - 'd': d, - 'oracle_values': oracle_values} - - return res_dict - - -def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): - """ - Creates a simple synthetic example for heterogeneous treatment effects. - The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). - - The data is generated as - - .. math:: - - Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i - - D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, - - where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i - \\sim\\mathcal{U}[-1,1]`. - If the treatment is set to be binary, the treatment is generated as - - .. math:: - D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. - - The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support - which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. - Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending - on the dimension of :math:`x`. - - If the heterogeneity is univariate the conditional treatment effect takes the following form - - .. math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), - - whereas for the two-dimensional case the conditional treatment effect is defined as - - .. 
math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). - - Parameters - ---------- - n_obs : int - Number of observations to simulate. - Default is ``200``. - - p : int - Dimension of covariates. - Default is ``30``. - - support_size : int - Number of relevant (confounding) covariates. - Default is ``5``. - - n_x : int - Dimension of the heterogeneity. Can be either ``1`` or ``2``. - Default is ``1``. - - binary_treatment : bool - Indicates whether the treatment is binary. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``data``, ``effects``, ``treatment_effect``. - - """ - # simple input checks - assert n_x in [1, 2], 'n_x must be either 1 or 2.' - assert support_size <= p, 'support_size must be smaller than p.' - assert isinstance(binary_treatment, bool), 'binary_treatment must be a boolean.' - - # define treatment effects - if n_x == 1: - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) - else: - assert n_x == 2 - - # redefine treatment effect - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) - - # Outcome support and coefficients - support_y = np.random.choice(np.arange(p), size=support_size, replace=False) - coefs_y = np.random.uniform(0, 1, size=support_size) - # treatment support and coefficients - support_d = support_y - coefs_d = np.random.uniform(0, 0.3, size=support_size) - - # noise - epsilon = np.random.uniform(-1, 1, size=n_obs) - eta = np.random.uniform(-1, 1, size=n_obs) - - # Generate controls, covariates, treatments and outcomes - x = np.random.uniform(0, 1, size=(n_obs, p)) - # Heterogeneous treatment effects - te = treatment_effect(x) - if binary_treatment: - d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) - else: - d = np.dot(x[:, support_d], coefs_d) + eta - y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon - - # Now we build the dataset - y_df = pd.DataFrame({'y': y}) - d_df = pd.DataFrame({'d': d}) - x_df = pd.DataFrame( - 
data=x, - index=np.arange(x.shape[0]), - columns=[f'X_{i}' for i in range(x.shape[1])] - ) - - data = pd.concat([y_df, d_df, x_df], axis=1) - res_dict = { - 'data': data, - 'effects': te, - 'treatment_effect': treatment_effect} - return res_dict - - -def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleMLData'): - """ - Generates data from a sample selection model (SSM). - The data generating process is defined as - - .. math:: - - y_i &= \\theta d_i + x_i' \\beta d_i + u_i, - - s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, - - d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, - - with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where - :math:`\\Sigma^2_x` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` - :math:`z_i \\sim \\mathcal{N}(0, 1)`, - :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, - :math:`w_i \\sim \\mathcal{N}(0, 1)`. - - - The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, - Huber and Lafférs (2023). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - mar: - Boolean. Indicates whether missingness at random holds. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. 
- - References - ---------- - Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, - Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 - """ - if mar: - sigma = np.array([[1, 0], [0, 1]]) - gamma = 0 - else: - sigma = np.array([[1, 0.8], [0.8, 1]]) - gamma = 1 - - e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - beta = [0.4 / (k ** 2) for k in range(1, dim_x + 1)] - - d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) - z = np.random.randn(n_obs) - s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) - - y = np.dot(x, beta) + theta * d + e[1] - y[s == 0] = 0 - - if return_type in _array_alias: - return x, y, d, z, s - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - if mar: - data = pd.DataFrame(np.column_stack((x, y, d, s)), - columns=x_cols + ['y', 'd', 's']) - else: - data = pd.DataFrame(np.column_stack((x, y, d, z, s)), - columns=x_cols + ['y', 'd', 'z', 's']) - if return_type in _data_frame_alias: - return data - else: - if mar: - return DoubleMLData(data, 'y', 'd', x_cols, None, None, 's') - return DoubleMLData(data, 'y', 'd', x_cols, 'z', None, 's') - else: - raise ValueError('Invalid return_type.') - - -def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): - """ - Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an - underlying continous treatment). - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. 
- Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - A continuous treatment :math:`D_{\\text{cont}}` is generated as - - .. math:: - - D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, - - where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment - effect is defined as - - .. math:: - - \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. - - Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of - :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels - is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. - - The potential outcomes are defined as - - .. math:: - - Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y - - Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), - - where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as - - .. math:: - - Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. - - The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``200``. - - n_levels : int - The number of treatment levels. - Default is ``3``. - - linear : bool - Indicates whether the true underlying regression is linear. - Default is ``False``. - - random_state : int - Random seed for reproducibility. - Default is ``42``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. 
- The oracle values contain the continuous treatment, the level bounds, the potential level, ITE - and the potential outcome without treatment. - - """ - if random_state is not None: - np.random.seed(random_state) - xi = kwargs.get('xi', 0.3) - c = kwargs.get('c', 0.0) - dim_x = kwargs.get('dim_x', 5) - - if not isinstance(n_levels, int): - raise ValueError('n_levels must be an integer.') - if n_levels < 2: - raise ValueError('n_levels must be at least 2.') - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_treatment(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - def treatment_effect(d, scale=15): - return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - if linear: - g = f_reg(x) - m = f_treatment(x, xi) - else: - assert not linear - g = f_reg(z) - m = f_treatment(z, xi) - - cont_d = m + eps_d - level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) - potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 - eta = np.random.uniform(0, 1, size=n_obs) - d = 1.0 * (eta >= 1 / n_levels) * potential_level - - ite = treatment_effect(cont_d) - y0 = g + eps_y - # only treated for d > 0 compared to the baseline - y = ite * (d > 0) + 
y0 - - oracle_values = { - 'cont_d': cont_d, - 'level_bounds': level_bounds, - 'potential_level': potential_level, - 'ite': ite, - 'y0': y0, - } - - resul_dict = { - 'x': x, - 'y': y, - 'd': d, - 'oracle_values': oracle_values - } - - return resul_dict - - -def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): - """ - Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), - designed for use in double/debiased machine learning applications. - - The data generating process is defined as follows: - - - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). - - Treatment \( d_i = a_0(x_i) \). - - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. - - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). - - The nuisance functions are defined as: - - .. math:: - - a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ - &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ - - r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ - &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ - &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) - - Parameters - ---------- - n_obs : int - Number of observations to simulate. - dim_x : int - Number of covariates. - alpha : float - Value of the causal parameter. - return_type : str - Determines the return format. One of: - - - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. - - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. - - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). 
- - **kwargs - Optional keyword arguments (currently unused in this implementation). - - Returns - ------- - Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] - The generated data in the specified format. - - References - ---------- - Liu, Molei, Yi Zhang, and Doudou Zhou. 2021. - "Double/Debiased Machine Learning for Logistic Partially Linear Model." - The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019. - - """ - - if balanced_r0: - def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 1 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) - else: - def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 4 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) - - def a_0(X): - return 2 / (1 + np.exp(X[:, 0])) + \ - -2 / (1 + np.exp(X[:, 1])) + \ - 1 * np.sin(X[:, 2]) + \ - 1 * np.cos(X[:, 3]) + \ - 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ - -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ - 0.2 * X[:, 6] * X[:, 7] + \ - -0.2 * X[:, 8] * X[:, 9] - - - sigma = np.full((dim_x, dim_x), 0.2) - np.fill_diagonal(sigma, 1) - - x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) - np.clip(x, -2, 2, out=x) - - if treatment == "continuous": - d = a_0(x) - elif treatment == "binary": - d_cont = a_0(x) - d = np.random.binomial(1, expit(d_cont - d_cont.mean())) - elif treatment == "binary_unbalanced": - d_cont = a_0(x) - d = np.random.binomial(1, expit(d_cont)) - - p = expit(alpha * d[:] + r_0(x)) - - y = np.random.binomial(1, p) - - if return_type in _array_alias: - return x, y, d, p - elif 
return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, p)), - columns=x_cols + ['y', 'd', 'p']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, p_cols='p') - else: - raise ValueError('Invalid return_type.') diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 1cc6bcf9b..05481bf16 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -585,12 +585,6 @@ def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None, # construct framework for inference self._framework = self.construct_framework() - - - - - - return self def construct_framework(self): diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py deleted file mode 100644 index 612e6b7f0..000000000 --- a/doubleml/double_ml_data.py +++ /dev/null @@ -1,1104 +0,0 @@ -import numpy as np -import pandas as pd -import io - -from abc import ABC, abstractmethod - -from sklearn.utils.validation import check_array, column_or_1d, check_consistent_length -from sklearn.utils import assert_all_finite -from sklearn.utils.multiclass import type_of_target -from .utils._estimation import _assure_2d_array -from .utils._checks import _check_set - - -class DoubleMLBaseData(ABC): - """Base Class Double machine learning data-backends - """ - def __init__(self, - data): - if not isinstance(data, pd.DataFrame): - raise TypeError('data must be of pd.DataFrame type. 
' - f'{str(data)} of type {str(type(data))} was passed.') - if not data.columns.is_unique: - raise ValueError('Invalid pd.DataFrame: ' - 'Contains duplicate column names.') - self._data = data - - def __str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = '================== DoubleMLBaseData Object ==================\n' + \ - '\n------------------ Data summary ------------------\n' + data_summary + \ - '\n------------------ DataFrame info ------------------\n' + df_info - return res - - def _data_summary_str(self): - data_summary = f'No. Observations: {self.n_obs}\n' - return data_summary - - @property - def data(self): - """ - The data. - """ - return self._data - - @property - def all_variables(self): - """ - All variables available in the dataset. - """ - return self.data.columns - - @property - def n_obs(self): - """ - The number of observations. - """ - return self.data.shape[0] - - # TODO: This and the following property does not make sense but the base class DoubleML needs it (especially for the - # multiple treatment variables case) and other things are also build around it, see for example DoubleML._params - @property - def d_cols(self): - return ['theta'] - - @property - def n_treat(self): - """ - The number of treatment variables. - """ - return 1 - - @property - @abstractmethod - def n_coefs(self): - pass - - -class DoubleMLData(DoubleMLBaseData): - """Double machine learning data-backend. - - :class:`DoubleMLData` objects can be initialized from - :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. - - Parameters - ---------- - data : :class:`pandas.DataFrame` - The data. - - y_col : str - The outcome variable. - - d_cols : str or list - The treatment variable(s). - - x_cols : None, str or list - The covariates. 
- If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. - Default is ``None``. - - z_cols : None, str or list - The instrumental variable(s). - Default is ``None``. - - t_col : None or str - The time variable (only relevant/used for DiD Estimators). - Default is ``None``. - - s_col : None or str - The score or selection variable (only relevant/used for RDD or SSM Estimatiors). - Default is ``None``. - - p_cols : None, str or list, optional - The column(s) containing the probabilities of the outcome (only for simulated, binary data). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. 
- - Examples - -------- - >>> from doubleml import DoubleMLData - >>> from doubleml.datasets import make_plr_CCDDHNR2018 - >>> # initialization from pandas.DataFrame - >>> df = make_plr_CCDDHNR2018(return_type='DataFrame') - >>> obj_dml_data_from_df = DoubleMLData(df, 'y', 'd') - >>> # initialization from np.ndarray - >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') - >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) - """ - def __init__(self, - data, - y_col, - d_cols, - x_cols=None, - z_cols=None, - t_col=None, - s_col=None, - p_cols=None, - use_other_treat_as_covariate=True, - force_all_x_finite=True): - DoubleMLBaseData.__init__(self, data) - - self.y_col = y_col - self.d_cols = d_cols - self.z_cols = z_cols - self.t_col = t_col - self.s_col = s_col - self.x_cols = x_cols - self.p_cols = p_cols - self._check_disjoint_sets_y_d_x_z_t_s() - self.use_other_treat_as_covariate = use_other_treat_as_covariate - self.force_all_x_finite = force_all_x_finite - self._binary_treats = self._check_binary_treats() - self._binary_outcome = self._check_binary_outcome() - self._set_y_z_t_s() - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - def __str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = '================== DoubleMLData Object ==================\n' + \ - '\n------------------ Data summary ------------------\n' + data_summary + \ - '\n------------------ DataFrame info ------------------\n' + df_info - return res - - def _data_summary_str(self): - data_summary = f'Outcome variable: {self.y_col}\n' \ - f'Treatment variable(s): {self.d_cols}\n' \ - f'Covariates: {self.x_cols}\n' \ - f'Instrument variable(s): {self.z_cols}\n' - if self.t_col is not None: - data_summary += f'Time variable: {self.t_col}\n' - if self.s_col is not None: - data_summary += f'Score/Selection variable: {self.s_col}\n' - 
data_summary += f'No. Observations: {self.n_obs}\n' - return data_summary - - @classmethod - def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as_covariate=True, - force_all_x_finite=True): - """ - Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. - - Parameters - ---------- - x : :class:`numpy.ndarray` - Array of covariates. - - y : :class:`numpy.ndarray` - Array of the outcome variable. - - d : :class:`numpy.ndarray` - Array of treatment variables. - - z : None or :class:`numpy.ndarray` - Array of instrumental variables. - Default is ``None``. - - t : :class:`numpy.ndarray` - Array of the time variable (only relevant/used for DiD models). - Default is ``None``. - - s : :class:`numpy.ndarray` - Array of the score or selection variable (only relevant/used for RDD and SSM models). - Default is ``None``. - - p : None or :class:`numpy.ndarray` - Array of the probabilities of the outcome (only for simulated, binary data). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. 
- - Examples - -------- - >>> from doubleml import DoubleMLData - >>> from doubleml.datasets import make_plr_CCDDHNR2018 - >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') - >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) - """ - if isinstance(force_all_x_finite, str): - if force_all_x_finite != 'allow-nan': - raise ValueError("Invalid force_all_x_finite " + force_all_x_finite + ". " + - "force_all_x_finite must be True, False or 'allow-nan'.") - elif not isinstance(force_all_x_finite, bool): - raise TypeError("Invalid force_all_x_finite. " + - "force_all_x_finite must be True, False or 'allow-nan'.") - - x = check_array(x, ensure_2d=False, allow_nd=False, - force_all_finite=force_all_x_finite) - d = check_array(d, ensure_2d=False, allow_nd=False) - y = column_or_1d(y, warn=True) - - x = _assure_2d_array(x) - d = _assure_2d_array(d) - - y_col = 'y' - if z is None: - check_consistent_length(x, y, d) - z_cols = None - else: - z = check_array(z, ensure_2d=False, allow_nd=False) - z = _assure_2d_array(z) - check_consistent_length(x, y, d, z) - if z.shape[1] == 1: - z_cols = ['z'] - else: - z_cols = [f'z{i + 1}' for i in np.arange(z.shape[1])] - - if t is None: - t_col = None - else: - t = column_or_1d(t, warn=True) - check_consistent_length(x, y, d, t) - t_col = 't' - - if s is None: - s_col = None - else: - s = column_or_1d(s, warn=True) - check_consistent_length(x, y, d, s) - s_col = 's' - - - if p is None: - p_cols = None - else: - if p.shape[1] == 1: - p_cols = ['p'] - else: - p_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] - - if d.shape[1] == 1: - d_cols = ['d'] - else: - d_cols = [f'd{i+1}' for i in np.arange(d.shape[1])] - - x_cols = [f'X{i+1}' for i in np.arange(x.shape[1])] - - # basline version with features, outcome and treatments - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + [y_col] + d_cols) - - if z is not None: - df_z = pd.DataFrame(z, columns=z_cols) - data = pd.concat([data, df_z], axis=1) - - if t 
is not None: - data[t_col] = t - - if s is not None: - data[s_col] = s - - if p is not None: - data[p_cols] = p - - return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, p_cols, use_other_treat_as_covariate, force_all_x_finite) - - @property - def x(self): - """ - Array of covariates; - Dynamic! May depend on the currently set treatment variable; - To get an array of all covariates (independent of the currently set treatment variable) - call ``obj.data[obj.x_cols].values``. - """ - return self._X.values - - @property - def y(self): - """ - Array of outcome variable. - """ - return self._y.values - - @property - def d(self): - """ - Array of treatment variable; - Dynamic! Depends on the currently set treatment variable; - To get an array of all treatment variables (independent of the currently set treatment variable) - call ``obj.data[obj.d_cols].values``. - """ - return self._d.values - - @property - def z(self): - """ - Array of instrumental variables. - """ - if self.z_cols is not None: - return self._z.values - else: - return None - - @property - def t(self): - """ - Array of time variable. - """ - if self.t_col is not None: - return self._t.values - else: - return None - - @property - def s(self): - """ - Array of score or selection variable. - """ - if self.s_col is not None: - return self._s.values - else: - return None - - @property - def p_cols(self): - """ - The column(s) containing the probabilities of the outcome (only for simulated data). - """ - return self._p_cols - - @p_cols.setter - def p_cols(self, value): - if value is not None: - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The probability column(s) p_cols must be of str or list type (or None). 
' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid probability column(s) p_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid probability column(s) p_cols. ' - 'At least one probability column is not a data column.') - self._p_cols = value - else: - self._p_cols = None - - @property - def p(self): - """ - Array of probabilities of the outcome (only for simulated data). - """ - if self.p_cols is not None: - return self._p.values - else: - return None - - @property - def n_treat(self): - """ - The number of treatment variables. - """ - return len(self.d_cols) - - @property - def n_coefs(self): - """ - The number of coefficients to be estimated. - """ - return self.n_treat - - @property - def n_instr(self): - """ - The number of instruments. - """ - if self.z_cols is not None: - n_instr = len(self.z_cols) - else: - n_instr = 0 - return n_instr - - @property - def binary_treats(self): - """ - Series with logical(s) indicating whether the treatment variable(s) are binary with values 0 and 1. - """ - return self._binary_treats - - @property - def binary_outcome(self): - """ - Logical indicating whether the outcome variable is binary with values 0 and 1. - """ - return self._binary_outcome - - @property - def x_cols(self): - """ - The covariates. - """ - return self._x_cols - - @x_cols.setter - def x_cols(self, value): - reset_value = hasattr(self, '_x_cols') - if value is not None: - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The covariates x_cols must be of str or list type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid covariates x_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid covariates x_cols. 
' - 'At least one covariate is no data column.') - assert set(value).issubset(set(self.all_variables)) - self._x_cols = value - else: - excluded_cols = set.union({self.y_col}, set(self.d_cols)) - if (self.z_cols is not None): - excluded_cols = set.union(excluded_cols, set(self.z_cols)) - for col in [self.t_col, self.s_col]: - col = _check_set(col) - excluded_cols = set.union(excluded_cols, col) - self._x_cols = [col for col in self.data.columns if col not in excluded_cols] - if reset_value: - self._check_disjoint_sets() - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - @property - def d_cols(self): - """ - The treatment variable(s). - """ - return self._d_cols - - @d_cols.setter - def d_cols(self, value): - reset_value = hasattr(self, '_d_cols') - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The treatment variable(s) d_cols must be of str or list type. ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid treatment variable(s) d_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid treatment variable(s) d_cols. ' - 'At least one treatment variable is no data column.') - self._d_cols = value - if reset_value: - self._check_disjoint_sets() - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - @property - def y_col(self): - """ - The outcome variable. - """ - return self._y_col - - @y_col.setter - def y_col(self, value): - reset_value = hasattr(self, '_y_col') - if not isinstance(value, str): - raise TypeError('The outcome variable y_col must be of str type. ' - f'{str(value)} of type {str(type(value))} was passed.') - if value not in self.all_variables: - raise ValueError('Invalid outcome variable y_col. 
' - f'{value} is no data column.') - self._y_col = value - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def z_cols(self): - """ - The instrumental variable(s). - """ - return self._z_cols - - @z_cols.setter - def z_cols(self, value): - reset_value = hasattr(self, '_z_cols') - if value is not None: - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The instrumental variable(s) z_cols must be of str or list type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid instrumental variable(s) z_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid instrumental variable(s) z_cols. ' - 'At least one instrumental variable is no data column.') - self._z_cols = value - else: - self._z_cols = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def t_col(self): - """ - The time variable. - """ - return self._t_col - - @t_col.setter - def t_col(self, value): - reset_value = hasattr(self, '_t_col') - if value is not None: - if not isinstance(value, str): - raise TypeError('The time variable t_col must be of str type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if value not in self.all_variables: - raise ValueError('Invalid time variable t_col. ' - f'{value} is no data column.') - self._t_col = value - else: - self._t_col = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def s_col(self): - """ - The score or selection variable. - """ - return self._s_col - - @s_col.setter - def s_col(self, value): - reset_value = hasattr(self, '_s_col') - if value is not None: - if not isinstance(value, str): - raise TypeError('The score or selection variable s_col must be of str type (or None). 
' - f'{str(value)} of type {str(type(value))} was passed.') - if value not in self.all_variables: - raise ValueError('Invalid score or selection variable s_col. ' - f'{value} is no data column.') - self._s_col = value - else: - self._s_col = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def use_other_treat_as_covariate(self): - """ - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - """ - return self._use_other_treat_as_covariate - - @use_other_treat_as_covariate.setter - def use_other_treat_as_covariate(self, value): - reset_value = hasattr(self, '_use_other_treat_as_covariate') - if not isinstance(value, bool): - raise TypeError('use_other_treat_as_covariate must be True or False. ' - f'Got {str(value)}.') - self._use_other_treat_as_covariate = value - if reset_value: - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - @property - def force_all_x_finite(self): - """ - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - """ - return self._force_all_x_finite - - @force_all_x_finite.setter - def force_all_x_finite(self, value): - reset_value = hasattr(self, '_force_all_x_finite') - if isinstance(value, str): - if value != 'allow-nan': - raise ValueError("Invalid force_all_x_finite " + value + ". " + - "force_all_x_finite must be True, False or 'allow-nan'.") - elif not isinstance(value, bool): - raise TypeError("Invalid force_all_x_finite. 
" + - "force_all_x_finite must be True, False or 'allow-nan'.") - self._force_all_x_finite = value - if reset_value: - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - def _set_y_z_t_s(self): - assert_all_finite(self.data.loc[:, self.y_col]) - self._y = self.data.loc[:, self.y_col] - if self.z_cols is None: - self._z = None - else: - assert_all_finite(self.data.loc[:, self.z_cols]) - self._z = self.data.loc[:, self.z_cols] - - if self.t_col is None: - self._t = None - else: - assert_all_finite(self.data.loc[:, self.t_col]) - self._t = self.data.loc[:, self.t_col] - - if self.s_col is None: - self._s = None - else: - assert_all_finite(self.data.loc[:, self.s_col]) - self._s = self.data.loc[:, self.s_col] - - def set_x_d(self, treatment_var): - """ - Function that assigns the role for the treatment variables in the multiple-treatment case. - - Parameters - ---------- - treatment_var : str - Active treatment variable that will be set to d. - """ - if not isinstance(treatment_var, str): - raise TypeError('treatment_var must be of str type. ' - f'{str(treatment_var)} of type {str(type(treatment_var))} was passed.') - if treatment_var not in self.d_cols: - raise ValueError('Invalid treatment_var. 
' - f'{treatment_var} is not in d_cols.') - if self.use_other_treat_as_covariate: - # note that the following line needs to be adapted in case an intersection of x_cols and d_cols as allowed - # (see https://github.com/DoubleML/doubleml-for-py/issues/83) - xd_list = self.x_cols + self.d_cols - xd_list.remove(treatment_var) - else: - xd_list = self.x_cols - assert_all_finite(self.data.loc[:, treatment_var]) - if self.force_all_x_finite: - assert_all_finite(self.data.loc[:, xd_list], - allow_nan=self.force_all_x_finite == 'allow-nan') - self._d = self.data.loc[:, treatment_var] - self._X = self.data.loc[:, xd_list] - - def _check_binary_treats(self): - is_binary = pd.Series(dtype=bool, index=self.d_cols) - for treatment_var in self.d_cols: - this_d = self.data.loc[:, treatment_var] - binary_treat = (type_of_target(this_d) == 'binary') - zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) - is_binary[treatment_var] = (binary_treat & zero_one_treat) - return is_binary - - def _check_binary_outcome(self): - y = self.data.loc[:, self.y_col] - binary_outcome = (type_of_target(y) == 'binary') - zero_one_outcome = np.all((np.power(y, 2) - y) == 0) - is_binary = (binary_outcome & zero_one_outcome) - return is_binary - - def _check_disjoint_sets(self): - # this function can be extended in inherited subclasses - self._check_disjoint_sets_y_d_x_z_t_s() - - def _check_disjoint_sets_y_d_x_z_t_s(self): - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - - if not y_col_set.isdisjoint(x_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and covariate in ' - '``x_cols``.') - if not y_col_set.isdisjoint(d_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and treatment variable in ' - '``d_cols``.') - # note that the line xd_list = self.x_cols + self.d_cols in method set_x_d needs adaption if an intersection of - # x_cols and d_cols as allowed (see 
https://github.com/DoubleML/doubleml-for-py/issues/83) - if not d_cols_set.isdisjoint(x_cols_set): - raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and as covariate' - '(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``.') - - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not y_col_set.isdisjoint(z_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental ' - 'variable in ``z_cols``.') - if not d_cols_set.isdisjoint(z_cols_set): - raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' - 'instrumental variable in ``z_cols``.') - if not x_cols_set.isdisjoint(z_cols_set): - raise ValueError('At least one variable/column is set as covariate (``x_cols``) and instrumental ' - 'variable in ``z_cols``.') - - self._check_disjoint_sets_t_s() - - def _check_disjoint_sets_t_s(self): - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - - if self.t_col is not None: - t_col_set = {self.t_col} - if not t_col_set.isdisjoint(x_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and covariate in ' - '``x_cols``.') - if not t_col_set.isdisjoint(d_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and treatment variable in ' - '``d_cols``.') - if not t_col_set.isdisjoint(y_col_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and outcome variable ' - '``y_col``.') - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not t_col_set.isdisjoint(z_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and instrumental ' - 'variable in ``z_cols``.') - - if self.s_col is not None: - s_col_set = {self.s_col} - if not s_col_set.isdisjoint(x_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or 
selection variable ``s_col`` and covariate in ' - '``x_cols``.') - if not s_col_set.isdisjoint(d_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment ' - 'variable in ``d_cols``.') - if not s_col_set.isdisjoint(y_col_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome ' - 'variable ``y_col``.') - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not s_col_set.isdisjoint(z_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' - 'instrumental variable in ``z_cols``.') - if self.t_col is not None: - t_col_set = {self.t_col} - if not s_col_set.isdisjoint(t_col_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time ' - 'variable ``t_col``.') - - -class DoubleMLClusterData(DoubleMLData): - """Double machine learning data-backend for data with cluster variables. - - :class:`DoubleMLClusterData` objects can be initialized from - :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. - - Parameters - ---------- - data : :class:`pandas.DataFrame` - The data. - - y_col : str - The outcome variable. - - d_cols : str or list - The treatment variable(s). - - cluster_cols : str or list - The cluster variable(s). - - x_cols : None, str or list - The covariates. - If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. - Default is ``None``. - - z_cols : None, str or list - The instrumental variable(s). - Default is ``None``. - - t_col : None or str - The time variable (only relevant/used for DiD Estimators). - Default is ``None``. - - s_col : None or str - The score or selection variable (only relevant/used for RDD and SSM Estimatiors). - Default is ``None``. 
- - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- - >>> from doubleml import DoubleMLClusterData - >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 - >>> # initialization from pandas.DataFrame - >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame') - >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z') - >>> # initialization from np.ndarray - >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') - >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) - """ - def __init__(self, - data, - y_col, - d_cols, - cluster_cols, - x_cols=None, - z_cols=None, - t_col=None, - s_col=None, - use_other_treat_as_covariate=True, - force_all_x_finite=True): - DoubleMLBaseData.__init__(self, data) - - # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter - self.cluster_cols = cluster_cols - self._set_cluster_vars() - DoubleMLData.__init__(self, - data, - y_col, - d_cols, - x_cols, - z_cols, - t_col, - s_col, - use_other_treat_as_covariate, - force_all_x_finite) - self._check_disjoint_sets_cluster_cols() - - def 
__str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = '================== DoubleMLClusterData Object ==================\n' + \ - '\n------------------ Data summary ------------------\n' + data_summary + \ - '\n------------------ DataFrame info ------------------\n' + df_info - return res - - def _data_summary_str(self): - data_summary = f'Outcome variable: {self.y_col}\n' \ - f'Treatment variable(s): {self.d_cols}\n' \ - f'Cluster variable(s): {self.cluster_cols}\n' \ - f'Covariates: {self.x_cols}\n' \ - f'Instrument variable(s): {self.z_cols}\n' - if self.t_col is not None: - data_summary += f'Time variable: {self.t_col}\n' - if self.s_col is not None: - data_summary += f'Score/Selection variable: {self.s_col}\n' - - data_summary += f'No. Observations: {self.n_obs}\n' - return data_summary - - @classmethod - def from_arrays(cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, - force_all_x_finite=True): - """ - Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. - - Parameters - ---------- - x : :class:`numpy.ndarray` - Array of covariates. - - y : :class:`numpy.ndarray` - Array of the outcome variable. - - d : :class:`numpy.ndarray` - Array of treatment variables. - - cluster_vars : :class:`numpy.ndarray` - Array of cluster variables. - - z : None or :class:`numpy.ndarray` - Array of instrumental variables. - Default is ``None``. - - t : :class:`numpy.ndarray` - Array of the time variable (only relevant/used for DiD models). - Default is ``None``. - - s : :class:`numpy.ndarray` - Array of the score or selection variable (only relevant/used for RDD or SSM models). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. 
- - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- - >>> from doubleml import DoubleMLClusterData - >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 - >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') - >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) - """ - dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite) - cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) - cluster_vars = _assure_2d_array(cluster_vars) - if cluster_vars.shape[1] == 1: - cluster_cols = ['cluster_var'] - else: - cluster_cols = [f'cluster_var{i + 1}' for i in np.arange(cluster_vars.shape[1])] - - data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1) - - return (cls(data, dml_data.y_col, dml_data.d_cols, cluster_cols, - dml_data.x_cols, dml_data.z_cols, dml_data.t_col, dml_data.s_col, - dml_data.use_other_treat_as_covariate, dml_data.force_all_x_finite)) - - @property - def cluster_cols(self): - """ - The cluster variable(s). 
- """ - return self._cluster_cols - - @cluster_cols.setter - def cluster_cols(self, value): - reset_value = hasattr(self, '_cluster_cols') - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The cluster variable(s) cluster_cols must be of str or list type. ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid cluster variable(s) cluster_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid cluster variable(s) cluster_cols. ' - 'At least one cluster variable is no data column.') - self._cluster_cols = value - if reset_value: - self._check_disjoint_sets() - self._set_cluster_vars() - - @property - def n_cluster_vars(self): - """ - The number of cluster variables. - """ - return len(self.cluster_cols) - - @property - def cluster_vars(self): - """ - Array of cluster variable(s). - """ - return self._cluster_vars.values - - @DoubleMLData.x_cols.setter - def x_cols(self, value): - if value is not None: - # this call might become much easier with https://github.com/python/cpython/pull/26194 - super(self.__class__, self.__class__).x_cols.__set__(self, value) - else: - if self.s_col is None: - if (self.z_cols is not None) & (self.t_col is not None): - y_d_z_t = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z_t] - elif self.z_cols is not None: - y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z] - elif self.t_col is not None: - y_d_t = set.union({self.y_col}, set(self.d_cols), {self.t_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_t] - else: - y_d = set.union({self.y_col}, set(self.d_cols), set(self.cluster_cols)) - x_cols = 
[col for col in self.data.columns if col not in y_d] - else: - if (self.z_cols is not None) & (self.t_col is not None): - y_d_z_t_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, {self.s_col}, - set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z_t_s] - elif self.z_cols is not None: - y_d_z_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.s_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z_s] - elif self.t_col is not None: - y_d_t_s = set.union({self.y_col}, set(self.d_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_t_s] - else: - y_d_s = set.union({self.y_col}, set(self.d_cols), {self.s_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_s] - # this call might become much easier with https://github.com/python/cpython/pull/26194 - super(self.__class__, self.__class__).x_cols.__set__(self, x_cols) - - def _check_disjoint_sets(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLClusterData, self)._check_disjoint_sets() - self._check_disjoint_sets_cluster_cols() - - def _check_disjoint_sets_cluster_cols(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLClusterData, self)._check_disjoint_sets() - - # special checks for the additional cluster variables - cluster_cols_set = set(self.cluster_cols) - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - t_col_set = {self.t_col} - s_col_set = {self.s_col} - - if not y_col_set.isdisjoint(cluster_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and cluster ' - 'variable in ``cluster_cols``.') - if not d_cols_set.isdisjoint(cluster_cols_set): - raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' - 
'cluster variable in ``cluster_cols``.') - # TODO: Is the following combination allowed, or not? - if not x_cols_set.isdisjoint(cluster_cols_set): - raise ValueError('At least one variable/column is set as covariate (``x_cols``) and cluster ' - 'variable in ``cluster_cols``.') - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not z_cols_set.isdisjoint(cluster_cols_set): - raise ValueError('At least one variable/column is set as instrumental variable (``z_cols``) and ' - 'cluster variable in ``cluster_cols``.') - if self.t_col is not None: - if not t_col_set.isdisjoint(cluster_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and ' - 'cluster variable in ``cluster_cols``.') - if self.s_col is not None: - if not s_col_set.isdisjoint(cluster_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' - 'cluster variable in ``cluster_cols``.') - - def _set_cluster_vars(self): - assert_all_finite(self.data.loc[:, self.cluster_cols]) - self._cluster_vars = self.data.loc[:, self.cluster_cols] diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 8086322a8..7f24fde5f 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -187,22 +187,6 @@ def _draw_weights(method, n_rep_boot, n_obs): return weights -def _trimm(preds, trimming_rule, trimming_threshold): - if trimming_rule == 'truncate': - preds[preds < trimming_threshold] = trimming_threshold - preds[preds > 1 - trimming_threshold] = 1 - trimming_threshold - return preds - - -def _normalize_ipw(propensity, treatment): - mean_treat1 = np.mean(np.divide(treatment, propensity)) - mean_treat0 = np.mean(np.divide(1.0 - treatment, 1.0 - propensity)) - normalized_weights = np.multiply(treatment, np.multiply(propensity, mean_treat1)) \ - + np.multiply(1.0 - treatment, 1.0 - np.multiply(1.0 - propensity, mean_treat0)) - - return normalized_weights - - def _rmse(y_true, 
y_pred): subset = np.logical_not(np.isnan(y_true)) rmse = root_mean_squared_error(y_true[subset], y_pred[subset]) From 29114ce4ac7663618b5113285f660b86c46298fe Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 14:35:46 -0700 Subject: [PATCH 16/48] Ruff checks and formatting --- doubleml/__init__.py | 4 +- doubleml/double_ml_score_mixins.py | 31 +- doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 98 ++-- doubleml/plm/lplr.py | 523 +++++++++++-------- doubleml/plm/tests/_utils_logistic_manual.py | 313 ----------- doubleml/plm/tests/_utils_lplr_manual.py | 1 - doubleml/plm/tests/test_lplr_exceptions.py | 6 +- doubleml/plm/tests/tests_logistic.py | 307 ----------- doubleml/utils/_estimation.py | 59 ++- doubleml/utils/resampling.py | 38 +- 10 files changed, 447 insertions(+), 933 deletions(-) delete mode 100644 doubleml/plm/tests/_utils_logistic_manual.py delete mode 100644 doubleml/plm/tests/tests_logistic.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 7c8ead970..cb3891bac 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,11 +13,9 @@ from .irm.pq import DoubleMLPQ from .irm.qte import DoubleMLQTE from .irm.ssm import DoubleMLSSM -from doubleml.plm.lplr import DoubleMLLPLR - +from .plm.lplr import DoubleMLLPLR from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR -from .logistic.logistic import DoubleMLLogit from .utils.blp import DoubleMLBLP from .utils.policytree import DoubleMLPolicyTree diff --git a/doubleml/double_ml_score_mixins.py b/doubleml/double_ml_score_mixins.py index b0c69c25e..f1112db9c 100644 --- a/doubleml/double_ml_score_mixins.py +++ b/doubleml/double_ml_score_mixins.py @@ -150,10 +150,12 @@ def score_deriv(theta): theta_hat = root_res.root if not root_res.converged: score_val = score(theta_hat) - msg = ('Could not find a root of the score function.\n ' - f'Flag: {root_res.flag}.\n' - f'Score value found is {score_val} ' - f'for parameter theta equal to {theta_hat}.') + msg = ( + "Could 
not find a root of the score function.\n " + f"Flag: {root_res.flag}.\n" + f"Score value found is {score_val} " + f"for parameter theta equal to {theta_hat}." + ) if self._error_on_convergence_failure: raise ValueError(msg) else: @@ -185,15 +187,16 @@ def score_squared(theta): else: score_val_sign = np.sign(score(alt_coef_start)) if score_val_sign > 0: - theta_hat_array, score_val, _ = fmin_l_bfgs_b( score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - msg = ('Could not find a root of the score function.\n ' - f'Minimum score value found is {score_val} ' - f'for parameter theta equal to {theta_hat}.\n ' - 'No theta found such that the score function evaluates to a negative value.') + msg = ( + "Could not find a root of the score function.\n " + f"Minimum score value found is {score_val} " + f"for parameter theta equal to {theta_hat}.\n " + "No theta found such that the score function evaluates to a negative value." + ) if self._error_on_convergence_failure: raise ValueError(msg) else: @@ -208,10 +211,12 @@ def neg_score(theta): neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - msg = ('Could not find a root of the score function. ' - f'Maximum score value found is {-1*neg_score_val} ' - f'for parameter theta equal to {theta_hat}. ' - 'No theta found such that the score function evaluates to a positive value.') + msg = ( + "Could not find a root of the score function. " + f"Maximum score value found is {-1 * neg_score_val} " + f"for parameter theta equal to {theta_hat}. " + "No theta found such that the score function evaluates to a positive value." 
+ ) if self._error_on_convergence_failure: raise ValueError(msg) else: diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py index 007e2b918..3d6d71277 100644 --- a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -9,28 +9,32 @@ _data_frame_alias = _get_data_frame_alias() _dml_data_alias = _get_dml_data_alias() -def make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): - """ + +def make_lplr_LZZ2020( + n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs +): + r""" Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), designed for use in double/debiased machine learning applications. The data generating process is defined as follows: - - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). - - Treatment \( d_i = a_0(x_i) \). - - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. - - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). + - Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, where :math:`\Sigma_{kj} = 0.7^{|j-k|}`. + - Treatment :math:`d_i = a_0(x_i)`. + - Propensity score :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function. + - Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`. The nuisance functions are defined as: .. 
math:: - + \begin{aligned} a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ - &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ - - r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ - &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ - &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + &\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8} + - 0.2\, x_{i,9} x_{i,10} \\ + r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + \end{aligned} Parameters ---------- @@ -73,38 +77,45 @@ def make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData' """ if balanced_r0: + def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 1 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) + return ( + 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + + 0.1 * X[:, 3] * X[:, 4] + + 0.1 * X[:, 5] ** 3 + + -0.5 * np.sin(X[:, 6]) ** 2 + + 0.5 * np.cos(X[:, 7]) + + 1 / (1 + X[:, 8] ** 2) + + -1 / (1 + np.exp(X[:, 9])) + + 0.25 * np.where(X[:, 10] > 0, 1, 0) + + -0.25 * np.where(X[:, 12] > 0, 1, 0) + ) else: + def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 4 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 1.5 * np.where(X[:, 10] > 0, 
1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) + return ( + 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + + 0.1 * X[:, 3] * X[:, 4] + + 0.1 * X[:, 5] ** 3 + + -0.5 * np.sin(X[:, 6]) ** 2 + + 0.5 * np.cos(X[:, 7]) + + 4 / (1 + X[:, 8] ** 2) + + -1 / (1 + np.exp(X[:, 9])) + + 1.5 * np.where(X[:, 10] > 0, 1, 0) + + -0.25 * np.where(X[:, 12] > 0, 1, 0) + ) def a_0(X): - return 2 / (1 + np.exp(X[:, 0])) + \ - -2 / (1 + np.exp(X[:, 1])) + \ - 1 * np.sin(X[:, 2]) + \ - 1 * np.cos(X[:, 3]) + \ - 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ - -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ - 0.2 * X[:, 6] * X[:, 7] + \ - -0.2 * X[:, 8] * X[:, 9] - + return ( + 2 / (1 + np.exp(X[:, 0])) + + -2 / (1 + np.exp(X[:, 1])) + + 1 * np.sin(X[:, 2]) + + 1 * np.cos(X[:, 3]) + + 0.5 * np.where(X[:, 4] > 0, 1, 0) + + -0.5 * np.where(X[:, 5] > 0, 1, 0) + + 0.2 * X[:, 6] * X[:, 7] + + -0.2 * X[:, 8] * X[:, 9] + ) sigma = np.full((dim_x, dim_x), 0.2) np.fill_diagonal(sigma, 1) @@ -128,12 +139,11 @@ def a_0(X): if return_type in _array_alias: return x, y, d, p elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, p)), - columns=x_cols + ['y', 'd', 'p']) + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"]) if return_type in _data_frame_alias: return data else: - return DoubleMLData(data, 'y', 'd', x_cols) + return DoubleMLData(data, "y", "d", x_cols) else: - raise ValueError('Invalid return_type.') \ No newline at end of file + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 1ed00810a..edf17f082 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -1,34 +1,22 @@ import inspect import numpy as np - -from doubleml.utils._estimation import ( - _dml_cv_predict, - _trimm, - _predict_zero_one_propensity, - _cond_targets, - _get_bracket_guess, - _default_kde, - 
_normalize_ipw, - _dml_tune, - _solve_ipw_score, -) +import scipy from sklearn.base import clone from sklearn.utils import check_X_y -import scipy from sklearn.utils.multiclass import type_of_target from doubleml import DoubleMLData from doubleml.double_ml import DoubleML from doubleml.double_ml_score_mixins import NonLinearScoreMixin -from doubleml.utils import DoubleMLClusterResampling -from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score +from doubleml.utils._estimation import ( + _dml_cv_predict, + _dml_tune, +) from doubleml.utils.resampling import DoubleMLDoubleResampling - - - class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): """Double machine learning for partially logistic models (binary outcomes) @@ -89,24 +77,22 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates. 
""" - def __init__(self, - obj_dml_data, - ml_M, - ml_t, - ml_m, - ml_a=None, - n_folds=5, - n_folds_inner=5, - n_rep=1, - score='nuisance_space', - draw_sample_splitting=True, - error_on_convergence_failure=False,): + def __init__( + self, + obj_dml_data, + ml_M, + ml_t, + ml_m, + ml_a=None, + n_folds=5, + n_folds_inner=5, + n_rep=1, + score="nuisance_space", + draw_sample_splitting=True, + error_on_convergence_failure=False, + ): self.n_folds_inner = n_folds_inner - super().__init__(obj_dml_data, - n_folds, - n_rep, - score, - draw_sample_splitting) + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) # Ensure outcome only contains 0 and 1 (validate early in constructor) if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): @@ -117,208 +103,264 @@ def __init__(self, self._coef_start_val = 1.0 self._check_data(self._dml_data) - valid_scores = ['nuisance_space', 'instrument'] + valid_scores = ["nuisance_space", "instrument"] _check_score(self.score, valid_scores, allow_callable=False) - _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) - _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) + _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) + _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) if np.array_equal(np.unique(obj_dml_data.d), [0, 1]): - ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) else: - ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=False) - self._learner = {'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=False) + self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} if ml_a is not None: - ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) - 
self._learner['ml_a'] = ml_a + ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True) + self._learner["ml_a"] = ml_a self._ml_a_provided = True else: - self._learner['ml_a'] = clone(ml_m) + self._learner["ml_a"] = clone(ml_m) ml_a_is_classifier = ml_m_is_classifier self._ml_a_provided = False - self._predict_method = {'ml_t': 'predict', 'ml_M': 'predict_proba'} + self._predict_method = {"ml_t": "predict", "ml_M": "predict_proba"} if ml_m_is_classifier: if self._dml_data.binary_treats.all(): - self._predict_method['ml_m'] = 'predict_proba' + self._predict_method["ml_m"] = "predict_proba" else: - raise ValueError(f'The ml_m learner {str(ml_m)} was identified as classifier ' - 'but at least one treatment variable is not binary with values 0 and 1.') + raise ValueError( + f"The ml_m learner {str(ml_m)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) else: - self._predict_method['ml_m'] = 'predict' + self._predict_method["ml_m"] = "predict" if ml_a_is_classifier: if self._dml_data.binary_treats.all(): - self._predict_method['ml_a'] = 'predict_proba' + self._predict_method["ml_a"] = "predict_proba" else: - raise ValueError(f'The ml_a learner {str(ml_a)} was identified as classifier ' - 'but at least one treatment variable is not binary with values 0 and 1.') + raise ValueError( + f"The ml_a learner {str(ml_a)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." 
+ ) else: - self._predict_method['ml_a'] = 'predict' + self._predict_method["ml_a"] = "predict" - if score == 'instrument': - sig = inspect.signature(self.learner['ml_a'].fit) - if not 'sample_weight' in sig.parameters: - raise ValueError('Learner \"ml_a\" who supports sample_weight is required for score type \"instrument\"') + if score == "instrument": + sig = inspect.signature(self.learner["ml_a"].fit) + if "sample_weight" not in sig.parameters: + raise ValueError('Learner "ml_a" who supports sample_weight is required for score type "instrument"') self._initialize_ml_nuisance_params() self._external_predictions_implemented = True def _initialize_ml_nuisance_params(self): - self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} - for learner in self._learner} + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): - raise TypeError('The data must be of DoubleMLData type. ' - f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') + raise TypeError( + f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." 
+ ) if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): - raise TypeError('The outcome variable y must be binary with values 0 and 1.') + raise TypeError("The outcome variable y must be binary with values 0 and 1.") return - - def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, - n_jobs=None, est_params=None, method='predict', sample_weights=None): + def _double_dml_cv_predict( + self, + estimator, + estimator_name, + x, + y, + smpls=None, + smpls_inner=None, + n_jobs=None, + est_params=None, + method="predict", + sample_weights=None, + ): res = {} - res['preds'] = np.zeros(y.shape, dtype=float) - res['preds_inner'] = [] - res['models'] = [] + res["preds"] = np.zeros(y.shape, dtype=float) + res["preds_inner"] = [] + res["models"] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, - est_params=est_params, method=method, - return_models=True, smpls_is_partition=True, sample_weights=sample_weights) - _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) - - res['preds_inner'].append(res_inner['preds']) - for model in res_inner['models']: - res['models'].append(model) - if method == 'predict_proba': - res['preds'][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] + res_inner = _dml_cv_predict( + estimator, + x, + y, + smpls=smpls_double_split, + n_jobs=n_jobs, + est_params=est_params, + method=method, + return_models=True, + smpls_is_partition=True, + sample_weights=sample_weights, + ) + _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) + + res["preds_inner"].append(res_inner["preds"]) + for model in res_inner["models"]: + res["models"].append(model) + if method == "predict_proba": + res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] else: - 
res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) res["preds"] /= len(smpls) - res['targets'] = np.copy(y) + res["targets"] = np.copy(y) return res - - def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): - x, y = check_X_y(self._dml_data.x, self._dml_data.y, - force_all_finite=False) - x, d = check_X_y(x, self._dml_data.d, - force_all_finite=False) - x_d_concat = np.hstack((d.reshape(-1,1), x)) - m_external = external_predictions['ml_m'] is not None - M_external = external_predictions['ml_M'] is not None - t_external = external_predictions['ml_t'] is not None - if 'ml_a' in self._learner: - a_external = external_predictions['ml_a'] is not None + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + m_external = external_predictions["ml_m"] is not None + M_external = external_predictions["ml_M"] is not None + t_external = external_predictions["ml_t"] is not None + if "ml_a" in self._learner: + a_external = external_predictions["ml_a"] is not None else: a_external = False if M_external: - M_hat = {'preds': external_predictions['ml_M'], - 'targets': None, - 'models': None} + M_hat = {"preds": external_predictions["ml_M"], "targets": None, "models": None} else: - M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) - + M_hat = self._double_dml_cv_predict( + self._learner["ml_M"], + "ml_M", + x_d_concat, + y, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_M"), + method=self._predict_method["ml_M"], + ) # nuisance m if m_external: - m_hat = {'preds': 
external_predictions['ml_m'], - 'targets': None, - 'models': None} + m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} else: - if self.score == 'instrument': + if self.score == "instrument": weights = [] for i, (train, test) in enumerate(smpls): - weights.append( M_hat['preds_inner'][i][train] * (1-M_hat['preds_inner'][i][train])) - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models, sample_weights=weights) - - elif self.score == 'nuisance_space': + weights.append(M_hat["preds_inner"][i][train] * (1 - M_hat["preds_inner"][i][train])) + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + sample_weights=weights, + ) + + elif self.score == "nuisance_space": filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] filtered_smpls.append((train_filtered, test)) - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models) + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=filtered_smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) else: raise NotImplementedError - _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) - if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): - _check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12) + if self._check_learner(self._learner["ml_m"], "ml_m", regressor=True, 
classifier=True): + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: - binary_preds = (type_of_target(m_hat['preds']) == 'binary') - zero_one_preds = np.all((np.power(m_hat['preds'], 2) - m_hat['preds']) == 0) + binary_preds = type_of_target(m_hat["preds"]) == "binary" + zero_one_preds = np.all((np.power(m_hat["preds"], 2) - m_hat["preds"]) == 0) if binary_preds & zero_one_preds: - raise ValueError(f'For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, ' - f'predictions obtained with the ml_m learner {str(self._learner["ml_m"])} are also ' - 'observed to be binary with values 0 and 1. Make sure that for classifiers ' - 'probabilities and not labels are predicted.') + raise ValueError( + f"For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, " + f"predictions obtained with the ml_m learner {str(self._learner['ml_m'])} are also " + "observed to be binary with values 0 and 1. Make sure that for classifiers " + "probabilities and not labels are predicted." 
+ ) if a_external: - a_hat = {'preds': external_predictions['ml_a'], - 'targets': None, - 'models': None} + a_hat = {"preds": external_predictions["ml_a"], "targets": None, "models": None} else: - a_hat = (self._double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x, d, smpls=smpls, smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) + a_hat = self._double_dml_cv_predict( + self._learner["ml_a"], + "ml_a", + x, + d, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_a"), + method=self._predict_method["ml_a"], + ) W_inner = [] beta = np.zeros(d.shape, dtype=float) for i, (train, test) in enumerate(smpls): - M_iteration = M_hat['preds_inner'][i][train] + M_iteration = M_hat["preds_inner"][i][train] M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) w = scipy.special.logit(M_iteration) W_inner.append(w) - d_tilde = (d - a_hat['preds_inner'][i])[train] - beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde ** 2) - + d_tilde = (d - a_hat["preds_inner"][i])[train] + beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde**2) # nuisance t if t_external: - t_hat = {'preds': external_predictions['ml_t'], - 'targets': None, - 'models': None} + t_hat = {"preds": external_predictions["ml_t"], "targets": None, "models": None} else: - t_hat = _dml_cv_predict(self._learner['ml_t'], x, W_inner, smpls=smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], - return_models=return_models) - _check_finite_predictions(t_hat['preds'], self._learner['ml_t'], 'ml_t', smpls) - + t_hat = _dml_cv_predict( + self._learner["ml_t"], + x, + W_inner, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_t"), + method=self._predict_method["ml_t"], + return_models=return_models, + ) + _check_finite_predictions(t_hat["preds"], self._learner["ml_t"], "ml_t", smpls) r_hat = {} - r_hat['preds'] = t_hat['preds'] - beta * 
a_hat['preds'] - - psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) - - preds = {'predictions': {'ml_r': r_hat['preds'], - 'ml_m': m_hat['preds'], - 'ml_a': a_hat['preds'], - 'ml_t': t_hat['preds'], - 'ml_M': M_hat['preds']}, - 'targets': {'ml_r': None, - 'ml_m': m_hat['targets'], - 'ml_a': a_hat['targets'], - 'ml_t': t_hat['targets'], - 'ml_M': M_hat['targets']}, - 'models': {'ml_r': None, - 'ml_m': m_hat['models'], - 'ml_a': a_hat['models'], - 'ml_t': t_hat['models'], - 'ml_M': M_hat['models']}} + r_hat["preds"] = t_hat["preds"] - beta * a_hat["preds"] + + psi_elements = self._score_elements(y, d, r_hat["preds"], m_hat["preds"]) + + preds = { + "predictions": { + "ml_r": r_hat["preds"], + "ml_m": m_hat["preds"], + "ml_a": a_hat["preds"], + "ml_t": t_hat["preds"], + "ml_M": M_hat["preds"], + }, + "targets": { + "ml_r": None, + "ml_m": m_hat["targets"], + "ml_a": a_hat["targets"], + "ml_t": t_hat["targets"], + "ml_M": M_hat["targets"], + }, + "models": { + "ml_r": None, + "ml_m": m_hat["models"], + "ml_a": a_hat["models"], + "ml_t": t_hat["models"], + "ml_M": M_hat["models"], + }, + } return psi_elements, preds @@ -327,90 +369,128 @@ def _score_elements(self, y, d, r_hat, m_hat): d_tilde = d - m_hat psi_hat = scipy.special.expit(-r_hat) score_const = d_tilde * (1 - y) * np.exp(r_hat) - psi_elements = {"y": y, "d": d, "d_tilde": d_tilde, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} + psi_elements = { + "y": y, + "d": d, + "d_tilde": d_tilde, + "r_hat": r_hat, + "m_hat": m_hat, + "psi_hat": psi_hat, + "score_const": score_const, + } return psi_elements @property def _score_element_names(self): - return ['y', 'd', 'd_tilde', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] + return ["y", "d", "d_tilde", "r_hat", "m_hat", "psi_hat", "score_const"] def _sensitivity_element_est(self, preds): - pass + pass - def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, - search_mode, 
n_iter_randomized_search): + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): # TODO: test - x, y = check_X_y(self._dml_data.x, self._dml_data.y, - force_all_finite=False) - x, d = check_X_y(x, self._dml_data.d, - force_all_finite=False) + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) x_d_concat = np.hstack((d.reshape(-1, 1), x)) if scoring_methods is None: - scoring_methods = {'ml_m': None, - 'ml_M': None, - 'ml_a': None, - 'ml_t': None} + scoring_methods = {"ml_m": None, "ml_M": None, "ml_a": None, "ml_t": None} train_inds = [train_index for (train_index, _) in smpls] - M_tune_res = _dml_tune(y, x_d_concat, train_inds, - self._learner['ml_M'], param_grids['ml_M'], scoring_methods['ml_M'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + M_tune_res = _dml_tune( + y, + x_d_concat, + train_inds, + self._learner["ml_M"], + param_grids["ml_M"], + scoring_methods["ml_M"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) filtered_train_inds = [] - if self.score == 'nuisance_space': + if self.score == "nuisance_space": for train, test in smpls: train_filtered = train[y[train] == 0] filtered_train_inds.append(train_filtered) - elif self.score == 'instrument': + elif self.score == "instrument": filtered_train_inds = train_inds else: raise NotImplementedError - m_tune_res = _dml_tune(d, x, filtered_train_inds, - self._learner['ml_m'], param_grids['ml_m'], scoring_methods['ml_m'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) - - a_tune_res = _dml_tune(d, x, train_inds, - self._learner['ml_a'], param_grids['ml_a'], scoring_methods['ml_a'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + m_tune_res = _dml_tune( + d, + x, + filtered_train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + 
scoring_methods["ml_m"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + a_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_a"], + param_grids["ml_a"], + scoring_methods["ml_a"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) M_best_params = [xx.best_params_ for xx in M_tune_res] m_best_params = [xx.best_params_ for xx in m_tune_res] a_best_params = [xx.best_params_ for xx in a_tune_res] # Create targets for tuning ml_t - M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, - smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=M_best_params, method=self._predict_method['ml_M'])) + M_hat = self._double_dml_cv_predict( + self._learner["ml_M"], + "ml_M", + x_d_concat, + y, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=M_best_params, + method=self._predict_method["ml_M"], + ) W_inner = [] for i, (train, test) in enumerate(smpls): - M_iteration = M_hat['preds_inner'][i][train] + M_iteration = M_hat["preds_inner"][i][train] M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) w = scipy.special.logit(M_iteration) W_inner.append(w) - t_tune_res = _dml_tune(W_inner, x, train_inds, - self._learner['ml_t'], param_grids['ml_t'], scoring_methods['ml_t'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + t_tune_res = _dml_tune( + W_inner, + x, + train_inds, + self._learner["ml_t"], + param_grids["ml_t"], + scoring_methods["ml_t"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) t_best_params = [xx.best_params_ for xx in t_tune_res] - - # Update params and tune_res to include ml_a and ml_t - params = {'ml_M': M_best_params, - 'ml_m': m_best_params, - 'ml_a': a_best_params, - 'ml_t': t_best_params} - tune_res = {'M_tune': M_tune_res, - 'm_tune': m_tune_res, - 'a_tune': a_tune_res, - 't_tune': t_tune_res} - - res = {'params': params, - 'tune_res': tune_res} + params = 
{"ml_M": M_best_params, "ml_m": m_best_params, "ml_a": a_best_params, "ml_t": t_best_params} + tune_res = {"M_tune": M_tune_res, "m_tune": m_tune_res, "a_tune": a_tune_res, "t_tune": t_tune_res} + + res = {"params": params, "tune_res": tune_res} return res @@ -430,37 +510,40 @@ def draw_sample_splitting(self): self : object """ - obj_dml_resampling = DoubleMLDoubleResampling(n_folds=self.n_folds, - n_folds_inner=self.n_folds_inner, - n_rep=self.n_rep, - n_obs=self._dml_data.n_obs, - stratify=self._strata) + obj_dml_resampling = DoubleMLDoubleResampling( + n_folds=self.n_folds, + n_folds_inner=self.n_folds_inner, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._strata, + ) self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() return self def set_sample_splitting(self): - raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLPLR.') + raise NotImplementedError("set_sample_splitting is not implemented for DoubleMLLPLR.") def _compute_score(self, psi_elements, coef): - - if self.score == 'nuisance_space': + if self.score == "nuisance_space": score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) - elif self.score == 'instrument': - score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] + elif self.score == "instrument": + score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"])) * psi_elements[ + "d_tilde" + ] else: raise NotImplementedError return score def _compute_score_deriv(self, psi_elements, coef, inds=None): - if self.score == 'nuisance_space': - deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] - deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 - elif self.score == 'instrument': - expit = scipy.special.expit(coef * psi_elements["d"]+ 
psi_elements["r_hat"]) - deriv = - psi_elements["d"] * expit * (1-expit) * psi_elements["d_tilde"] + if self.score == "nuisance_space": + deriv_1 = -psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + elif self.score == "instrument": + expit = scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"]) + deriv = -psi_elements["d"] * expit * (1 - expit) * psi_elements["d_tilde"] else: raise NotImplementedError diff --git a/doubleml/plm/tests/_utils_logistic_manual.py b/doubleml/plm/tests/_utils_logistic_manual.py deleted file mode 100644 index af4d034eb..000000000 --- a/doubleml/plm/tests/_utils_logistic_manual.py +++ /dev/null @@ -1,313 +0,0 @@ -import numpy as np -import scipy -from sklearn.base import clone, is_classifier - -from doubleml.tests._utils_boot import boot_manual, draw_weights -from doubleml.tests._utils import fit_predict, fit_predict_proba, tune_grid_search - - -def fit_logistic_multitreat(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, - n_rep=1, l_params=None, m_params=None, g_params=None, - use_other_treat_as_covariate=True): - n_obs = len(y) - n_d = d.shape[1] - - thetas = list() - ses = list() - all_l_hat = list() - all_m_hat = list() - all_g_hat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - thetas_this_rep = np.full(n_d, np.nan) - ses_this_rep = np.full(n_d, np.nan) - all_l_hat_this_rep = list() - all_m_hat_this_rep = list() - all_g_hat_this_rep = list() - - for i_d in range(n_d): - if use_other_treat_as_covariate: - xd = np.hstack((x, np.delete(d, i_d, axis=1))) - else: - xd = x - - l_hat, m_hat, g_hat, thetas_this_rep[i_d], ses_this_rep[i_d] = fit_plr_single_split( - y, xd, d[:, i_d], - learner_l, learner_m, learner_g, - smpls, score, - l_params, m_params, g_params) - all_l_hat_this_rep.append(l_hat) - all_m_hat_this_rep.append(m_hat) - all_g_hat_this_rep.append(g_hat) - - thetas.append(thetas_this_rep) - 
ses.append(ses_this_rep) - all_l_hat.append(all_l_hat_this_rep) - all_m_hat.append(all_m_hat_this_rep) - all_g_hat.append(all_g_hat_this_rep) - - theta = np.full(n_d, np.nan) - se = np.full(n_d, np.nan) - for i_d in range(n_d): - theta_vec = np.array([xx[i_d] for xx in thetas]) - se_vec = np.array([xx[i_d] for xx in ses]) - theta[i_d] = np.median(theta_vec) - se[i_d] = np.sqrt(np.median(np.power(se_vec, 2) * n_obs + np.power(theta_vec - theta[i_d], 2)) / n_obs) - - res = {'theta': theta, 'se': se, - 'thetas': thetas, 'ses': ses, - 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} - - return res - - -def fit_logistic(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, - n_rep=1, l_params=None, m_params=None, g_params=None): - n_obs = len(y) - - thetas = np.zeros(n_rep) - ses = np.zeros(n_rep) - all_l_hat = list() - all_m_hat = list() - all_g_hat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - l_hat, m_hat, g_hat, thetas[i_rep], ses[i_rep] = fit_plr_single_split( - y, x, d, - learner_l, learner_m, learner_g, - smpls, score, - l_params, m_params, g_params) - all_l_hat.append(l_hat) - all_m_hat.append(m_hat) - all_g_hat.append(g_hat) - - theta = np.median(thetas) - se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) - - res = {'theta': theta, 'se': se, - 'thetas': thetas, 'ses': ses, - 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} - - return res - - -def fit_plr_logistic_split(y, x, d, learner_l, learner_m, learner_g, smpls, score, - l_params=None, m_params=None, g_params=None): - fit_g = (score == 'IV-type') | callable(score) - if is_classifier(learner_m): - l_hat, m_hat, g_hat = fit_nuisance_plr_classifier(y, x, d, - learner_l, learner_m, learner_g, - smpls, fit_g, - l_params, m_params, g_params) - else: - l_hat, m_hat, g_hat = fit_nuisance_plr(y, x, d, - learner_l, learner_m, learner_g, - smpls, fit_g, - l_params, m_params, g_params) - - theta, se = 
plr_dml2(y, x, d, l_hat, m_hat, g_hat, - smpls, score) - - return l_hat, m_hat, g_hat, theta, se - - -def fit_nuisance_logistic(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, - l_params=None, m_params=None, g_params=None): - ml_l = clone(learner_l) - l_hat = fit_predict(y, x, ml_l, l_params, smpls) - - ml_m = clone(learner_m) - m_hat = fit_predict(d, x, ml_m, m_params, smpls) - - if fit_g: - y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) - psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) - psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) - theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) - - ml_g = clone(learner_g) - g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) - else: - g_hat = [] - - return l_hat, m_hat, g_hat - - -def fit_nuisance_logistic_classifier(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, - l_params=None, m_params=None, g_params=None): - ml_l = clone(learner_l) - l_hat = fit_predict(y, x, ml_l, l_params, smpls) - - ml_m = clone(learner_m) - m_hat = fit_predict_proba(d, x, ml_m, m_params, smpls) - - if fit_g: - y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) - psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) - psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) - theta_initial = -np.mean(psi_b) / np.mean(psi_a) - - ml_g = clone(learner_g) - g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) - else: - g_hat = [] - - return l_hat, m_hat, g_hat - - -def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): - y_minus_l_hat = np.full_like(y, np.nan, dtype='float64') - d_minus_m_hat = np.full_like(d, np.nan, dtype='float64') - y_minus_g_hat = np.full_like(y, np.nan, dtype='float64') - for idx, (_, test_index) in enumerate(smpls): - y_minus_l_hat[test_index] = y[test_index] - l_hat[idx] - if len(g_hat) > 0: - y_minus_g_hat[test_index] = y[test_index] - g_hat[idx] - d_minus_m_hat[test_index] = d[test_index] - 
m_hat[idx] - return y_minus_l_hat, d_minus_m_hat, y_minus_g_hat - - - - -def var_plr(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs): - if score == 'partialling out': - var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d_minus_m_hat)), 2) * \ - np.mean(np.power(np.multiply(y_minus_l_hat - d_minus_m_hat*theta, d_minus_m_hat), 2)) - else: - assert score == 'IV-type' - var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d)), 2) * \ - np.mean(np.power(np.multiply(y_minus_g_hat - d*theta, d_minus_m_hat), 2)) - - return var - - -def plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score): - if score == 'IV-type': - res = np.mean(np.multiply(d_minus_m_hat, y_minus_g_hat))/np.mean(np.multiply(d_minus_m_hat, d)) - else: - assert score == 'partialling out' - res = scipy.linalg.lstsq(d_minus_m_hat.reshape(-1, 1), y_minus_l_hat)[0] - - return res - - -def boot_plr(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, - all_smpls, score, bootstrap, n_rep_boot, - n_rep=1, apply_cross_fitting=True): - all_boot_t_stat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - if apply_cross_fitting: - n_obs = len(y) - else: - test_index = smpls[0][1] - n_obs = len(test_index) - weights = draw_weights(bootstrap, n_rep_boot, n_obs) - - boot_t_stat = boot_plr_single_split( - thetas[i_rep], y, d, all_l_hat[i_rep], all_m_hat[i_rep], all_g_hat[i_rep], smpls, - score, ses[i_rep], - weights, n_rep_boot, apply_cross_fitting) - all_boot_t_stat.append(boot_t_stat) - - # differently for plr because of n_rep_boot and multiple treatmentsa - boot_t_stat = np.transpose(np.vstack(all_boot_t_stat)) - - return boot_t_stat - - -def boot_plr_multitreat(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, - all_smpls, score, bootstrap, n_rep_boot, - n_rep=1, apply_cross_fitting=True): - n_d = d.shape[1] - all_boot_t_stat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - if apply_cross_fitting: - n_obs = len(y) - else: - test_index 
= smpls[0][1] - n_obs = len(test_index) - weights = draw_weights(bootstrap, n_rep_boot, n_obs) - - boot_t_stat = np.full((n_d, n_rep_boot), np.nan) - for i_d in range(n_d): - boot_t_stat[i_d, :] = boot_plr_single_split( - thetas[i_rep][i_d], y, d[:, i_d], - all_l_hat[i_rep][i_d], all_m_hat[i_rep][i_d], all_g_hat[i_rep][i_d], - smpls, score, ses[i_rep][i_d], - weights, n_rep_boot, apply_cross_fitting) - - # transpose for shape (n_rep_boot, n_d) - boot_t_stat = np.transpose(boot_t_stat) - all_boot_t_stat.append(boot_t_stat) - - # stack repetitions along the last axis - boot_t_stat = np.stack(all_boot_t_stat, axis=2) - - return boot_t_stat - - -def boot_plr_single_split(theta, y, d, l_hat, m_hat, g_hat, - smpls, score, se, weights, n_rep, apply_cross_fitting): - y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) - - if apply_cross_fitting: - if score == 'partialling out': - J = np.mean(-np.multiply(d_minus_m_hat, d_minus_m_hat)) - else: - assert score == 'IV-type' - J = np.mean(-np.multiply(d_minus_m_hat, d)) - else: - test_index = smpls[0][1] - if score == 'partialling out': - J = np.mean(-np.multiply(d_minus_m_hat[test_index], d_minus_m_hat[test_index])) - else: - assert score == 'IV-type' - J = np.mean(-np.multiply(d_minus_m_hat[test_index], d[test_index])) - - if score == 'partialling out': - psi = np.multiply(y_minus_l_hat - d_minus_m_hat * theta, d_minus_m_hat) - else: - assert score == 'IV-type' - psi = np.multiply(y_minus_g_hat - d * theta, d_minus_m_hat) - - boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep, apply_cross_fitting) - - return boot_t_stat - - -def fit_sensitivity_elements_plr(y, d, all_coef, predictions, score, n_rep): - n_treat = d.shape[1] - n_obs = len(y) - - sigma2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) - nu2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) - psi_sigma2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) - psi_nu2 = 
np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) - - for i_rep in range(n_rep): - for i_treat in range(n_treat): - d_tilde = d[:, i_treat] - m_hat = predictions['ml_m'][:, i_rep, i_treat] - theta = all_coef[i_treat, i_rep] - if score == 'partialling out': - l_hat = predictions['ml_l'][:, i_rep, i_treat] - sigma2_score_element = np.square(y - l_hat - np.multiply(theta, d_tilde-m_hat)) - else: - assert score == 'IV-type' - g_hat = predictions['ml_g'][:, i_rep, i_treat] - sigma2_score_element = np.square(y - g_hat - np.multiply(theta, d_tilde)) - - sigma2[0, i_rep, i_treat] = np.mean(sigma2_score_element) - psi_sigma2[:, i_rep, i_treat] = sigma2_score_element - sigma2[0, i_rep, i_treat] - - nu2[0, i_rep, i_treat] = np.divide(1.0, np.mean(np.square(d_tilde-m_hat))) - psi_nu2[:, i_rep, i_treat] = nu2[0, i_rep, i_treat] - \ - np.multiply(np.square(d_tilde-m_hat), np.square(nu2[0, i_rep, i_treat])) - - element_dict = {'sigma2': sigma2, - 'nu2': nu2, - 'psi_sigma2': psi_sigma2, - 'psi_nu2': psi_nu2} - return element_dict diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py index f14a1f66c..8f45b5b08 100644 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -297,7 +297,6 @@ def tune_nuisance_ssm_mar(y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, def tune_nuisance_ssm_nonignorable( y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m ): - train_inds = [tr for (tr, _) in smpls] inner0_list, inner1_list = [], [] diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 4361e7c7b..8a55fe595 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -6,7 +6,6 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLLPLR -from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData from 
doubleml.plm.datasets import make_lplr_LZZ2020 np.random.seed(3141) @@ -19,6 +18,7 @@ dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument") + @pytest.mark.ci def test_lplr_exception_data(): msg = ( @@ -45,6 +45,7 @@ def test_lplr_exception_scores(): with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) + @pytest.mark.ci def test_ssm_exception_resampling(): msg = "The number of folds must be of int type. 1.5 of type was passed." @@ -74,6 +75,7 @@ def test_lplr_exception_get_params(): with pytest.raises(ValueError, match=msg): dml_lplr.get_params("ml_x") + @pytest.mark.ci def test_lplr_exception_smpls(): msg = ( @@ -84,6 +86,7 @@ def test_lplr_exception_smpls(): with pytest.raises(ValueError, match=msg): _ = dml_plr_no_smpls.smpls + @pytest.mark.ci def test_lplr_exception_fit(): msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." 
@@ -96,6 +99,7 @@ def test_lplr_exception_fit(): with pytest.raises(TypeError, match=msg): dml_lplr.fit(store_models=1) + @pytest.mark.ci def test_lplr_exception_bootstrap(): dml_lplr_boot = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) diff --git a/doubleml/plm/tests/tests_logistic.py b/doubleml/plm/tests/tests_logistic.py deleted file mode 100644 index a77db7a67..000000000 --- a/doubleml/plm/tests/tests_logistic.py +++ /dev/null @@ -1,307 +0,0 @@ -import pytest -import math -import scipy -import numpy as np -import pandas as pd - -from sklearn.base import clone - -from sklearn.linear_model import LinearRegression, Lasso -from sklearn.ensemble import RandomForestRegressor - -import doubleml as dml - -from doubleml.tests._utils import draw_smpls -from ._utils_logistic_manual import fit_logistic, boot_plr - - -@pytest.fixture(scope='module', - params=[RandomForestRegressor(max_depth=2, n_estimators=10), - LinearRegression(), - Lasso(alpha=0.1)]) -def learner(request): - return request.param - - -@pytest.fixture(scope='module', - params=['IV-type', 'partialling out']) -def score(request): - return request.param - - -@pytest.fixture(scope="module") -def dml_plr_fixture(generate_data1, learner, score): - boot_methods = ['normal'] - n_folds = 2 - n_rep_boot = 502 - - # collect data - data = generate_data1 - x_cols = data.columns[data.columns.str.startswith('X')].tolist() - - # Set machine learning methods for m & g - ml_l = clone(learner) - ml_m = clone(learner) - ml_g = clone(learner) - - np.random.seed(3141) - obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) - if score == 'partialling out': - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, - n_folds=n_folds, - score=score) - else: - assert score == 'IV-type' - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, ml_g, - n_folds, - score=score) - - dml_plr_obj.fit() - - np.random.seed(3141) - y = data['y'].values - x = data.loc[:, x_cols].values - d = data['d'].values - n_obs = len(y) - all_smpls = 
draw_smpls(n_obs, n_folds) - - res_manual = fit_plr(y, x, d, clone(learner), clone(learner), clone(learner), - all_smpls, score) - - np.random.seed(3141) - # test with external nuisance predictions - if score == 'partialling out': - dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, - n_folds, - score=score) - else: - assert score == 'IV-type' - dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, ml_g, - n_folds, - score=score) - - # synchronize the sample splitting - dml_plr_obj_ext.set_sample_splitting(all_smpls=all_smpls) - - if score == 'partialling out': - prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), - 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1)}} - else: - assert score == 'IV-type' - prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), - 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1), - 'ml_g': dml_plr_obj.predictions['ml_g'].reshape(-1, 1)}} - - dml_plr_obj_ext.fit(external_predictions=prediction_dict) - - res_dict = {'coef': dml_plr_obj.coef, - 'coef_manual': res_manual['theta'], - 'coef_ext': dml_plr_obj_ext.coef, - 'se': dml_plr_obj.se, - 'se_manual': res_manual['se'], - 'se_ext': dml_plr_obj_ext.se, - 'boot_methods': boot_methods} - - for bootstrap in boot_methods: - np.random.seed(3141) - boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'], - res_manual['all_l_hat'], res_manual['all_m_hat'], res_manual['all_g_hat'], - all_smpls, score, bootstrap, n_rep_boot) - - np.random.seed(3141) - dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - np.random.seed(3141) - dml_plr_obj_ext.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat - res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) - res_dict['boot_t_stat' + bootstrap + '_ext'] = dml_plr_obj_ext.boot_t_stat - - # sensitivity tests - res_dict['sensitivity_elements'] = 
dml_plr_obj.sensitivity_elements - res_dict['sensitivity_elements_manual'] = fit_sensitivity_elements_plr(y, d.reshape(-1, 1), - all_coef=dml_plr_obj.all_coef, - predictions=dml_plr_obj.predictions, - score=score, - n_rep=1) - # check if sensitivity score with rho=0 gives equal asymptotic standard deviation - dml_plr_obj.sensitivity_analysis(rho=0.0) - res_dict['sensitivity_ses'] = dml_plr_obj.sensitivity_params['se'] - return res_dict - - -@pytest.mark.ci -def test_dml_plr_coef(dml_plr_fixture): - assert math.isclose(dml_plr_fixture['coef'], - dml_plr_fixture['coef_manual'], - rel_tol=1e-9, abs_tol=1e-4) - assert math.isclose(dml_plr_fixture['coef'], - dml_plr_fixture['coef_ext'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_se(dml_plr_fixture): - assert math.isclose(dml_plr_fixture['se'], - dml_plr_fixture['se_manual'], - rel_tol=1e-9, abs_tol=1e-4) - assert math.isclose(dml_plr_fixture['se'], - dml_plr_fixture['se_ext'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_boot(dml_plr_fixture): - for bootstrap in dml_plr_fixture['boot_methods']: - assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], - dml_plr_fixture['boot_t_stat' + bootstrap + '_manual'], - rtol=1e-9, atol=1e-4) - assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], - dml_plr_fixture['boot_t_stat' + bootstrap + '_ext'], - rtol=1e-9, atol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_sensitivity(dml_plr_fixture): - sensitivity_element_names = ['sigma2', 'nu2', 'psi_sigma2', 'psi_nu2'] - for sensitivity_element in sensitivity_element_names: - assert np.allclose(dml_plr_fixture['sensitivity_elements'][sensitivity_element], - dml_plr_fixture['sensitivity_elements_manual'][sensitivity_element]) - - -@pytest.mark.ci -def test_dml_plr_sensitivity_rho0(dml_plr_fixture): - assert np.allclose(dml_plr_fixture['se'], - dml_plr_fixture['sensitivity_ses']['lower'], - rtol=1e-9, atol=1e-4) - assert np.allclose(dml_plr_fixture['se'], - 
dml_plr_fixture['sensitivity_ses']['upper'], - rtol=1e-9, atol=1e-4) - - -@pytest.fixture(scope="module") -def dml_plr_ols_manual_fixture(generate_data1, score): - learner = LinearRegression() - boot_methods = ['Bayes', 'normal', 'wild'] - n_folds = 2 - n_rep_boot = 501 - - # collect data - data = generate_data1 - x_cols = data.columns[data.columns.str.startswith('X')].tolist() - - # Set machine learning methods for m & g - ml_l = clone(learner) - ml_g = clone(learner) - ml_m = clone(learner) - - obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) - if score == 'partialling out': - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, - n_folds=n_folds, - score=score) - else: - assert score == 'IV-type' - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, ml_g, - n_folds, - score=score) - - n = data.shape[0] - this_smpl = list() - xx = int(n/2) - this_smpl.append((np.arange(xx, n), np.arange(0, xx))) - this_smpl.append((np.arange(0, xx), np.arange(xx, n))) - smpls = [this_smpl] - dml_plr_obj.set_sample_splitting(smpls) - - dml_plr_obj.fit() - - y = data['y'].values - x = data.loc[:, x_cols].values - d = data['d'].values - - # add column of ones for intercept - o = np.ones((n, 1)) - x = np.append(x, o, axis=1) - - smpls = dml_plr_obj.smpls[0] - - l_hat = [] - l_hat_vec = np.full_like(y, np.nan) - for (train_index, test_index) in smpls: - ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0] - preds = np.dot(x[test_index], ols_est) - l_hat.append(preds) - l_hat_vec[test_index] = preds - - m_hat = [] - m_hat_vec = np.full_like(d, np.nan) - for (train_index, test_index) in smpls: - ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0] - preds = np.dot(x[test_index], ols_est) - m_hat.append(preds) - m_hat_vec[test_index] = preds - - g_hat = [] - if score == 'IV-type': - theta_initial = scipy.linalg.lstsq((d - m_hat_vec).reshape(-1, 1), y - l_hat_vec)[0] - for (train_index, test_index) in smpls: - ols_est = 
scipy.linalg.lstsq(x[train_index], - y[train_index] - d[train_index] * theta_initial)[0] - g_hat.append(np.dot(x[test_index], ols_est)) - - res_manual, se_manual = plr_dml2(y, x, d, - l_hat, m_hat, g_hat, - smpls, score) - - res_dict = {'coef': dml_plr_obj.coef, - 'coef_manual': res_manual, - 'se': dml_plr_obj.se, - 'se_manual': se_manual, - 'boot_methods': boot_methods} - - for bootstrap in boot_methods: - np.random.seed(3141) - boot_t_stat = boot_plr(y, d, [res_manual], [se_manual], - [l_hat], [m_hat], [g_hat], - [smpls], score, bootstrap, n_rep_boot) - - np.random.seed(3141) - dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat - res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) - - return res_dict - - -@pytest.mark.ci -def test_dml_plr_ols_manual_coef(dml_plr_ols_manual_fixture): - assert math.isclose(dml_plr_ols_manual_fixture['coef'], - dml_plr_ols_manual_fixture['coef_manual'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_ols_manual_se(dml_plr_ols_manual_fixture): - assert math.isclose(dml_plr_ols_manual_fixture['se'], - dml_plr_ols_manual_fixture['se_manual'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture): - for bootstrap in dml_plr_ols_manual_fixture['boot_methods']: - assert np.allclose(dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap], - dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap + '_manual'], - rtol=1e-9, atol=1e-4) - - -@pytest.fixture(scope='module', - params=["nonrobust", "HC0", "HC1", "HC2", "HC3"]) -def cov_type(request): - return request.param \ No newline at end of file diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 7f24fde5f..d10ae48bc 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -43,9 +43,19 @@ def _fit(estimator, x, y, train_index, idx=None): return estimator, idx 
-def _dml_cv_predict(estimator, x, y, smpls=None, - n_jobs=None, est_params=None, method='predict', return_train_preds=False, return_models=False, - smpls_is_partition=None, sample_weights=None): +def _dml_cv_predict( + estimator, + x, + y, + smpls=None, + n_jobs=None, + est_params=None, + method="predict", + return_train_preds=False, + return_models=False, + smpls_is_partition=None, + sample_weights=None, +): n_obs = x.shape[0] # TODO: Better name for smples_is_partition @@ -53,9 +63,15 @@ def _dml_cv_predict(estimator, x, y, smpls=None, smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) - manual_cv_predict = (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target \ - | return_models | bool(sample_weights) - #TODO: Check if cross_val_predict supports weights + manual_cv_predict = ( + (not smpls_is_partition) + | return_train_preds + | fold_specific_params + | fold_specific_target + | return_models + | bool(sample_weights) + ) + # TODO: Check if cross_val_predict supports weights res = {"models": None} if not manual_cv_predict: @@ -149,21 +165,34 @@ def _dml_cv_predict(estimator, x, y, smpls=None, return res -def _dml_tune(y, x, train_inds, - learner, param_grid, scoring_method, - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search, fold_specific_target=False): +def _dml_tune( + y, + x, + train_inds, + learner, + param_grid, + scoring_method, + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + fold_specific_target=False, +): tune_res = list() for i, train_index in enumerate(train_inds): tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True) if search_mode == "grid_search": g_grid_search = GridSearchCV(learner, param_grid, scoring=scoring_method, cv=tune_resampling, n_jobs=n_jobs_cv) else: - assert search_mode == 'randomized_search' - g_grid_search = 
RandomizedSearchCV(learner, param_grid, - scoring=scoring_method, - cv=tune_resampling, - n_jobs=n_jobs_cv, - n_iter=n_iter_randomized_search) + assert search_mode == "randomized_search" + g_grid_search = RandomizedSearchCV( + learner, + param_grid, + scoring=scoring_method, + cv=tune_resampling, + n_jobs=n_jobs_cv, + n_iter=n_iter_randomized_search, + ) if fold_specific_target: tune_res.append(g_grid_search.fit(x[train_index, :], y[i])) else: diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index d10145176..38c1ac595 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -26,12 +26,7 @@ def split_samples(self): class DoubleMLDoubleResampling: - def __init__(self, - n_folds, - n_folds_inner, - n_rep, - n_obs, - stratify=None): + def __init__(self, n_folds, n_folds_inner, n_rep, n_obs, stratify=None): self.n_folds = n_folds self.n_folds_inner = n_folds_inner self.n_rep = n_rep @@ -39,12 +34,13 @@ def __init__(self, self.stratify = stratify if n_folds < 2: - raise ValueError('n_folds must be greater than 1. ' - 'You can use set_sample_splitting with a tuple to only use one fold.') + raise ValueError( + "n_folds must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold." + ) if n_folds_inner < 2: - raise ValueError('n_folds_inner must be greater than 1. ' - 'You can use set_sample_splitting with a tuple to only use one fold.') - + raise ValueError( + "n_folds_inner must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold." 
+ ) if self.stratify is None: self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep) @@ -55,17 +51,27 @@ def __init__(self, def split_samples(self): all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] - smpls = [all_smpls[(i_repeat * self.n_folds):((i_repeat + 1) * self.n_folds)] - for i_repeat in range(self.n_rep)] + smpls = [all_smpls[(i_repeat * self.n_folds) : ((i_repeat + 1) * self.n_folds)] for i_repeat in range(self.n_rep)] smpls_inner = [] for _ in range(self.n_rep): smpls_inner_rep = [] for train, test in all_smpls: if self.stratify is None: - smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in self.resampling_inner.split(X=train)]) + smpls_inner_rep.append( + [ + (train[train_inner], train[test_inner]) + for train_inner, test_inner in self.resampling_inner.split(X=train) + ] + ) else: - smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in - self.resampling_inner.split(X=np.zeros(len(train)), y=self.stratify[train])]) + smpls_inner_rep.append( + [ + (train[train_inner], train[test_inner]) + for train_inner, test_inner in self.resampling_inner.split( + X=np.zeros(len(train)), y=self.stratify[train] + ) + ] + ) smpls_inner.append(smpls_inner_rep) return smpls, smpls_inner From 5d2d1ed24deec8ca565b9ebe1260e3f9b0584b94 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 21:56:22 -0700 Subject: [PATCH 17/48] Unit tests work and bug fix in lplr --- doubleml/plm/lplr.py | 6 +- doubleml/plm/tests/_utils_lplr_manual.py | 371 +++++++-------------- doubleml/plm/tests/test_lplr.py | 31 +- doubleml/plm/tests/test_lplr_exceptions.py | 18 +- doubleml/plm/tests/test_lplr_tune.py | 163 +++------ 5 files changed, 205 insertions(+), 384 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index edf17f082..08a6bbfac 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -109,10 
+109,8 @@ def __init__( _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) - if np.array_equal(np.unique(obj_dml_data.d), [0, 1]): - ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) - else: - ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=False) + + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} if ml_a is not None: diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py index 8f45b5b08..699047019 100644 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -8,74 +8,54 @@ def fit_selection( - y, - x, - d, - z, - s, - learner_g, - learner_pi, - learner_m, - all_smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - normalize_ipw=True, - n_rep=1, - g_d0_params=None, - g_d1_params=None, - pi_params=None, - m_params=None, + y, + x, + d, + learner_M, + learner_t, + learner_m, + all_smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + n_rep=1, + M_params=None, + t_params=None, + m_params=None, ): n_obs = len(y) thetas = np.zeros(n_rep) ses = np.zeros(n_rep) - all_g_d1_hat = list() - all_g_d0_hat = list() - all_pi_hat = list() + all_M_hat = list() + all_t_hat = list() all_m_hat = list() - all_psi_a = list() - all_psi_b = list() - for i_rep in range(n_rep): smpls = all_smpls[i_rep] - g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list = fit_nuisance_selection( + M_hat_list, t_hat_list, m_hat_list = fit_nuisance_selection( y, x, d, - z, - s, - learner_g, - learner_pi, + learner_M, + learner_t, learner_m, smpls, score, trimming_rule=trimming_rule, trimming_threshold=trimming_threshold, - g_d0_params=g_d0_params, - g_d1_params=g_d1_params, - pi_params=pi_params, + M_params=M_params, + 
t_params=t_params, m_params=m_params, ) - all_g_d1_hat.append(g_hat_d1_list) - all_g_d0_hat.append(g_hat_d0_list) - all_pi_hat.append(pi_hat_list) - all_m_hat.append(m_hat_list) - - g_hat_d1, g_hat_d0, pi_hat, m_hat = compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls) - - dtreat = d == 1 - dcontrol = d == 0 - psi_a, psi_b = selection_score_elements(dtreat, dcontrol, g_hat_d1, g_hat_d0, pi_hat, m_hat, s, y, normalize_ipw) - all_psi_a.append(psi_a) - all_psi_b.append(psi_b) + all_M_hat.append(M_hat) + all_t_hat.append(t_hat) + all_m_hat.append(m_hat) - thetas[i_rep], ses[i_rep] = selection_dml2(psi_a, psi_b) + thetas[i_rep], ses[i_rep] = solve_score(M_hat_list, t_hat_list, m_hat_list) theta = np.median(thetas) se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) @@ -85,9 +65,8 @@ def fit_selection( "se": se, "thetas": thetas, "ses": ses, - "all_g_d1_hat": all_g_d1_hat, - "all_g_d0_hat": all_g_d0_hat, - "all_pi_hat": all_pi_hat, + "all_M_hat": all_M_hat, + "all_t_hat": all_t_hat, "all_m_hat": all_m_hat, "all_psi_a": all_psi_a, "all_psi_b": all_psi_b, @@ -95,176 +74,125 @@ def fit_selection( return res +def solve_score(M_hat, t_hat, m_hat): + pass def fit_nuisance_selection( - y, - x, - d, - z, - s, - learner_g, - learner_pi, - learner_m, - smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - g_d0_params=None, - g_d1_params=None, - pi_params=None, - m_params=None, + y, + x, + d, + learner_M, + learner_t, + learner_m, + smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + M_params=None, + t_params=None, + m_params=None, ): - ml_g_d1 = clone(learner_g) - ml_g_d0 = clone(learner_g) - ml_pi = clone(learner_pi) + # TODO: complete for lplr + n_obs = len(y) + ml_M = clone(learner_M) + ml_t = clone(learner_t) ml_m = clone(learner_m) - if z is None: - dx = np.column_stack((d, x)) - else: - dx = np.column_stack((d, x, z)) - - if score == "missing-at-random": - pi_hat_list = 
fit_predict_proba(s, dx, ml_pi, pi_params, smpls, trimming_threshold=trimming_threshold) - - m_hat_list = fit_predict_proba(d, x, ml_m, m_params, smpls) - - train_cond_d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) - g_hat_d1_list = fit_predict(y, x, ml_g_d1, g_d1_params, smpls, train_cond=train_cond_d1_s1) - - train_cond_d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) - g_hat_d0_list = fit_predict(y, x, ml_g_d0, g_d0_params, smpls, train_cond=train_cond_d0_s1) - else: - # initialize empty lists - g_hat_d1_list = [] - g_hat_d0_list = [] - pi_hat_list = [] - m_hat_list = [] - - # create strata for splitting - strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) - - # POTENTIAL OUTCOME Y(1) - for i_fold, _ in enumerate(smpls): - ml_g_d1 = clone(learner_g) - ml_pi = clone(learner_pi) - ml_m = clone(learner_m) - - # set the params for the nuisance learners - if g_d1_params is not None: - ml_g_d1.set_params(**g_d1_params[i_fold]) - if g_d0_params is not None: - ml_g_d0.set_params(**g_d0_params[i_fold]) - if pi_params is not None: - ml_pi.set_params(**pi_params[i_fold]) - if m_params is not None: - ml_m.set_params(**m_params[i_fold]) - - train_inds = smpls[i_fold][0] - test_inds = smpls[i_fold][1] - - # start nested crossfitting - train_inds_1, train_inds_2 = train_test_split( - train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] - ) + dx = np.column_stack((d, x)) + + # initialize empty lists + g_hat_d1_list = [] + g_hat_d0_list = [] + pi_hat_list = [] + m_hat_list = [] + + # create strata for splitting + strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) + + # POTENTIAL OUTCOME Y(1) + for i_fold, _ in enumerate(smpls): + ml_g_d1 = clone(learner_g) + ml_pi = clone(learner_pi) + ml_m = clone(learner_m) + + # set the params for the nuisance learners + if g_d1_params is not None: + ml_g_d1.set_params(**g_d1_params[i_fold]) + if g_d0_params is not None: + ml_g_d0.set_params(**g_d0_params[i_fold]) + if pi_params is not None: + 
ml_pi.set_params(**pi_params[i_fold]) + if m_params is not None: + ml_m.set_params(**m_params[i_fold]) + + train_inds = smpls[i_fold][0] + test_inds = smpls[i_fold][1] + + # start nested crossfitting + train_inds_1, train_inds_2 = train_test_split( + train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] + ) - s_train_1 = s[train_inds_1] - dx_train_1 = dx[train_inds_1, :] + s_train_1 = s[train_inds_1] + dx_train_1 = dx[train_inds_1, :] - # preliminary propensity score for selection - ml_pi_prelim = clone(ml_pi) - # fit on first part of training set - ml_pi_prelim.fit(dx_train_1, s_train_1) - pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) + # preliminary propensity score for selection + ml_pi_prelim = clone(ml_pi) + # fit on first part of training set + ml_pi_prelim.fit(dx_train_1, s_train_1) + pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) - # predictions for small pi in denominator - pi_hat = pi_hat_prelim[test_inds] + # predictions for small pi in denominator + pi_hat = pi_hat_prelim[test_inds] - # add selection indicator to covariates - xpi = np.column_stack((x, pi_hat_prelim)) + # add selection indicator to covariates + xpi = np.column_stack((x, pi_hat_prelim)) - # estimate propensity score p using the second training sample - xpi_train_2 = xpi[train_inds_2, :] - d_train_2 = d[train_inds_2] - xpi_test = xpi[test_inds, :] + # estimate propensity score p using the second training sample + xpi_train_2 = xpi[train_inds_2, :] + d_train_2 = d[train_inds_2] + xpi_test = xpi[test_inds, :] - ml_m.fit(xpi_train_2, d_train_2) + ml_m.fit(xpi_train_2, d_train_2) - m_hat = _predict_zero_one_propensity(ml_m, xpi_test) + m_hat = _predict_zero_one_propensity(ml_m, xpi_test) - # estimate conditional outcome on second training sample -- treatment - s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] - y_s1_d1_train_2 = 
y[s1_d1_train_2_indices] + # estimate conditional outcome on second training sample -- treatment + s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] + y_s1_d1_train_2 = y[s1_d1_train_2_indices] - ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) + ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) - # predict conditional outcome - g_hat_d1 = ml_g_d1.predict(xpi_test) + # predict conditional outcome + g_hat_d1 = ml_g_d1.predict(xpi_test) - # estimate conditional outcome on second training sample -- control - s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] - y_s1_d0_train_2 = y[s1_d0_train_2_indices] + # estimate conditional outcome on second training sample -- control + s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] + y_s1_d0_train_2 = y[s1_d0_train_2_indices] - ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) + ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) - # predict conditional outcome - g_hat_d0 = ml_g_d0.predict(xpi_test) + # predict conditional outcome + g_hat_d0 = ml_g_d0.predict(xpi_test) - m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) + m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) - # append predictions on test sample to final list of predictions - g_hat_d1_list.append(g_hat_d1) - g_hat_d0_list.append(g_hat_d0) - pi_hat_list.append(pi_hat) - m_hat_list.append(m_hat) + # append predictions on test sample to final list of predictions + g_hat_d1_list.append(g_hat_d1) + g_hat_d0_list.append(g_hat_d0) + pi_hat_list.append(pi_hat) + m_hat_list.append(m_hat) - return g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list -def compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls): 
- g_hat_d1 = np.full_like(y, np.nan, dtype="float64") - g_hat_d0 = np.full_like(y, np.nan, dtype="float64") - pi_hat = np.full_like(y, np.nan, dtype="float64") m_hat = np.full_like(y, np.nan, dtype="float64") - for idx, (_, test_index) in enumerate(smpls): - g_hat_d1[test_index] = g_hat_d1_list[idx] - g_hat_d0[test_index] = g_hat_d0_list[idx] - pi_hat[test_index] = pi_hat_list[idx] + M_hat[test_index] = M_hat_list[idx] + t_hat[test_index] = t_hat_list[idx] m_hat[test_index] = m_hat_list[idx] - - return g_hat_d1, g_hat_d0, pi_hat, m_hat - - -def selection_score_elements(dtreat, dcontrol, g_d1, g_d0, pi, m, s, y, normalize_ipw): - # psi_a - psi_a = -1 * np.ones_like(y) - - # psi_b - if normalize_ipw: - weight_treat = sum(dtreat) / sum((dtreat * s) / (m * pi)) - weight_control = sum(dcontrol) / sum((dcontrol * s) / ((1 - m) * pi)) - - psi_b1 = weight_treat * ((dtreat * s * (y - g_d1)) / (m * pi)) + g_d1 - psi_b0 = weight_control * ((dcontrol * s * (y - g_d0)) / ((1 - m) * pi)) + g_d0 - - else: - psi_b1 = (dtreat * s * (y - g_d1)) / (m * pi) + g_d1 - psi_b0 = (dcontrol * s * (y - g_d0)) / ((1 - m) * pi) + g_d0 - - psi_b = psi_b1 - psi_b0 - - return psi_a, psi_b - - -def selection_dml2(psi_a, psi_b): - n_obs = len(psi_a) - theta_hat = -np.mean(psi_b) / np.mean(psi_a) - se = np.sqrt(var_selection(theta_hat, psi_a, psi_b, n_obs)) - - return theta_hat, se + return M_hat, t_hat, m_hat def var_selection(theta, psi_a, psi_b, n_obs): @@ -273,62 +201,17 @@ def var_selection(theta, psi_a, psi_b, n_obs): return var -def tune_nuisance_ssm_mar(y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m): - d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) - d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) - - g0_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d0_s1) - g1_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d1_s1) - +def 
tune_nuisance(y, x, d, ml_M, ml_t, ml_m, smpls, n_folds_tune, param_grid_M, param_grid_t, param_grid_m): dx = np.column_stack((x, d)) - pi_tune_res = tune_grid_search(s, dx, ml_pi, smpls, param_grid_pi, n_folds_tune) + M_tune_res = tune_grid_search(y, dx, ml_M, smpls, param_grid_M, n_folds_tune) m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) - g0_best_params = [xx.best_params_ for xx in g0_tune_res] - g1_best_params = [xx.best_params_ for xx in g1_tune_res] - pi_best_params = [xx.best_params_ for xx in pi_tune_res] - m_best_params = [xx.best_params_ for xx in m_tune_res] - - return g0_best_params, g1_best_params, pi_best_params, m_best_params + t_tune_res = tune_grid_search(d, x, ml_t, smpls, param_grid_t, n_folds_tune) + M_best_params = [xx.best_params_ for xx in M_tune_res] + t_best_params = [xx.best_params_ for xx in t_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] -def tune_nuisance_ssm_nonignorable( - y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m -): - train_inds = [tr for (tr, _) in smpls] - - inner0_list, inner1_list = [], [] - for tr in train_inds: - i0, i1 = train_test_split(tr, test_size=0.5, stratify=d[tr] + 2 * s[tr], random_state=42) - inner0_list.append(i0) - inner1_list.append(i1) - - X_dz = np.c_[x, d.reshape(-1, 1), z.reshape(-1, 1)] - pi_tune_res = tune_grid_search(s, X_dz, ml_pi, [(i0, np.array([])) for i0 in inner0_list], param_grid_pi, n_folds_tune) - pi_best_params = [gs.best_params_ for gs in pi_tune_res] - - pi_hat_full = np.full_like(s, np.nan, dtype=float) - for i0, i1, gs in zip(inner0_list, inner1_list, pi_tune_res): - ml_pi_temp = clone(ml_pi) - ml_pi_temp.set_params(**gs.best_params_) - ml_pi_temp.fit(X_dz[i0], s[i0]) - ph = _predict_zero_one_propensity(ml_pi_temp, X_dz) - pi_hat_full[i1] = ph[i1] - - X_pi = np.c_[x, pi_hat_full] - m_tune_res = tune_grid_search(d, X_pi, ml_m, [(i1, np.array([])) for i1 in inner1_list], param_grid_m, 
n_folds_tune) - m_best_params = [gs.best_params_ for gs in m_tune_res] - - X_pi_d = np.c_[x, d.reshape(-1, 1), pi_hat_full.reshape(-1, 1)] - inner1_d0_s1 = [i1[(d[i1] == 0) & (s[i1] == 1)] for i1 in inner1_list] - inner1_d1_s1 = [i1[(d[i1] == 1) & (s[i1] == 1)] for i1 in inner1_list] - - g0_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d0_s1], param_grid_g, n_folds_tune) - g1_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d1_s1], param_grid_g, n_folds_tune) - - g0_best_params = [gs.best_params_ for gs in g0_tune_res] - g1_best_params = [gs.best_params_ for gs in g1_tune_res] - - return g0_best_params, g1_best_params, pi_best_params, m_best_params + t_tune_res = tune_grid_search(t_targets, x, ml_t, smpls, param_grid_t, n_folds_tune) diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index c561d9fe8..8e551cab9 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -3,7 +3,8 @@ import numpy as np import pytest from sklearn.base import clone -from sklearn.linear_model import LassoCV, LogisticRegressionCV +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LassoCV, LogisticRegressionCV, LogisticRegression import doubleml as dml @@ -11,38 +12,36 @@ from ._utils_ssm_manual import fit_selection -@pytest.fixture(scope="module", params=[[LassoCV(), LogisticRegressionCV(penalty="l1", solver="liblinear")]]) -def learner(request): +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +def learner_M(request): return request.param - -@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) -def score(request): +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_t(request): return request.param -@pytest.fixture(scope="module", params=[True, False]) -def normalize_ipw(request): 
+@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +def learner_m(request): return request.param - -@pytest.fixture(scope="module", params=[0.01]) -def trimming_threshold(request): +@pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) +def score(request): return request.param @pytest.fixture(scope="module") def dml_selection_fixture( - generate_data_selection_mar, generate_data_selection_nonignorable, learner, score, trimming_threshold, normalize_ipw + generate_data_selection, learner, score, learner_M, + learner_t, + learner_m, ): n_folds = 3 # collect data np.random.seed(42) - if score == "missing-at-random": - (x, y, d, z, s) = generate_data_selection_mar - else: - (x, y, d, z, s) = generate_data_selection_nonignorable + (x, y, d, z, s) = generate_data_selection + ml_g = clone(learner[0]) ml_pi = clone(learner[1]) diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 8a55fe595..cfe9f0679 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -11,7 +11,7 @@ np.random.seed(3141) n = 100 # create test data and basic learners -dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=10) +dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20) ml_M = RandomForestClassifier() ml_t = RandomForestRegressor() ml_m = RandomForestRegressor() @@ -22,13 +22,13 @@ @pytest.mark.ci def test_lplr_exception_data(): msg = ( - r"The data must be of DoubleMLData type\. .* of type " + r"The data must be of DoubleMLData.* type\.[\s\S]* of type " r" was passed\." 
) with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m) - dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=5) + dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=20) dml_data_nb.data[dml_data_nb.y_col] = dml_data_nb.data[dml_data_nb.y_col] + 1 dml_data_nb._set_y_z() with pytest.raises(TypeError, match="The outcome variable y must be binary with values 0 and 1."): @@ -41,7 +41,7 @@ def test_lplr_exception_scores(): msg = "Invalid score MAR" with pytest.raises(ValueError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="MAR") - msg = "score should be string. 0 was passed." + msg = "score should be a string. 0 was passed." with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) @@ -71,7 +71,7 @@ def test_ssm_exception_resampling(): @pytest.mark.ci def test_lplr_exception_get_params(): - msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_M or ml_g_t or ml_m or ml_a." + msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." with pytest.raises(ValueError, match=msg): dml_lplr.get_params("ml_x") @@ -148,7 +148,7 @@ def test_lplr_exception_confint(): @pytest.mark.ci def test_lplr_exception_set_ml_nuisance_params(): # invalid learner name - msg = "Invalid nuisance learner g. Valid nuisance learner ml_M or ml_t or ml_m or ml_a." + msg = "Invalid nuisance learner g. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." with pytest.raises(ValueError, match=msg): dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1}) # invalid treatment variable @@ -171,7 +171,7 @@ class _DummyNoClassifier(_DummyNoGetParams): def get_params(self): pass - def predict_proba(self): + def predict(self): pass @@ -216,7 +216,7 @@ def test_lplr_exception_learner(): log_reg._estimator_type = None msg = ( r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) " - r"no classifier\." 
+ r"neither a regressor nor a classifier. Method predict is used for prediction\." ) with pytest.warns(UserWarning, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, log_reg) @@ -284,7 +284,7 @@ def test_double_ml_exception_evaluate_learner(): dml_lplr_obj.evaluate_learners(metric="mse") msg = ( - r"The learners have to be a subset of \['ml_M', 'ml_t', 'ml_m', 'ml_a'\]\. " + r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " r"Learners \['ml_mu', 'ml_p'\] provided." ) with pytest.raises(ValueError, match=msg): diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 0e0fa7bfd..28aa387f5 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -3,17 +3,20 @@ import numpy as np import pytest from sklearn.base import clone -from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml from ...tests._utils import draw_smpls -from ._utils_lplr_manual import fit_selection, tune_nuisance_ssm_mar, tune_nuisance_ssm_nonignorable +from ._utils_lplr_manual import fit_selection, tune_nuisance +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +def learner_M(request): + return request.param @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) -def learner_g(request): +def learner_t(request): return request.param @@ -22,84 +25,63 @@ def learner_m(request): return request.param -@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) +@pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param -@pytest.fixture(scope="module", params=[True, False]) -def normalize_ipw(request): - return request.param - - @pytest.fixture(scope="module", params=[True, 
False]) def tune_on_folds(request): return request.param def get_par_grid(learner): - if learner.__class__ in [RandomForestRegressor]: + if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: par_grid = {"n_estimators": [5, 10, 20]} else: - assert learner.__class__ in [LogisticRegression] + assert learner.__class__ in [LogisticRegression, Lasso] par_grid = {"C": np.logspace(-2, 2, 10)} return par_grid @pytest.fixture(scope="module") -def dml_ssm_fixture( - generate_data_selection_mar, - generate_data_selection_nonignorable, - learner_g, +def dml_lplr_fixture( + generate_data_selection, + learner_M, + learner_t, learner_m, score, - normalize_ipw, tune_on_folds, ): - par_grid = {"ml_g": get_par_grid(learner_g), "ml_pi": get_par_grid(learner_m), "ml_m": get_par_grid(learner_m)} + par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m)} n_folds_tune = 4 n_folds = 2 # collect data np.random.seed(42) - if score == "missing-at-random": - (x, y, d, z, s) = generate_data_selection_mar - else: - (x, y, d, z, s) = generate_data_selection_nonignorable + x, y, d = generate_data_selection + n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds) - ml_g = clone(learner_g) - ml_pi = clone(learner_m) + ml_M = clone(learner_M) + ml_t = clone(learner_t) ml_m = clone(learner_m) np.random.seed(42) - if score == "missing-at-random": - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) - dml_sel_obj = dml.DoubleMLSSM( - obj_dml_data, - ml_g, - ml_pi, - ml_m, - n_folds=n_folds, - score=score, - normalize_ipw=normalize_ipw, - draw_sample_splitting=False, - ) - else: - assert score == "nonignorable" - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) - dml_sel_obj = dml.DoubleMLSSM( - obj_dml_data, - ml_g, - ml_pi, - ml_m, - n_folds=n_folds, - score=score, - normalize_ipw=normalize_ipw, - draw_sample_splitting=False, - ) + + obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + 
dml_sel_obj = dml.DoubleMLLPLR( + obj_dml_data, + ml_M, + ml_t, + ml_m, + n_folds=n_folds, + score=score, + draw_sample_splitting=False, + ) + # synchronize the sample splitting np.random.seed(42) @@ -115,95 +97,54 @@ def dml_ssm_fixture( np.random.seed(42) smpls = all_smpls[0] if tune_on_folds: - if score == "missing-at-random": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( - y, - x, - d, - z, - s, - clone(learner_g), - clone(learner_m), - clone(learner_m), - smpls, - n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], - par_grid["ml_m"], - ) - elif score == "nonignorable": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( + + M_best_params, t_best_params, m_best_params = tune_nuisance( y, x, d, - z, - s, - clone(learner_g), - clone(learner_m), + clone(learner_M), + clone(learner_t), clone(learner_m), smpls, n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], + par_grid["ml_M"], + par_grid["ml_t"], par_grid["ml_m"], ) else: xx = [(np.arange(len(y)), np.array([]))] - if score == "missing-at-random": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance( y, x, d, - z, - s, - clone(learner_g), - clone(learner_m), + clone(learner_M), + clone(learner_t), clone(learner_m), xx, n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], - par_grid["ml_m"], - ) - elif score == "nonignorable": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( - y, - x, - d, - z, - s, - clone(learner_g), - clone(learner_m), - clone(learner_m), - xx, - n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], + par_grid["ml_M"], + par_grid["ml_t"], par_grid["ml_m"], ) - g0_best_params = g0_best_params * n_folds - g1_best_params = g1_best_params * n_folds - pi_best_params = pi_best_params * n_folds - m_best_params = m_best_params * n_folds + + 
M_best_params = M_best_params * n_folds + t_best_params = t_best_params * n_folds + m_best_params = m_best_params * n_folds np.random.seed(42) res_manual = fit_selection( y, x, d, - z, - s, - clone(learner_g), - clone(learner_m), + clone(learner_M), + clone(learner_t), clone(learner_m), all_smpls, score, - normalize_ipw=normalize_ipw, - g_d0_params=g0_best_params, - g_d1_params=g1_best_params, - pi_params=pi_best_params, + M_params=M_best_params, + t_params=t_best_params, m_params=m_best_params, ) @@ -219,9 +160,9 @@ def dml_ssm_fixture( @pytest.mark.ci def test_dml_ssm_coef(dml_ssm_fixture): - assert math.isclose(dml_ssm_fixture["coef"], dml_ssm_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_lplr_fixture["coef"], dml_lplr_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) @pytest.mark.ci def test_dml_ssm_se(dml_ssm_fixture): - assert math.isclose(dml_ssm_fixture["se"], dml_ssm_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_lplr_fixture["se"], dml_lplr_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) From 2c626a011bb2d68f658f2113eaff47a37dcbcd8a Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 22:07:08 -0700 Subject: [PATCH 18/48] Cleanup --- doubleml/plm/__init__.py | 2 +- doubleml/plm/datasets/__init__.py | 2 +- doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 2 ++ doubleml/plm/tests/_utils_lplr_manual.py | 2 +- doubleml/plm/tests/test_lplr.py | 2 +- doubleml/plm/tests/test_lplr_tune.py | 3 ++- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index 37262ed93..f5e135e3a 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -2,9 +2,9 @@ The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models. 
""" +from .lplr import DoubleMLLPLR from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR -from .lplr import DoubleMLLPLR __all__ = [ "DoubleMLPLR", diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index 5f433ae79..6e8e9bb51 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -4,11 +4,11 @@ from ._make_pliv_data import _make_pliv_data from .dgp_confounded_plr_data import make_confounded_plr_data +from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 from .dgp_pliv_CHS2015 import make_pliv_CHS2015 from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 from .dgp_plr_turrell2018 import make_plr_turrell2018 -from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 __all__ = [ "make_plr_CCDDHNR2018", diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py index 3d6d71277..a9b4ece9b 100644 --- a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -131,6 +131,8 @@ def a_0(X): elif treatment == "binary_unbalanced": d_cont = a_0(x) d = np.random.binomial(1, expit(d_cont)) + else: + raise ValueError("Invalid treatment type.") p = expit(alpha * d[:] + r_0(x)) diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py index 699047019..072eb2b56 100644 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -2,7 +2,7 @@ from sklearn.base import clone from sklearn.model_selection import train_test_split -from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search +from ...tests._utils import tune_grid_search from ...utils._estimation import _predict_zero_one_propensity from ...utils._propensity_score import _trimm diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 8e551cab9..9ef7ec732 100644 --- 
a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -4,7 +4,7 @@ import pytest from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LassoCV, LogisticRegressionCV, LogisticRegression +from sklearn.linear_model import LogisticRegression import doubleml as dml diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 28aa387f5..6d13e5d18 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -3,7 +3,7 @@ import numpy as np import pytest from sklearn.base import clone -from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml @@ -11,6 +11,7 @@ from ...tests._utils import draw_smpls from ._utils_lplr_manual import fit_selection, tune_nuisance + @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) def learner_M(request): return request.param From 98194367463f0e382726c8a01dbb05a7d5ff9f19 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 5 Nov 2025 18:41:19 -0800 Subject: [PATCH 19/48] Tests updated --- doubleml/plm/lplr.py | 10 + doubleml/plm/tests/_utils_lplr_manual.py | 217 --------------------- doubleml/plm/tests/test_lplr.py | 79 ++------ doubleml/plm/tests/test_lplr_exceptions.py | 1 + doubleml/plm/tests/test_lplr_tune.py | 129 ++++-------- 5 files changed, 67 insertions(+), 369 deletions(-) delete mode 100644 doubleml/plm/tests/_utils_lplr_manual.py diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 08a6bbfac..468b93593 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -389,6 +389,8 @@ def _sensitivity_element_est(self, preds): def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, 
n_iter_randomized_search ): + if self._i_rep is None: + raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.") # TODO: test x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) @@ -470,6 +472,13 @@ def _nuisance_tuning( w = scipy.special.logit(M_iteration) W_inner.append(w) + # Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN + W_targets = [] + for i, train in enumerate(train_inds): + wt = np.full(x.shape[0], np.nan, dtype=float) + wt[train] = W_inner[i] + W_targets.append(wt) + t_tune_res = _dml_tune( W_inner, x, @@ -481,6 +490,7 @@ def _nuisance_tuning( n_jobs_cv, search_mode, n_iter_randomized_search, + fold_specific_target=True ) t_best_params = [xx.best_params_ for xx in t_tune_res] diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py deleted file mode 100644 index 072eb2b56..000000000 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ /dev/null @@ -1,217 +0,0 @@ -import numpy as np -from sklearn.base import clone -from sklearn.model_selection import train_test_split - -from ...tests._utils import tune_grid_search -from ...utils._estimation import _predict_zero_one_propensity -from ...utils._propensity_score import _trimm - - -def fit_selection( - y, - x, - d, - learner_M, - learner_t, - learner_m, - all_smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - n_rep=1, - M_params=None, - t_params=None, - m_params=None, -): - n_obs = len(y) - - thetas = np.zeros(n_rep) - ses = np.zeros(n_rep) - - all_M_hat = list() - all_t_hat = list() - all_m_hat = list() - - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - - M_hat_list, t_hat_list, m_hat_list = fit_nuisance_selection( - y, - x, - d, - learner_M, - learner_t, - learner_m, - smpls, - score, - trimming_rule=trimming_rule, - trimming_threshold=trimming_threshold, - M_params=M_params, - 
t_params=t_params, - m_params=m_params, - ) - - all_M_hat.append(M_hat) - all_t_hat.append(t_hat) - all_m_hat.append(m_hat) - - thetas[i_rep], ses[i_rep] = solve_score(M_hat_list, t_hat_list, m_hat_list) - - theta = np.median(thetas) - se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) - - res = { - "theta": theta, - "se": se, - "thetas": thetas, - "ses": ses, - "all_M_hat": all_M_hat, - "all_t_hat": all_t_hat, - "all_m_hat": all_m_hat, - "all_psi_a": all_psi_a, - "all_psi_b": all_psi_b, - } - - return res - -def solve_score(M_hat, t_hat, m_hat): - pass - -def fit_nuisance_selection( - y, - x, - d, - learner_M, - learner_t, - learner_m, - smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - M_params=None, - t_params=None, - m_params=None, -): - # TODO: complete for lplr - n_obs = len(y) - ml_M = clone(learner_M) - ml_t = clone(learner_t) - ml_m = clone(learner_m) - - dx = np.column_stack((d, x)) - - # initialize empty lists - g_hat_d1_list = [] - g_hat_d0_list = [] - pi_hat_list = [] - m_hat_list = [] - - # create strata for splitting - strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) - - # POTENTIAL OUTCOME Y(1) - for i_fold, _ in enumerate(smpls): - ml_g_d1 = clone(learner_g) - ml_pi = clone(learner_pi) - ml_m = clone(learner_m) - - # set the params for the nuisance learners - if g_d1_params is not None: - ml_g_d1.set_params(**g_d1_params[i_fold]) - if g_d0_params is not None: - ml_g_d0.set_params(**g_d0_params[i_fold]) - if pi_params is not None: - ml_pi.set_params(**pi_params[i_fold]) - if m_params is not None: - ml_m.set_params(**m_params[i_fold]) - - train_inds = smpls[i_fold][0] - test_inds = smpls[i_fold][1] - - # start nested crossfitting - train_inds_1, train_inds_2 = train_test_split( - train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] - ) - - s_train_1 = s[train_inds_1] - dx_train_1 = dx[train_inds_1, :] - - # preliminary propensity score for selection - ml_pi_prelim = 
clone(ml_pi) - # fit on first part of training set - ml_pi_prelim.fit(dx_train_1, s_train_1) - pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) - - # predictions for small pi in denominator - pi_hat = pi_hat_prelim[test_inds] - - # add selection indicator to covariates - xpi = np.column_stack((x, pi_hat_prelim)) - - # estimate propensity score p using the second training sample - xpi_train_2 = xpi[train_inds_2, :] - d_train_2 = d[train_inds_2] - xpi_test = xpi[test_inds, :] - - ml_m.fit(xpi_train_2, d_train_2) - - m_hat = _predict_zero_one_propensity(ml_m, xpi_test) - - # estimate conditional outcome on second training sample -- treatment - s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] - y_s1_d1_train_2 = y[s1_d1_train_2_indices] - - ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) - - # predict conditional outcome - g_hat_d1 = ml_g_d1.predict(xpi_test) - - # estimate conditional outcome on second training sample -- control - s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] - y_s1_d0_train_2 = y[s1_d0_train_2_indices] - - ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) - - # predict conditional outcome - g_hat_d0 = ml_g_d0.predict(xpi_test) - - m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) - - # append predictions on test sample to final list of predictions - g_hat_d1_list.append(g_hat_d1) - g_hat_d0_list.append(g_hat_d0) - pi_hat_list.append(pi_hat) - m_hat_list.append(m_hat) - - - - m_hat = np.full_like(y, np.nan, dtype="float64") - for idx, (_, test_index) in enumerate(smpls): - M_hat[test_index] = M_hat_list[idx] - t_hat[test_index] = t_hat_list[idx] - m_hat[test_index] = m_hat_list[idx] - return M_hat, t_hat, m_hat - - -def var_selection(theta, psi_a, psi_b, n_obs): - J = np.mean(psi_a) - var = 1 / n_obs * 
np.mean(np.power(np.multiply(psi_a, theta) + psi_b, 2)) / np.power(J, 2) - return var - - -def tune_nuisance(y, x, d, ml_M, ml_t, ml_m, smpls, n_folds_tune, param_grid_M, param_grid_t, param_grid_m): - dx = np.column_stack((x, d)) - - M_tune_res = tune_grid_search(y, dx, ml_M, smpls, param_grid_M, n_folds_tune) - - m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) - - t_tune_res = tune_grid_search(d, x, ml_t, smpls, param_grid_t, n_folds_tune) - - M_best_params = [xx.best_params_ for xx in M_tune_res] - t_best_params = [xx.best_params_ for xx in t_tune_res] - m_best_params = [xx.best_params_ for xx in m_tune_res] - - t_tune_res = tune_grid_search(t_targets, x, ml_t, smpls, param_grid_t, n_folds_tune) diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 9ef7ec732..154c47633 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -1,15 +1,10 @@ -import math - import numpy as np import pytest from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegression import doubleml as dml - -from ...tests._utils import draw_smpls -from ._utils_ssm_manual import fit_selection +from ..datasets import make_lplr_LZZ2020 @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) @@ -21,7 +16,7 @@ def learner_t(request): return request.param -@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_m(request): return request.param @@ -31,74 +26,36 @@ def score(request): @pytest.fixture(scope="module") -def dml_selection_fixture( - generate_data_selection, learner, score, learner_M, - learner_t, - learner_m, +def dml_lplr_fixture( + score, learner_M, learner_t, learner_m, ): - n_folds = 3 + n_folds = 5 + alpha = 0.5 # collect data np.random.seed(42) - (x, y, d, z, s) = 
generate_data_selection - - - ml_g = clone(learner[0]) - ml_pi = clone(learner[1]) - ml_m = clone(learner[1]) - - np.random.seed(42) - n_obs = len(y) - all_smpls = draw_smpls(n_obs, n_folds) + obj_dml_data = make_lplr_LZZ2020(alpha=alpha) - np.random.seed(42) - if score == "missing-at-random": - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) - dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) - else: - assert score == "nonignorable" - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) - dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) + ml_M = clone(learner_M) + ml_t = clone(learner_t) + ml_m = clone(learner_m) - np.random.seed(42) - dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) + dml_sel_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m, n_folds=n_folds, score=score) dml_sel_obj.fit() - np.random.seed(42) - res_manual = fit_selection( - y, - x, - d, - z, - s, - clone(learner[0]), - clone(learner[1]), - clone(learner[1]), - all_smpls, - score, - trimming_rule="truncate", - trimming_threshold=trimming_threshold, - normalize_ipw=normalize_ipw, - ) - res_dict = { "coef": dml_sel_obj.coef[0], - "coef_manual": res_manual["theta"], "se": dml_sel_obj.se[0], - "se_manual": res_manual["se"], + "true_coef": alpha, } - # sensitivity tests - # TODO - return res_dict @pytest.mark.ci -def test_dml_selection_coef(dml_selection_fixture): - assert math.isclose(dml_selection_fixture["coef"], dml_selection_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-2) - - -@pytest.mark.ci -def test_dml_selection_se(dml_selection_fixture): - assert math.isclose(dml_selection_fixture["se"], dml_selection_fixture["se_manual"], rel_tol=1e-9, abs_tol=5e-2) +def test_dml_lplr_coef(dml_lplr_fixture): + # true_coef should lie within three standard deviations of the estimate + coef = dml_lplr_fixture["coef"] + se = dml_lplr_fixture["se"] + true_coef = 
dml_lplr_fixture["true_coef"] +    assert abs(coef - true_coef) <= 3.0 * se diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index cfe9f0679..1be83c122 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -295,3 +295,4 @@ def eval_fct(y_pred, y_true):      with pytest.raises(ValueError):         dml_lplr_obj.evaluate_learners(metric=eval_fct) + diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 6d13e5d18..2926d755e 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -7,10 +7,7 @@ from sklearn.linear_model import Lasso, LogisticRegression  import doubleml as dml - -from ...tests._utils import draw_smpls -from ._utils_lplr_manual import fit_selection, tune_nuisance - +from ..datasets import make_lplr_LZZ2020  @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) def learner_M(request): @@ -21,20 +18,19 @@ def learner_t(request):     return request.param -@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_m(request):     return request.param +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_a(request):     return request.param  @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request):     return request.param -@pytest.fixture(scope="module", params=[True, False]) -def tune_on_folds(request): -    return request.param -  def get_par_grid(learner):     if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: @@ -47,123 +43,74 @@  @pytest.fixture(scope="module") def dml_lplr_fixture( -    generate_data_selection,     learner_M,     learner_t,     learner_m, +    learner_a,     score, -    tune_on_folds, +    tune_on_folds=True, ): -    par_grid = {"ml_M": 
get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m)} + par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), "ml_a": get_par_grid(learner_a)} n_folds_tune = 4 - n_folds = 2 - - # collect data - np.random.seed(42) - x, y, d = generate_data_selection - + n_folds = 5 + alpha = 0.5 - n_obs = len(y) - all_smpls = draw_smpls(n_obs, n_folds) ml_M = clone(learner_M) ml_t = clone(learner_t) ml_m = clone(learner_m) + ml_a = clone(learner_a) - np.random.seed(42) - - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = make_lplr_LZZ2020(alpha=alpha) dml_sel_obj = dml.DoubleMLLPLR( obj_dml_data, ml_M, ml_t, ml_m, + ml_a=ml_a, n_folds=n_folds, score=score, - draw_sample_splitting=False, ) - - # synchronize the sample splitting - np.random.seed(42) - dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) - - np.random.seed(42) # tune hyperparameters tune_res = dml_sel_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) - assert isinstance(tune_res, dml.DoubleMLSSM) + assert isinstance(tune_res, dml.DoubleMLLPLR) dml_sel_obj.fit() - np.random.seed(42) - smpls = all_smpls[0] - if tune_on_folds: - - M_best_params, t_best_params, m_best_params = tune_nuisance( - y, - x, - d, - clone(learner_M), - clone(learner_t), - clone(learner_m), - smpls, - n_folds_tune, - par_grid["ml_M"], - par_grid["ml_t"], - par_grid["ml_m"], - ) - - else: - xx = [(np.arange(len(y)), np.array([]))] - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance( - y, - x, - d, - clone(learner_M), - clone(learner_t), - clone(learner_m), - xx, - n_folds_tune, - par_grid["ml_M"], - par_grid["ml_t"], - par_grid["ml_m"], - ) - - - M_best_params = M_best_params * n_folds - t_best_params = t_best_params * n_folds - m_best_params = m_best_params * n_folds - - np.random.seed(42) - res_manual = fit_selection( - y, - x, - d, - clone(learner_M), - 
clone(learner_t), -        clone(learner_m), -        all_smpls, -        score, -        M_params=M_best_params, -        t_params=t_best_params, -        m_params=m_best_params, -    ) -     res_dict = {         "coef": dml_sel_obj.coef[0], -        "coef_manual": res_manual["theta"],         "se": dml_sel_obj.se[0], -        "se_manual": res_manual["se"], +        "true_coef": alpha,     }      return res_dict  @pytest.mark.ci -def test_dml_ssm_coef(dml_ssm_fixture): -    assert math.isclose(dml_lplr_fixture["coef"], dml_lplr_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) +def test_dml_selection_coef(dml_lplr_fixture): +    # true_coef should lie within three standard deviations of the estimate +    coef = dml_lplr_fixture["coef"] +    se = dml_lplr_fixture["se"] +    true_coef = dml_lplr_fixture["true_coef"] +    assert abs(coef - true_coef) <= 3.0 * se  @pytest.mark.ci -def test_dml_ssm_se(dml_ssm_fixture): -    assert math.isclose(dml_lplr_fixture["se"], dml_lplr_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) +def test_lplr_exception_tuning( +    learner_M, +    learner_t, +    learner_m, +    learner_a,): +    # LPLR valid scores are 'nuisance_space' and 'instrument' +    obj_dml_data = make_lplr_LZZ2020(alpha=0.5) +    ml_M = clone(learner_M) +    ml_t = clone(learner_t) +    ml_m = clone(learner_m) +    ml_a = clone(learner_a) +    dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) +    par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), +                "ml_a": get_par_grid(learner_a)} +    msg = "tune_on_folds must be True as targets have to be created for ml_t on folds." 
+ with pytest.raises(ValueError, match=msg): + dml_lplr_obj.tune(par_grid, tune_on_folds=False) \ No newline at end of file From 5a7e2796fb35282e49c8ef23e6db95b6030a6d22 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 5 Nov 2025 18:45:15 -0800 Subject: [PATCH 20/48] Pre-commit checks --- doubleml/plm/lplr.py | 3 +-- doubleml/plm/tests/test_lplr.py | 8 +++++- doubleml/plm/tests/test_lplr_exceptions.py | 1 - doubleml/plm/tests/test_lplr_tune.py | 31 +++++++++++++++------- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 468b93593..af545216b 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -109,7 +109,6 @@ def __init__( _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) - ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} @@ -490,7 +489,7 @@ def _nuisance_tuning( n_jobs_cv, search_mode, n_iter_randomized_search, - fold_specific_target=True + fold_specific_target=True, ) t_best_params = [xx.best_params_ for xx in t_tune_res] diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 154c47633..4eaf86136 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -4,6 +4,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml + from ..datasets import make_lplr_LZZ2020 @@ -11,6 +12,7 @@ def learner_M(request): return request.param + @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_t(request): return request.param @@ -20,6 +22,7 @@ def learner_t(request): def learner_m(request): return request.param + @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param @@ -27,7 +30,10 @@ def score(request): 
@pytest.fixture(scope="module") def dml_lplr_fixture( - score, learner_M, learner_t, learner_m, + score, + learner_M, + learner_t, + learner_m, ): n_folds = 5 alpha = 0.5 diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 1be83c122..cfe9f0679 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -295,4 +295,3 @@ def eval_fct(y_pred, y_true): with pytest.raises(ValueError): dml_lplr_obj.evaluate_learners(metric=eval_fct) - diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 2926d755e..70ea63817 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -1,5 +1,3 @@ -import math - import numpy as np import pytest from sklearn.base import clone @@ -7,12 +5,15 @@ from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml + from ..datasets import make_lplr_LZZ2020 + @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) def learner_M(request): return request.param + @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_t(request): return request.param @@ -22,16 +23,17 @@ def learner_t(request): def learner_m(request): return request.param + @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_a(request): return request.param + @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param - def get_par_grid(learner): if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: par_grid = {"n_estimators": [5, 10, 20]} @@ -50,12 +52,16 @@ def dml_lplr_fixture( score, tune_on_folds=True, ): - par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), "ml_a": get_par_grid(learner_a)} + par_grid = { + "ml_M": get_par_grid(learner_M), + "ml_t": 
get_par_grid(learner_t), + "ml_m": get_par_grid(learner_m), + "ml_a": get_par_grid(learner_a), + } n_folds_tune = 4 n_folds = 5 alpha = 0.5 - ml_M = clone(learner_M) ml_t = clone(learner_t) ml_m = clone(learner_m) @@ -101,16 +107,21 @@ def test_lplr_exception_tuning( learner_M, learner_t, learner_m, - learner_a,): + learner_a, +): # LPLR valid scores are 'nuisance_space' and 'instrument' obj_dml_data = make_lplr_LZZ2020(alpha=0.5) ml_M = clone(learner_M) ml_t = clone(learner_t) ml_m = clone(learner_m) - ml_a = clone(learner_a) + dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) - par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), - "ml_a": get_par_grid(learner_a)} + par_grid = { + "ml_M": get_par_grid(learner_M), + "ml_t": get_par_grid(learner_t), + "ml_m": get_par_grid(learner_m), + "ml_a": get_par_grid(learner_a), + } msg = "tune_on_folds must be True as targets have to be created for ml_t on folds." with pytest.raises(ValueError, match=msg): - dml_lplr_obj.tune(par_grid, tune_on_folds=False) \ No newline at end of file + dml_lplr_obj.tune(par_grid, tune_on_folds=False) From fc03cc65aaf2f216b8e44d2e5f4aee9adf8727ca Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 6 Nov 2025 11:39:25 -0800 Subject: [PATCH 21/48] Pre-commit checks on all files --- doubleml/plm/__init__.py | 6 +----- doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 1 + doubleml/plm/tests/test_lplr_exceptions.py | 8 ++------ 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index f5e135e3a..283bc91b4 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -6,8 +6,4 @@ from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR -__all__ = [ - "DoubleMLPLR", - "DoubleMLPLIV", - "DoubleMLLPLR" -] +__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"] diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py 
b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py index a9b4ece9b..284da7d8b 100644 --- a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -90,6 +90,7 @@ def r_0(X):                 + 0.25 * np.where(X[:, 10] > 0, 1, 0)                 + -0.25 * np.where(X[:, 12] > 0, 1, 0)             ) +     else:          def r_0(X): diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index cfe9f0679..c4c57fd98 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -21,10 +21,7 @@  @pytest.mark.ci def test_lplr_exception_data(): -    msg = ( -        r"The data must be of DoubleMLData.* type\.[\s\S]* of type " -        r"<class 'pandas\.core\.frame\.DataFrame'> was passed\." -    ) +    msg = r"The data must be of DoubleMLData.* type\.[\s\S]* of type " r"<class 'pandas\.core\.frame\.DataFrame'> was passed\."     with pytest.raises(TypeError, match=msg):         _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m) @@ -284,8 +281,7 @@ def test_double_ml_exception_evaluate_learner():         dml_lplr_obj.evaluate_learners(metric="mse")      msg = ( -        r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " -        r"Learners \['ml_mu', 'ml_p'\] provided." +        r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " r"Learners \['ml_mu', 'ml_p'\] provided." 
) with pytest.raises(ValueError, match=msg): dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"]) From 5dae65189666090406604cafb3438e04dcfd1ebf Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 6 Nov 2025 16:06:48 -0800 Subject: [PATCH 22/48] Changed function signature, test --- doubleml/plm/lplr.py | 4 ++-- doubleml/plm/tests/test_lplr.py | 8 +++++++- doubleml/plm/tests/test_lplr_tune.py | 18 ++++++------------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index af545216b..3ef6e4960 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -414,7 +414,7 @@ def _nuisance_tuning( filtered_train_inds = [] if self.score == "nuisance_space": - for train, test in smpls: + for train, _ in smpls: train_filtered = train[y[train] == 0] filtered_train_inds.append(train_filtered) elif self.score == "instrument": @@ -528,7 +528,7 @@ def draw_sample_splitting(self): return self - def set_sample_splitting(self): + def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): raise NotImplementedError("set_sample_splitting is not implemented for DoubleMLLPLR.") def _compute_score(self, psi_elements, coef): diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 4eaf86136..9c94a8a44 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -28,19 +28,25 @@ def score(request): return request.param +@pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"]) +def treatment(request): + return request.param + + @pytest.fixture(scope="module") def dml_lplr_fixture( score, learner_M, learner_t, learner_m, + treatment, ): n_folds = 5 alpha = 0.5 # collect data np.random.seed(42) - obj_dml_data = make_lplr_LZZ2020(alpha=alpha) + obj_dml_data = make_lplr_LZZ2020(alpha=alpha, treatment=treatment) ml_M = clone(learner_M) ml_t = clone(learner_t) diff --git a/doubleml/plm/tests/test_lplr_tune.py 
b/doubleml/plm/tests/test_lplr_tune.py index 70ea63817..64653f5e8 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -2,7 +2,6 @@ import pytest from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml @@ -34,13 +33,8 @@ def score(request): return request.param -def get_par_grid(learner): - if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: - par_grid = {"n_estimators": [5, 10, 20]} - else: - assert learner.__class__ in [LogisticRegression, Lasso] - par_grid = {"C": np.logspace(-2, 2, 10)} - return par_grid +def get_par_grid(): + return {"n_estimators": [5, 10, 20]} @pytest.fixture(scope="module") @@ -53,10 +47,10 @@ def dml_lplr_fixture( tune_on_folds=True, ): par_grid = { - "ml_M": get_par_grid(learner_M), - "ml_t": get_par_grid(learner_t), - "ml_m": get_par_grid(learner_m), - "ml_a": get_par_grid(learner_a), + "ml_M": get_par_grid(), + "ml_t": get_par_grid(), + "ml_m": get_par_grid(), + "ml_a": get_par_grid(), } n_folds_tune = 4 n_folds = 5 From 13fca2f6b166e2550c586e6c548d65ddf67f9b62 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 6 Nov 2025 16:09:35 -0800 Subject: [PATCH 23/48] Argument fix --- doubleml/plm/lplr.py | 2 +- doubleml/plm/tests/test_lplr_tune.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 3ef6e4960..8f609e04c 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -465,7 +465,7 @@ def _nuisance_tuning( ) W_inner = [] - for i, (train, test) in enumerate(smpls): + for i, (train, _) in enumerate(smpls): M_iteration = M_hat["preds_inner"][i][train] M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) w = scipy.special.logit(M_iteration) diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 64653f5e8..7c7c4aebb 100644 --- 
a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -111,10 +111,10 @@ def test_lplr_exception_tuning( dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) par_grid = { - "ml_M": get_par_grid(learner_M), - "ml_t": get_par_grid(learner_t), - "ml_m": get_par_grid(learner_m), - "ml_a": get_par_grid(learner_a), + "ml_M": get_par_grid(), + "ml_t": get_par_grid(), + "ml_m": get_par_grid(), + "ml_a": get_par_grid(), } msg = "tune_on_folds must be True as targets have to be created for ml_t on folds." with pytest.raises(ValueError, match=msg): From ff4c75b9d4e881363492381d5e91730b1b26ea18 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 7 Nov 2025 12:55:43 -0800 Subject: [PATCH 24/48] Updated tests for improved coverage --- doubleml/plm/lplr.py | 17 ++----- doubleml/plm/tests/test_lplr.py | 1 + doubleml/tests/test_datasets.py | 36 ++++++++++++++ doubleml/tests/test_nonlinear_score_mixin.py | 26 +++++++++++ doubleml/utils/resampling.py | 10 ++-- doubleml/utils/tests/test_resampling.py | 49 ++++++++++++++++++++ 6 files changed, 120 insertions(+), 19 deletions(-) create mode 100644 doubleml/utils/tests/test_resampling.py diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 8f609e04c..c9b39c94e 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -215,10 +215,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa m_external = external_predictions["ml_m"] is not None M_external = external_predictions["ml_M"] is not None t_external = external_predictions["ml_t"] is not None - if "ml_a" in self._learner: - a_external = external_predictions["ml_a"] is not None - else: - a_external = False + a_external = external_predictions["ml_a"] is not None if M_external: M_hat = {"preds": external_predictions["ml_M"], "targets": None, "models": None} @@ -270,8 +267,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa method=self._predict_method["ml_m"], 
return_models=return_models, ) - else: - raise NotImplementedError + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) if self._check_learner(self._learner["ml_m"], "ml_m", regressor=True, classifier=True): @@ -383,7 +379,7 @@ def _score_element_names(self): return ["y", "d", "d_tilde", "r_hat", "m_hat", "psi_hat", "score_const"] def _sensitivity_element_est(self, preds): - pass + raise NotImplementedError() def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search @@ -419,8 +415,7 @@ def _nuisance_tuning( filtered_train_inds.append(train_filtered) elif self.score == "instrument": filtered_train_inds = train_inds - else: - raise NotImplementedError + m_tune_res = _dml_tune( d, x, @@ -539,8 +534,6 @@ def _compute_score(self, psi_elements, coef): score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"])) * psi_elements[ "d_tilde" ] - else: - raise NotImplementedError return score @@ -551,7 +544,5 @@ def _compute_score_deriv(self, psi_elements, coef, inds=None): elif self.score == "instrument": expit = scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"]) deriv = -psi_elements["d"] * expit * (1 - expit) * psi_elements["d_tilde"] - else: - raise NotImplementedError return deriv diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 9c94a8a44..2e58bfeaf 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -29,6 +29,7 @@ def score(request): @pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"]) +# TODO: Error for continuous treatment? 
def treatment(request): return request.param diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index f69b681e2..05c75d00a 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -15,6 +15,7 @@ from doubleml.plm.datasets import ( _make_pliv_data, make_confounded_plr_data, + make_lplr_LZZ2020, make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018, @@ -294,3 +295,38 @@ def test_make_data_discrete_treatments(n_levels): msg = "n_levels must be an integer." with pytest.raises(ValueError, match=msg): _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1.1) + + +@pytest.mark.ci +def test_make_lplr_LZZ2020_return_types(): + np.random.seed(3141) + res = make_lplr_LZZ2020(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_lplr_LZZ2020(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_lplr_LZZ2020(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_lplr_LZZ2020(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_lplr_LZZ2020_variants(): + np.random.seed(3141) + res = make_lplr_LZZ2020(n_obs=100, treatment="binary") + assert np.array_equal(np.unique(res.d), np.array([0, 1])) + res = make_lplr_LZZ2020(n_obs=100, treatment="binary_unbalanced") + assert np.array_equal(np.unique(res.d), np.array([0, 1])) + res = make_lplr_LZZ2020(n_obs=100, treatment="continuous") + assert len(np.unique(res.d)) == 100 + + msg = "Invalid treatment type." 
+ with pytest.raises(ValueError, match=msg): + _ = make_lplr_LZZ2020(n_obs=100, treatment="colors") + + res = make_lplr_LZZ2020(n_obs=100, balanced_r0=False) + _, y_unique = np.unique(res.y, return_counts=True) + assert np.abs(y_unique[0] - y_unique[1]) > 10 diff --git a/doubleml/tests/test_nonlinear_score_mixin.py b/doubleml/tests/test_nonlinear_score_mixin.py index 0fce08c3b..d68785aa4 100644 --- a/doubleml/tests/test_nonlinear_score_mixin.py +++ b/doubleml/tests/test_nonlinear_score_mixin.py @@ -253,3 +253,29 @@ def test_nonlinear_warnings(generate_data1, coef_bounds): with pytest.warns(UserWarning, match=msg): dml_plr_obj._coef_bounds = coef_bounds dml_plr_obj.fit() + + +@pytest.mark.ci +def test_nonlinear_errors(generate_data1, coef_bounds): + # collect data + data = generate_data1 + x_cols = data.columns[data.columns.str.startswith("X")].tolist() + + np.random.seed(3141) + obj_dml_data = dml.DoubleMLData(data, "y", ["d"], x_cols) + + dml_plr_obj = DoubleMLPLRWithNonLinearScoreMixin(obj_dml_data, LinearRegression(), LinearRegression(), score="no_root_pos") + dml_plr_obj._error_on_convergence_failure = True + + msg = "Could not find a root of the score function." + with pytest.raises(ValueError, match=msg): + dml_plr_obj._coef_bounds = coef_bounds + dml_plr_obj.fit() + + dml_plr_obj = DoubleMLPLRWithNonLinearScoreMixin(obj_dml_data, LinearRegression(), LinearRegression(), score="no_root_neg") + dml_plr_obj._error_on_convergence_failure = True + + msg = "Could not find a root of the score function." 
+ with pytest.raises(ValueError, match=msg): + dml_plr_obj._coef_bounds = coef_bounds + dml_plr_obj.fit() diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index 38c1ac595..e0668ef25 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -31,7 +31,7 @@ def __init__(self, n_folds, n_folds_inner, n_rep, n_obs, stratify=None): self.n_folds_inner = n_folds_inner self.n_rep = n_rep self.n_obs = n_obs - self.stratify = stratify + self.stratify = np.array(stratify) if n_folds < 2: raise ValueError( @@ -53,9 +53,9 @@ def split_samples(self): all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] smpls = [all_smpls[(i_repeat * self.n_folds) : ((i_repeat + 1) * self.n_folds)] for i_repeat in range(self.n_rep)] smpls_inner = [] - for _ in range(self.n_rep): + for i_rep in range(self.n_rep): smpls_inner_rep = [] - for train, test in all_smpls: + for train, test in smpls[i_rep]: if self.stratify is None: smpls_inner_rep.append( [ @@ -67,9 +67,7 @@ def split_samples(self): smpls_inner_rep.append( [ (train[train_inner], train[test_inner]) - for train_inner, test_inner in self.resampling_inner.split( - X=np.zeros(len(train)), y=self.stratify[train] - ) + for train_inner, test_inner in self.resampling_inner.split(X=train, y=self.stratify[train]) ] ) smpls_inner.append(smpls_inner_rep) diff --git a/doubleml/utils/tests/test_resampling.py b/doubleml/utils/tests/test_resampling.py new file mode 100644 index 000000000..baab61b4c --- /dev/null +++ b/doubleml/utils/tests/test_resampling.py @@ -0,0 +1,49 @@ +import pytest + +from doubleml.utils.resampling import DoubleMLDoubleResampling + + +@pytest.mark.ci +def test_DoubleMLDoubleResampling_stratify(): + n_folds = 5 + n_folds_inner = 3 + n_rep = 2 + n_obs = 100 + stratify = [0] * 50 + [1] * 50 + + obj_dml_double_resampling = DoubleMLDoubleResampling( + n_folds=n_folds, + n_folds_inner=n_folds_inner, + n_rep=n_rep, + n_obs=n_obs, + 
stratify=stratify, + ) + smpls, smpls_inner = obj_dml_double_resampling.split_samples() + + assert len(smpls) == n_rep + assert len(smpls_inner) == n_rep + + for i_rep in range(n_rep): + assert len(smpls[i_rep]) == n_folds + assert len(smpls_inner[i_rep]) == n_folds + + for i_fold in range(n_folds): + train_ind, test_ind = smpls[i_rep][i_fold] + smpls_inner_rep_fold = smpls_inner[i_rep][i_fold] + assert len(smpls_inner_rep_fold) == n_folds_inner + + for i_fold_inner in range(n_folds_inner): + train_ind_inner, test_ind_inner = smpls_inner_rep_fold[i_fold_inner] + assert set(train_ind_inner).issubset(set(train_ind)) + assert set(test_ind_inner).issubset(set(train_ind)) + + +@pytest.mark.ci +def test_DoubleMLDoubleResampling_exceptions(): + msg = "n_folds must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLDoubleResampling(1, 5, 1, 100) + + msg = "n_folds_inner must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold." 
+ with pytest.raises(ValueError, match=msg): + _ = DoubleMLDoubleResampling(5, 1, 1, 100) From 8a181cd656df723eeabb7316ab4388a52ef62ec7 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 7 Nov 2025 12:58:49 -0800 Subject: [PATCH 25/48] Unused var removed --- doubleml/utils/tests/test_resampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/utils/tests/test_resampling.py b/doubleml/utils/tests/test_resampling.py index baab61b4c..3ecfbada0 100644 --- a/doubleml/utils/tests/test_resampling.py +++ b/doubleml/utils/tests/test_resampling.py @@ -28,7 +28,7 @@ def test_DoubleMLDoubleResampling_stratify(): assert len(smpls_inner[i_rep]) == n_folds for i_fold in range(n_folds): - train_ind, test_ind = smpls[i_rep][i_fold] + train_ind, _ = smpls[i_rep][i_fold] smpls_inner_rep_fold = smpls_inner[i_rep][i_fold] assert len(smpls_inner_rep_fold) == n_folds_inner From f2ecea799aac830c5dac42a37aa2ded19317a21e Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 7 Nov 2025 13:09:30 -0800 Subject: [PATCH 26/48] Fixed resampling --- doubleml/utils/resampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index e0668ef25..4f49a3d2c 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -31,7 +31,7 @@ def __init__(self, n_folds, n_folds_inner, n_rep, n_obs, stratify=None): self.n_folds_inner = n_folds_inner self.n_rep = n_rep self.n_obs = n_obs - self.stratify = np.array(stratify) + self.stratify = np.array(stratify) if stratify is not None else None if n_folds < 2: raise ValueError( From a9a295993645df01e82d85eb1b1a555d76170dcc Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 7 Nov 2025 17:36:59 -0800 Subject: [PATCH 27/48] External predictions --- doubleml/double_ml.py | 17 +++- doubleml/plm/lplr.py | 24 ++++- doubleml/plm/tests/test_lplr.py | 3 +- .../tests/test_lplr_external_predictions.py | 90 +++++++++++++++++++ 
doubleml/utils/_estimation.py | 7 +- 5 files changed, 132 insertions(+), 9 deletions(-) create mode 100644 doubleml/plm/tests/test_lplr_external_predictions.py diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 05481bf16..a95e2c7dc 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -263,6 +263,13 @@ def learner(self): """ return self._learner + @property + def predictions_names(self): + """ + The names of predictions for the nuisance functions. + """ + return list(self._learner.keys()) + @property def learner_names(self): """ @@ -1059,7 +1066,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m _check_external_predictions( external_predictions=external_predictions, valid_treatments=self._dml_data.d_cols, - valid_learners=self.params_names, + valid_learners=self.predictions_names, n_obs=self.n_obs, n_rep=self.n_rep, ) @@ -1146,8 +1153,10 @@ def _initialize_arrays(self): self._all_se = np.full((n_thetas, n_rep), np.nan) def _initialize_predictions_and_targets(self): - self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} - self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} + self._predictions = {learner: np.full(self._score_dim, np.nan, dtype=object) for learner in self.predictions_names} + self._nuisance_targets = { + learner: np.full(self._score_dim, np.nan, dtype=object) for learner in self.predictions_names + } def _initialize_nuisance_loss(self): self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names} @@ -1158,7 +1167,7 @@ def _initialize_models(self): } def _store_predictions_and_targets(self, preds, targets): - for learner in self.params_names: + for learner in self.predictions_names: self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner] self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner] diff 
--git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index c9b39c94e..1bd905367 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -111,6 +111,7 @@ def __init__( ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} + self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M", "ml_M_inner", "ml_a_inner"] if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True) @@ -181,6 +182,7 @@ def _double_dml_cv_predict( res = {} res["preds"] = np.zeros(y.shape, dtype=float) res["preds_inner"] = [] + res["targets_inner"] = [] res["models"] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): res_inner = _dml_cv_predict( @@ -198,6 +200,7 @@ def _double_dml_cv_predict( _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) res["preds_inner"].append(res_inner["preds"]) + res["targets_inner"].append(res_inner["targets"]) for model in res_inner["models"]: res["models"].append(model) if method == "predict_proba": @@ -218,7 +221,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa a_external = external_predictions["ml_a"] is not None if M_external: - M_hat = {"preds": external_predictions["ml_M"], "targets": None, "models": None} + if "ml_M_inner" not in external_predictions.keys(): + raise ValueError("When providing external predictions for ml_M, also inner predictions have to be provided.") + M_hat_inner = np.squeeze(np.array(external_predictions["ml_M_inner"].tolist())).T + M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None} else: M_hat = self._double_dml_cv_predict( self._learner["ml_M"], @@ -285,7 +291,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa ) if a_external: - a_hat = {"preds": external_predictions["ml_a"], "targets": None, 
"models": None} + if "ml_a_inner" not in external_predictions.keys(): + raise ValueError("When providing external predictions for ml_M, also inner predictions have to be provided.") + a_hat_inner = np.squeeze(np.array(external_predictions["ml_a_inner"].tolist())).T + a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None} else: a_hat = self._double_dml_cv_predict( self._learner["ml_a"], @@ -338,6 +347,8 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa "ml_a": a_hat["preds"], "ml_t": t_hat["preds"], "ml_M": M_hat["preds"], + "ml_M_inner": np.moveaxis(M_hat["preds_inner"], 0, -1).tolist(), + "ml_a_inner": np.moveaxis(a_hat["preds_inner"], 0, -1).tolist(), }, "targets": { "ml_r": None, @@ -345,6 +356,8 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa "ml_a": a_hat["targets"], "ml_t": t_hat["targets"], "ml_M": M_hat["targets"], + "ml_M_inner": np.moveaxis(M_hat["targets_inner"], 0, -1).tolist() if not M_external else None, + "ml_a_inner": np.moveaxis(a_hat["targets_inner"], 0, -1).tolist() if not a_external else None, }, "models": { "ml_r": None, @@ -357,6 +370,13 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds + @property + def predictions_names(self): + """ + The names of predictions for the nuisance functions. 
+ """ + return self._predictions_names + def _score_elements(self, y, d, r_hat, m_hat): # compute residual d_tilde = d - m_hat diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 2e58bfeaf..efba990d9 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -4,8 +4,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml - -from ..datasets import make_lplr_LZZ2020 +from doubleml.plm.datasets import make_lplr_LZZ2020 @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) diff --git a/doubleml/plm/tests/test_lplr_external_predictions.py b/doubleml/plm/tests/test_lplr_external_predictions.py new file mode 100644 index 000000000..670860386 --- /dev/null +++ b/doubleml/plm/tests/test_lplr_external_predictions.py @@ -0,0 +1,90 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml import DoubleMLData +from doubleml.plm.datasets import make_lplr_LZZ2020 +from doubleml.plm.lplr import DoubleMLLPLR +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) +def lplr_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_t_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_M_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_lplr_fixture(lplr_score, n_rep, set_ml_m_ext, set_ml_t_ext, set_ml_M_ext): + ext_predictions = {"d": {}} + + x, y, d, _ = make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type="np.array", 
treatment="continuous") + + np.random.seed(3141) + dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d) + + kwargs = {"obj_dml_data": dml_data, "score": lplr_score, "n_rep": n_rep} + if lplr_score == "instrument": + # ensure ml_a supports sample_weight + kwargs["ml_a"] = LinearRegression() + + dml_lplr = DoubleMLLPLR(ml_M=LogisticRegression(max_iter=1000), ml_t=LinearRegression(), ml_m=LinearRegression(), **kwargs) + np.random.seed(3141) + dml_lplr.fit(store_predictions=True) + + # prepare external predictions and dummy learners + if set_ml_M_ext: + ext_predictions["d"]["ml_M"] = dml_lplr.predictions["ml_M"][:, :, 0] + ext_predictions["d"]["ml_M_inner"] = dml_lplr.predictions["ml_M_inner"][:, :, 0] + ml_M = DMLDummyClassifier() + else: + ml_M = LogisticRegression(max_iter=1000) + + if set_ml_t_ext: + ext_predictions["d"]["ml_t"] = dml_lplr.predictions["ml_t"][:, :, 0] + ml_t = DMLDummyRegressor() + else: + ml_t = LinearRegression() + + if set_ml_m_ext: + ext_predictions["d"]["ml_m"] = dml_lplr.predictions["ml_m"][:, :, 0] + ml_m = DMLDummyRegressor() + ext_predictions["d"]["ml_a"] = dml_lplr.predictions["ml_a"][:, :, 0] + ext_predictions["d"]["ml_a_inner"] = dml_lplr.predictions["ml_a_inner"][:, :, 0] + else: + ml_m = LinearRegression() + + # build second model with external predictions + dml_lplr_ext = DoubleMLLPLR(ml_M=ml_M, ml_t=ml_t, ml_m=ml_m, **kwargs) + + np.random.seed(3141) + dml_lplr_ext.fit(external_predictions=ext_predictions) + + res_dict = {"coef_normal": dml_lplr.coef[0], "coef_ext": dml_lplr_ext.coef[0]} + return res_dict + + +@pytest.mark.ci +def test_doubleml_lplr_coef(doubleml_lplr_fixture): + assert math.isclose(doubleml_lplr_fixture["coef_normal"], doubleml_lplr_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-4) diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index d10ae48bc..1054feb35 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -363,9 +363,14 @@ def 
_set_external_predictions(external_predictions, learners, treatment, i_rep): ext_prediction_dict[learner] = None elif learner in external_predictions[treatment].keys(): if isinstance(external_predictions[treatment][learner], np.ndarray): - ext_prediction_dict[learner] = external_predictions[treatment][learner][:, i_rep] + ext_prediction_dict[learner] = external_predictions[treatment][learner][:, i_rep].astype(float) else: ext_prediction_dict[learner] = None + if f"{learner}_inner" in external_predictions[treatment].keys(): + if isinstance(external_predictions[treatment][f"{learner}_inner"], np.ndarray): + ext_prediction_dict[f"{learner}_inner"] = external_predictions[treatment][f"{learner}_inner"][:, i_rep] + else: + ext_prediction_dict[learner] = None else: ext_prediction_dict[learner] = None return ext_prediction_dict From cd6055b8f78db4bd6429a06ec2fd46c855f85ed8 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 7 Nov 2025 18:05:07 -0800 Subject: [PATCH 28/48] Bugfix and addtl text --- doubleml/double_ml.py | 2 +- doubleml/plm/lplr.py | 11 +++++++++++ doubleml/plm/tests/test_lplr.py | 11 ++++++++++- doubleml/plm/tests/test_lplr_exceptions.py | 13 +++++++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index a95e2c7dc..1605391e1 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -268,7 +268,7 @@ def predictions_names(self): """ The names of predictions for the nuisance functions. """ - return list(self._learner.keys()) + return list(self.params_names) @property def learner_names(self): diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 1bd905367..a0405032e 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -1,4 +1,5 @@ import inspect +import warnings import numpy as np import scipy @@ -133,6 +134,11 @@ def __init__( "but at least one treatment variable is not binary with values 0 and 1." 
) else: + if self._dml_data.binary_treats.any(): + warnings.warn( + f"The ml_m learner {str(ml_m)} was identified as regressor " + "but at least one treatment variable is binary with values 0 and 1." + ) self._predict_method["ml_m"] = "predict" if ml_a_is_classifier: @@ -144,6 +150,11 @@ def __init__( "but at least one treatment variable is not binary with values 0 and 1." ) else: + if self._dml_data.binary_treats.any(): + warnings.warn( + f"The ml_a learner {str(ml_a)} was identified as regressor but at least one treatment variable is " + f"binary with values 0 and 1." + ) self._predict_method["ml_a"] = "predict" if score == "instrument": diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index efba990d9..abd7adf55 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -22,6 +22,11 @@ def learner_m(request): return request.param +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +def learner_m_classifier(request): + return request.param + + @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param @@ -39,6 +44,7 @@ def dml_lplr_fixture( learner_M, learner_t, learner_m, + learner_m_classifier, treatment, ): n_folds = 5 @@ -50,7 +56,10 @@ def dml_lplr_fixture( ml_M = clone(learner_M) ml_t = clone(learner_t) - ml_m = clone(learner_m) + if treatment == "continuous": + ml_m = clone(learner_m) + else: + ml_m = clone(learner_m_classifier) dml_sel_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m, n_folds=n_folds, score=score) dml_sel_obj.fit() diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index c4c57fd98..03cb7158a 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -12,6 +12,7 @@ n = 100 # create test data and basic learners dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20) +dml_data_binary = 
make_lplr_LZZ2020(alpha=0.5, n_obs=n, treatment="binary", dim_x=20) ml_M = RandomForestClassifier() ml_t = RandomForestRegressor() ml_m = RandomForestRegressor() @@ -231,6 +232,18 @@ def test_lplr_exception_and_warning_learner(): msg = "Invalid learner provided for ml_M: " + r"Lasso\(\) has no method .predict_proba\(\)." with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, Lasso(), ml_t, ml_m) + msg = ( + r"The ml_m learner RandomForestRegressor\(\) was identified as regressor but at least one treatment " + r"variable is binary with values 0 and 1." + ) + with pytest.warns(match=msg): + _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_m) + msg = ( + r"The ml_a learner RandomForestRegressor\(\) was identified as regressor but at least one treatment " + r"variable is binary with values 0 and 1." + ) + with pytest.warns(match=msg): + _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_M, ml_a=ml_m) class LassoWithNanPred(Lasso): From 4a8be08efae7cb36ee46ad58a58edc43bb6e9bd6 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Sun, 9 Nov 2025 23:45:48 -0800 Subject: [PATCH 29/48] Change to ext predictions --- doubleml/double_ml.py | 7 ++- doubleml/plm/lplr.py | 57 +++++++++++++++---- .../tests/test_lplr_external_predictions.py | 7 ++- doubleml/utils/_estimation.py | 5 -- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 1605391e1..899bad4c0 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1088,7 +1088,10 @@ def _initalize_fit(self, store_predictions, store_models): def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models): ext_prediction_dict = _set_external_predictions( - external_predictions, learners=self.params_names, treatment=self._dml_data.d_cols[self._i_treat], i_rep=self._i_rep + external_predictions, + learners=self.predictions_names, + treatment=self._dml_data.d_cols[self._i_treat], + i_rep=self._i_rep, ) # ml 
estimation of nuisance models and computation of score elements @@ -1153,7 +1156,7 @@ def _initialize_arrays(self): self._all_se = np.full((n_thetas, n_rep), np.nan) def _initialize_predictions_and_targets(self): - self._predictions = {learner: np.full(self._score_dim, np.nan, dtype=object) for learner in self.predictions_names} + self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names} self._nuisance_targets = { learner: np.full(self._score_dim, np.nan, dtype=object) for learner in self.predictions_names } diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index a0405032e..99ac77e08 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -112,7 +112,10 @@ def __init__( ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} - self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M", "ml_M_inner", "ml_a_inner"] + # replace aggregated inner names with per-inner-fold names + inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds_inner)] + inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds_inner)] + self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True) @@ -232,9 +235,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa a_external = external_predictions["ml_a"] is not None if M_external: - if "ml_M_inner" not in external_predictions.keys(): - raise ValueError("When providing external predictions for ml_M, also inner predictions have to be provided.") - M_hat_inner = np.squeeze(np.array(external_predictions["ml_M_inner"].tolist())).T + # expect per-inner-fold keys ml_M_inner_i + missing = [i for i in range(self.n_folds_inner) if f"ml_M_inner_{i}" not in external_predictions.keys()] + if len(missing) > 0: + 
raise ValueError( + "When providing external predictions for ml_M, also inner predictions for all inner folds " + f"have to be provided (missing: {', '.join([str(i) for i in missing])})." + ) + M_hat_inner = [external_predictions[f"ml_M_inner_{i}"] for i in range(self.n_folds_inner)] M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None} else: M_hat = self._double_dml_cv_predict( @@ -302,9 +310,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa ) if a_external: - if "ml_a_inner" not in external_predictions.keys(): - raise ValueError("When providing external predictions for ml_M, also inner predictions have to be provided.") - a_hat_inner = np.squeeze(np.array(external_predictions["ml_a_inner"].tolist())).T + # expect per-inner-fold keys ml_a_inner_i + missing = [i for i in range(self.n_folds_inner) if f"ml_a_inner_{i}" not in external_predictions.keys()] + if len(missing) > 0: + raise ValueError( + "When providing external predictions for ml_a, also inner predictions for all inner folds " + f"have to be provided (missing: {', '.join([str(i) for i in missing])})." 
+ ) + a_hat_inner = [external_predictions[f"ml_a_inner_{i}"] for i in range(self.n_folds_inner)] a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None} else: a_hat = self._double_dml_cv_predict( @@ -358,8 +371,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa "ml_a": a_hat["preds"], "ml_t": t_hat["preds"], "ml_M": M_hat["preds"], - "ml_M_inner": np.moveaxis(M_hat["preds_inner"], 0, -1).tolist(), - "ml_a_inner": np.moveaxis(a_hat["preds_inner"], 0, -1).tolist(), + # store inner predictions as separate keys per inner fold + # ml_M inner + **{f"ml_M_inner_{i}": M_hat["preds_inner"][i] for i in range(len(M_hat["preds_inner"]))}, + # ml_a inner + **{f"ml_a_inner_{i}": a_hat["preds_inner"][i] for i in range(len(a_hat["preds_inner"]))}, }, "targets": { "ml_r": None, @@ -367,8 +383,27 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa "ml_a": a_hat["targets"], "ml_t": t_hat["targets"], "ml_M": M_hat["targets"], - "ml_M_inner": np.moveaxis(M_hat["targets_inner"], 0, -1).tolist() if not M_external else None, - "ml_a_inner": np.moveaxis(a_hat["targets_inner"], 0, -1).tolist() if not a_external else None, + # store inner targets as separate keys per inner fold (None if external) + **( + { + f"ml_M_inner_{i}": ( + M_hat.get("targets_inner")[i] + if M_hat.get("targets_inner") is not None and i < len(M_hat["targets_inner"]) + else None + ) + for i in range(len(M_hat.get("preds_inner", []))) + } + ), + **( + { + f"ml_a_inner_{i}": ( + a_hat.get("targets_inner")[i] + if a_hat.get("targets_inner") is not None and i < len(a_hat["targets_inner"]) + else None + ) + for i in range(len(a_hat.get("preds_inner", []))) + } + ), }, "models": { "ml_r": None, diff --git a/doubleml/plm/tests/test_lplr_external_predictions.py b/doubleml/plm/tests/test_lplr_external_predictions.py index 670860386..5e9b66e87 100644 --- a/doubleml/plm/tests/test_lplr_external_predictions.py 
+++ b/doubleml/plm/tests/test_lplr_external_predictions.py @@ -56,7 +56,9 @@ def doubleml_lplr_fixture(lplr_score, n_rep, set_ml_m_ext, set_ml_t_ext, set_ml_ # prepare external predictions and dummy learners if set_ml_M_ext: ext_predictions["d"]["ml_M"] = dml_lplr.predictions["ml_M"][:, :, 0] - ext_predictions["d"]["ml_M_inner"] = dml_lplr.predictions["ml_M_inner"][:, :, 0] + # provide inner predictions per inner fold index + for i in range(dml_lplr.n_folds_inner): + ext_predictions["d"][f"ml_M_inner_{i}"] = dml_lplr.predictions[f"ml_M_inner_{i}"][:, :, 0] ml_M = DMLDummyClassifier() else: ml_M = LogisticRegression(max_iter=1000) @@ -71,7 +73,8 @@ def doubleml_lplr_fixture(lplr_score, n_rep, set_ml_m_ext, set_ml_t_ext, set_ml_ ext_predictions["d"]["ml_m"] = dml_lplr.predictions["ml_m"][:, :, 0] ml_m = DMLDummyRegressor() ext_predictions["d"]["ml_a"] = dml_lplr.predictions["ml_a"][:, :, 0] - ext_predictions["d"]["ml_a_inner"] = dml_lplr.predictions["ml_a_inner"][:, :, 0] + for i in range(dml_lplr.n_folds_inner): + ext_predictions["d"][f"ml_a_inner_{i}"] = dml_lplr.predictions[f"ml_a_inner_{i}"][:, :, 0] else: ml_m = LinearRegression() diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 1054feb35..aaf21ea34 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -366,11 +366,6 @@ def _set_external_predictions(external_predictions, learners, treatment, i_rep): ext_prediction_dict[learner] = external_predictions[treatment][learner][:, i_rep].astype(float) else: ext_prediction_dict[learner] = None - if f"{learner}_inner" in external_predictions[treatment].keys(): - if isinstance(external_predictions[treatment][f"{learner}_inner"], np.ndarray): - ext_prediction_dict[f"{learner}_inner"] = external_predictions[treatment][f"{learner}_inner"][:, i_rep] - else: - ext_prediction_dict[learner] = None else: ext_prediction_dict[learner] = None return ext_prediction_dict From 0472f1cfe8d6fc74587dd02f0c3d3d98c2aeeb78 Mon Sep 17 
00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 07:41:10 -0800 Subject: [PATCH 30/48] Change to targets data type --- doubleml/double_ml.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 899bad4c0..33e5e75e6 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1157,9 +1157,7 @@ def _initialize_arrays(self): def _initialize_predictions_and_targets(self): self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names} - self._nuisance_targets = { - learner: np.full(self._score_dim, np.nan, dtype=object) for learner in self.predictions_names - } + self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names} def _initialize_nuisance_loss(self): self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names} From 2fc1f538b8e921b04e1909e712c7e0a65aa42612 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 11:22:49 -0800 Subject: [PATCH 31/48] DoubleResamplin integrated into mixin, small changes --- doubleml/double_ml.py | 26 ++++++++++++++++- doubleml/double_ml_sampling_mixins.py | 25 +++++++++++++---- doubleml/plm/lplr.py | 40 ++++----------------------- doubleml/utils/_estimation.py | 2 +- 4 files changed, 51 insertions(+), 42 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 33e5e75e6..9295e0930 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -22,7 +22,7 @@ class DoubleML(SampleSplittingMixin, ABC): """Double Machine Learning.""" - def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): + def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=False): # check and pick up obj_dml_data if not isinstance(obj_dml_data, DoubleMLBaseData): raise TypeError( @@ -108,6 +108,9 @@ def __init__(self, obj_dml_data, 
n_folds, n_rep, score, draw_sample_splitting): self._smpls = None self._smpls_cluster = None self._n_obs_sample_splitting = self.n_obs + self._double_sample_splitting = double_sample_splitting + if self._smpls_cluster is True: + self.__smpls__inner = None if draw_sample_splitting: self.draw_sample_splitting() self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) @@ -366,6 +369,21 @@ def smpls(self): raise ValueError(err_msg) return self._smpls + @property + def smpls_inner(self): + """ + The partition used for cross-fitting. + """ + if not self._double_sample_splitting: + raise ValueError("smpls_inner is only available for double sample splitting.") + if self._smpls is None: + err_msg = ( + "Sample splitting not specified. Either draw samples via .draw_sample splitting() " + + "or set external samples via .set_sample_splitting()." + ) + raise ValueError(err_msg) + return self._smpls + @property def smpls_cluster(self): """ @@ -514,6 +532,12 @@ def summary(self): def __smpls(self): return self._smpls[self._i_rep] + @property + def __smpls__inner(self): + if not self._smpls_inner[self._i_rep]: + raise ValueError("smpls_inner is only available for double sample splitting.") + return self._smpls_inner[self._i_rep] + @property def __smpls_cluster(self): return self._smpls_cluster[self._i_rep] diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py index d7d8b2e14..97ed5aa59 100644 --- a/doubleml/double_ml_sampling_mixins.py +++ b/doubleml/double_ml_sampling_mixins.py @@ -1,7 +1,7 @@ from abc import abstractmethod from doubleml.utils._checks import _check_sample_splitting -from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling +from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLDoubleResampling, DoubleMLResampling class SampleSplittingMixin: @@ -29,6 +29,8 @@ def draw_sample_splitting(self): self : object """ if self._is_cluster_data: + if 
self._double_sample_splitting: + raise ValueError("Cluster data not supported for double sample splitting.") obj_dml_resampling = DoubleMLClusterResampling( n_folds=self._n_folds_per_cluster, n_rep=self.n_rep, @@ -38,10 +40,20 @@ def draw_sample_splitting(self): ) self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() else: - obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata - ) - self._smpls = obj_dml_resampling.split_samples() + if self._double_sample_splitting: + obj_dml_resampling = DoubleMLDoubleResampling( + n_folds=self.n_folds, + n_folds_inner=self.n_folds_inner, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._strata, + ) + self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() + else: + obj_dml_resampling = DoubleMLResampling( + n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata + ) + self._smpls = obj_dml_resampling.split_samples() return self @@ -104,6 +116,9 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): >>> dml_plr_obj.set_sample_splitting(smpls) # doctest: +ELLIPSIS """ + if self._double_sample_splitting: + raise ValueError("set_sample_splitting not supported for double sample splitting.") + self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting ) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 99ac77e08..d390067f8 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -15,7 +15,6 @@ _dml_cv_predict, _dml_tune, ) -from doubleml.utils.resampling import DoubleMLDoubleResampling class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): @@ -91,14 +90,11 @@ def __init__( score="nuisance_space", draw_sample_splitting=True, error_on_convergence_failure=False, + double_sample_splitting=True, ): 
self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) - # Ensure outcome only contains 0 and 1 (validate early in constructor) - if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): - raise TypeError("The outcome variable y must be binary with values 0 and 1.") - self._error_on_convergence_failure = error_on_convergence_failure self._coef_bounds = (-1e-2, 1e2) self._coef_start_val = 1.0 @@ -167,11 +163,15 @@ def __init__( self._initialize_ml_nuisance_params() self._external_predictions_implemented = True + self._sensitivity_implemented = False def _initialize_ml_nuisance_params(self): self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} def _check_data(self, obj_dml_data): + # Ensure outcome only contains 0 and 1 (validate early in constructor) + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + raise TypeError("The outcome variable y must be binary with values 0 and 1.") if not isinstance(obj_dml_data, DoubleMLData): raise TypeError( f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." @@ -562,36 +562,6 @@ def _nuisance_tuning( return res - @property - def __smpls__inner(self): - return self._smpls_inner[self._i_rep] - - def draw_sample_splitting(self): - """ - Draw sample splitting for DoubleML models. - - The samples are drawn according to the attributes - ``n_folds`` and ``n_rep``. 
- - Returns - ------- - self : object - """ - - obj_dml_resampling = DoubleMLDoubleResampling( - n_folds=self.n_folds, - n_folds_inner=self.n_folds_inner, - n_rep=self.n_rep, - n_obs=self._dml_data.n_obs, - stratify=self._strata, - ) - self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() - - return self - - def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): - raise NotImplementedError("set_sample_splitting is not implemented for DoubleMLLPLR.") - def _compute_score(self, psi_elements, coef): if self.score == "nuisance_space": score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index aaf21ea34..8dc631bcf 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -331,7 +331,7 @@ def _var_est(psi, psi_deriv, smpls, is_cluster_data, cluster_vars=None, smpls_cl J_l = test_cluster_inds[1] const = np.divide(min(len(I_k), len(J_l)), (np.square(len(I_k) * len(J_l)))) for cluster_value in I_k: - ind_cluster = (first_cluster_var == cluster_value) & np.in1d(second_cluster_var, J_l) + ind_cluster = (first_cluster_var == cluster_value) & np.isin(second_cluster_var, J_l) gamma_hat += const * np.sum(np.outer(psi[ind_cluster], psi[ind_cluster])) for cluster_value in J_l: ind_cluster = (second_cluster_var == cluster_value) & np.isin(first_cluster_var, I_k) From ecfe2c7b004930259495bcd94d1ae3d1fe15b768 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 12:02:40 -0800 Subject: [PATCH 32/48] Added attribute to sample mixin --- doubleml/double_ml.py | 4 ++-- doubleml/double_ml_sampling_mixins.py | 2 ++ doubleml/plm/lplr.py | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 9295e0930..6032fd789 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -109,8 +109,8 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, 
draw_sample_splitting, d self._smpls_cluster = None self._n_obs_sample_splitting = self.n_obs self._double_sample_splitting = double_sample_splitting - if self._smpls_cluster is True: - self.__smpls__inner = None + if self._double_sample_splitting: + self._smpls_inner = None if draw_sample_splitting: self.draw_sample_splitting() self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py index 97ed5aa59..2f63d88e2 100644 --- a/doubleml/double_ml_sampling_mixins.py +++ b/doubleml/double_ml_sampling_mixins.py @@ -17,6 +17,8 @@ class SampleSplittingMixin: `sample splitting `_ in the DoubleML user guide. """ + _double_sample_splitting = False + def draw_sample_splitting(self): """ Draw sample splitting for DoubleML models. diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index d390067f8..c9580b840 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -90,10 +90,9 @@ def __init__( score="nuisance_space", draw_sample_splitting=True, error_on_convergence_failure=False, - double_sample_splitting=True, ): self.n_folds_inner = n_folds_inner - super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=True) self._error_on_convergence_failure = error_on_convergence_failure self._coef_bounds = (-1e-2, 1e2) From a9c0debb509f42cab4d27663a7ec50c8da4d5e0b Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 13:44:35 -0800 Subject: [PATCH 33/48] Smpls inner access adjusted --- doubleml/double_ml.py | 4 ++-- doubleml/plm/lplr.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 6032fd789..19811cb87 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -376,13 +376,13 @@ def smpls_inner(self): """ if not self._double_sample_splitting: raise 
ValueError("smpls_inner is only available for double sample splitting.") - if self._smpls is None: + if self._smpls_inner is None: err_msg = ( "Sample splitting not specified. Either draw samples via .draw_sample splitting() " + "or set external samples via .set_sample_splitting()." ) raise ValueError(err_msg) - return self._smpls + return self._smpls_inner @property def smpls_cluster(self): diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index c9580b840..42336a619 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -250,7 +250,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa x_d_concat, y, smpls=smpls, - smpls_inner=self.__smpls__inner, + smpls_inner=self._DoubleML__smpls__inner, n_jobs=n_jobs_cv, est_params=self._get_params("ml_M"), method=self._predict_method["ml_M"], @@ -325,7 +325,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa x, d, smpls=smpls, - smpls_inner=self.__smpls__inner, + smpls_inner=self._DoubleML__smpls__inner, n_jobs=n_jobs_cv, est_params=self._get_params("ml_a"), method=self._predict_method["ml_a"], @@ -518,7 +518,7 @@ def _nuisance_tuning( x_d_concat, y, smpls=smpls, - smpls_inner=self.__smpls__inner, + smpls_inner=self._DoubleML__smpls__inner, n_jobs=n_jobs_cv, est_params=M_best_params, method=self._predict_method["ml_M"], From 6abff491210391cb4b6841644cc31edf40c49978 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 17:21:12 -0800 Subject: [PATCH 34/48] Docstring, complexity reduction --- doubleml/double_ml.py | 16 ++++------------ doubleml/plm/lplr.py | 6 +++--- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 19811cb87..bdfabad18 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -34,18 +34,10 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, d if obj_dml_data.n_cluster_vars > 2: raise NotImplementedError("Multi-way 
(n_ways > 2) clustering not yet implemented.") self._is_cluster_data = True - self._is_panel_data = False - if isinstance(obj_dml_data, DoubleMLPanelData): - self._is_panel_data = True - self._is_did_data = False - if isinstance(obj_dml_data, DoubleMLDIDData): - self._is_did_data = True - self._is_ssm_data = False - if isinstance(obj_dml_data, DoubleMLSSMData): - self._is_ssm_data = True - self._is_rdd_data = False - if isinstance(obj_dml_data, DoubleMLRDDData): - self._is_rdd_data = True + self._is_panel_data = isinstance(obj_dml_data, DoubleMLPanelData) + self._is_did_data = isinstance(obj_dml_data, DoubleMLDIDData) + self._is_ssm_data = isinstance(obj_dml_data, DoubleMLSSMData) + self._is_rdd_data = isinstance(obj_dml_data, DoubleMLRDDData) self._dml_data = obj_dml_data self._n_obs = self._dml_data.n_obs diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 42336a619..9e0ed4060 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -60,10 +60,10 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20) - >>> dml_lplr_obj = dml.DoubleMLPLR(obj_dml_data, ml_M, ml_t, ml_m) + >>> dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) >>> dml_lplr_obj.fit().summary - coef std err t P>|t| 2.5 % 97.5 % - d 0.480691 0.040533 11.859129 1.929729e-32 0.401247 0.560135 + coef std err t P>|t| 2.5 % 97.5 % + d 0.661166 0.172672 3.829038 0.000129 0.322736 0.999596 Notes ----- From 0f08e370b36a22062b762763c58203da6db99a56 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 18:42:31 -0800 Subject: [PATCH 35/48] Weights updated, seed corrected --- doubleml/plm/lplr.py | 48 ++++++++++++---------------- doubleml/utils/_estimation.py | 60 +++++++++++++++++++++++++---------- 2 
files changed, 63 insertions(+), 45 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 9e0ed4060..701a7fcd3 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -55,7 +55,7 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): >>> from doubleml.plm.datasets import make_lplr_LZZ2020 >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> from sklearn.base import clone - >>> np.random.seed(3141) + >>> np.random.seed(42) >>> ml_t = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) @@ -95,7 +95,6 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=True) self._error_on_convergence_failure = error_on_convergence_failure - self._coef_bounds = (-1e-2, 1e2) self._coef_start_val = 1.0 self._check_data(self._dml_data) @@ -207,7 +206,7 @@ def _double_dml_cv_predict( est_params=est_params, method=method, return_models=True, - smpls_is_partition=True, + smpls_is_partition_manual_set=True, sample_weights=sample_weights, ) _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) @@ -261,36 +260,26 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} else: if self.score == "instrument": - weights = [] - for i, (train, test) in enumerate(smpls): - weights.append(M_hat["preds_inner"][i][train] * (1 - M_hat["preds_inner"][i][train])) - m_hat = _dml_cv_predict( - self._learner["ml_m"], - x, - d, - smpls=smpls, - n_jobs=n_jobs_cv, - est_params=self._get_params("ml_m"), - method=self._predict_method["ml_m"], - return_models=return_models, - sample_weights=weights, - ) - + 
weights = M_hat["preds"] * (1 - M_hat["preds"]) + filtered_smpls = smpls elif self.score == "nuisance_space": filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] filtered_smpls.append((train_filtered, test)) - m_hat = _dml_cv_predict( - self._learner["ml_m"], - x, - d, - smpls=filtered_smpls, - n_jobs=n_jobs_cv, - est_params=self._get_params("ml_m"), - method=self._predict_method["ml_m"], - return_models=return_models, - ) + weights = None + + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + sample_weights=weights, + ) _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) @@ -342,6 +331,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa d_tilde = (d - a_hat["preds_inner"][i])[train] beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde**2) + # Use preliminary beta estimates as starting value for root finding + self._coef_start_val = np.average(beta) + # nuisance t if t_external: t_hat = {"preds": external_predictions["ml_t"], "targets": None, "models": None} diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 8dc631bcf..aff0729ec 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -38,8 +38,11 @@ def _get_cond_smpls_2d(smpls, bin_var1, bin_var2): return smpls_00, smpls_01, smpls_10, smpls_11 -def _fit(estimator, x, y, train_index, idx=None): - estimator.fit(x[train_index, :], y[train_index]) +def _fit(estimator, x, y, train_index, idx=None, sample_weights=None): + if sample_weights is not None: + estimator.fit(x[train_index, :], y[train_index], sample_weights=sample_weights[train_index]) + else: + estimator.fit(x[train_index, :], y[train_index]) return estimator, idx @@ -53,36 +56,50 @@ def _dml_cv_predict( method="predict", return_train_preds=False, 
return_models=False, - smpls_is_partition=None, + smpls_is_partition_manual_set=None, sample_weights=None, ): n_obs = x.shape[0] - # TODO: Better name for smples_is_partition - if smpls_is_partition is None: + if smpls_is_partition_manual_set is None: smpls_is_partition = _check_is_partition(smpls, n_obs) + else: + smpls_is_partition = smpls_is_partition_manual_set fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) manual_cv_predict = ( - (not smpls_is_partition) - | return_train_preds - | fold_specific_params - | fold_specific_target - | return_models - | bool(sample_weights) + (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target | return_models ) - # TODO: Check if cross_val_predict supports weights res = {"models": None} if not manual_cv_predict: + # prepare fit_params for cross_val_predict + fit_params_for_cv = {"sample_weight": sample_weights} if sample_weights is not None else None + if est_params is None: # if there are no parameters set we redirect to the standard method - preds = cross_val_predict(clone(estimator), x, y, cv=smpls, n_jobs=n_jobs, method=method) + preds = cross_val_predict( + clone(estimator), + x, + y, + cv=smpls, + n_jobs=n_jobs, + method=method, + params=fit_params_for_cv, + ) else: assert isinstance(est_params, dict) # if no fold-specific parameters we redirect to the standard method # warnings.warn("Using the same (hyper-)parameters for all folds") - preds = cross_val_predict(clone(estimator).set_params(**est_params), x, y, cv=smpls, n_jobs=n_jobs, method=method) + preds = cross_val_predict( + clone(estimator).set_params(**est_params), + x, + y, + cv=smpls, + n_jobs=n_jobs, + method=method, + params=fit_params_for_cv, + ) if method == "predict_proba": res["preds"] = preds[:, 1] else: @@ -113,19 +130,28 @@ def _dml_cv_predict( if est_params is None: fitted_models = parallel( - delayed(_fit)(clone(estimator), x, y_list[idx], 
train_index, idx) + delayed(_fit)(clone(estimator), x, y_list[idx], train_index, idx, sample_weights=sample_weights) for idx, (train_index, test_index) in enumerate(smpls) ) elif isinstance(est_params, dict): # warnings.warn("Using the same (hyper-)parameters for all folds") fitted_models = parallel( - delayed(_fit)(clone(estimator).set_params(**est_params), x, y_list[idx], train_index, idx) + delayed(_fit)( + clone(estimator).set_params(**est_params), x, y_list[idx], train_index, idx, sample_weights=sample_weights + ) for idx, (train_index, test_index) in enumerate(smpls) ) else: assert len(est_params) == len(smpls), "provide one parameter setting per fold" fitted_models = parallel( - delayed(_fit)(clone(estimator).set_params(**est_params[idx]), x, y_list[idx], train_index, idx) + delayed(_fit)( + clone(estimator).set_params(**est_params[idx]), + x, + y_list[idx], + train_index, + idx, + sample_weights=sample_weights, + ) for idx, (train_index, test_index) in enumerate(smpls) ) From 430f4a6abc48853d5704676805a229722f21c0ed Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 20:37:31 -0800 Subject: [PATCH 36/48] Fix --- doubleml/utils/_estimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index aff0729ec..d4e8abc4d 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -40,7 +40,7 @@ def _get_cond_smpls_2d(smpls, bin_var1, bin_var2): def _fit(estimator, x, y, train_index, idx=None, sample_weights=None): if sample_weights is not None: - estimator.fit(x[train_index, :], y[train_index], sample_weights=sample_weights[train_index]) + estimator.fit(x[train_index, :], y[train_index], sample_weight=sample_weights[train_index]) else: estimator.fit(x[train_index, :], y[train_index]) return estimator, idx From 5b92395b47ce93ccf5271b63ce55cc84aa820067 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 20:50:44 -0800 Subject: [PATCH 
37/48] Renaming --- doubleml/plm/lplr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 701a7fcd3..7b47f5665 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -273,7 +273,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa self._learner["ml_m"], x, d, - smpls=smpls, + smpls=filtered_smpls, n_jobs=n_jobs_cv, est_params=self._get_params("ml_m"), method=self._predict_method["ml_m"], From 042aa26e114e73852d0ad9a673a6c03b8c8d5d0b Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 10 Nov 2025 22:03:02 -0800 Subject: [PATCH 38/48] Doctest --- doubleml/plm/lplr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 7b47f5665..67f907a40 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -61,7 +61,7 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20) >>> dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) - >>> dml_lplr_obj.fit().summary + >>> dml_lplr_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.661166 0.172672 3.829038 0.000129 0.322736 0.999596 From 3b6f3b732dfacd3dc8ff3a8d4d8995e6b3f3aad4 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 12 Nov 2025 12:14:41 -0800 Subject: [PATCH 39/48] Test updated and comments implemented --- doubleml/double_ml.py | 2 +- doubleml/irm/tests/test_datasets.py | 156 ++++++++++ doubleml/plm/lplr.py | 18 +- doubleml/plm/tests/test_datasets.py | 151 +++++++++ doubleml/plm/tests/test_lplr_exceptions.py | 38 ++- .../tests/test_lplr_external_predictions.py | 57 +++- doubleml/tests/test_datasets.py | 294 ------------------ doubleml/tests/test_exceptions.py | 28 +- doubleml/tests/test_set_sample_splitting.py | 8 + 9 files changed, 445 
insertions(+), 307 deletions(-) create mode 100644 doubleml/irm/tests/test_datasets.py create mode 100644 doubleml/plm/tests/test_datasets.py diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index bdfabad18..638cb31ec 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -526,7 +526,7 @@ def __smpls(self): @property def __smpls__inner(self): - if not self._smpls_inner[self._i_rep]: + if not self.smpls_inner[self._i_rep]: raise ValueError("smpls_inner is only available for double sample splitting.") return self._smpls_inner[self._i_rep] diff --git a/doubleml/irm/tests/test_datasets.py b/doubleml/irm/tests/test_datasets.py new file mode 100644 index 000000000..79bf67940 --- /dev/null +++ b/doubleml/irm/tests/test_datasets.py @@ -0,0 +1,156 @@ +import numpy as np +import pandas as pd +import pytest + +from doubleml import DoubleMLData +from doubleml.irm.datasets import ( + make_confounded_irm_data, + make_heterogeneous_data, + make_iivm_data, + make_irm_data, + make_irm_data_discrete_treatments, + make_ssm_data, +) + +msg_inv_return_type = "Invalid return_type." 
+ + +@pytest.mark.ci +def test_make_irm_data_return_types(): + np.random.seed(3141) + res = make_irm_data(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_irm_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d = make_irm_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_irm_data(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_iivm_data_return_types(): + np.random.seed(3141) + res = make_iivm_data(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_iivm_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_iivm_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_iivm_data(n_obs=100, return_type="matrix") + + +@pytest.fixture(scope="function", params=[True, False]) +def linear(request): + return request.param + + +@pytest.mark.ci +def test_make_confounded_irm_data_return_types(linear): + np.random.seed(3141) + res = make_confounded_irm_data(linear=linear) + assert isinstance(res, dict) + assert isinstance(res["x"], np.ndarray) + assert isinstance(res["y"], np.ndarray) + assert isinstance(res["d"], np.ndarray) + + assert isinstance(res["oracle_values"], dict) + assert isinstance(res["oracle_values"]["g_long"], np.ndarray) + assert isinstance(res["oracle_values"]["g_short"], np.ndarray) + assert isinstance(res["oracle_values"]["m_long"], np.ndarray) + assert isinstance(res["oracle_values"]["m_short"], np.ndarray) + assert isinstance(res["oracle_values"]["gamma_a"], float) + assert isinstance(res["oracle_values"]["beta_a"], float) + 
assert isinstance(res["oracle_values"]["a"], np.ndarray) + assert isinstance(res["oracle_values"]["y_0"], np.ndarray) + assert isinstance(res["oracle_values"]["y_1"], np.ndarray) + assert isinstance(res["oracle_values"]["z"], np.ndarray) + assert isinstance(res["oracle_values"]["cf_y"], float) + assert isinstance(res["oracle_values"]["cf_d_ate"], float) + assert isinstance(res["oracle_values"]["cf_d_atte"], float) + assert isinstance(res["oracle_values"]["rho_ate"], float) + assert isinstance(res["oracle_values"]["rho_atte"], float) + + +@pytest.fixture(scope="function", params=[False, True]) +def binary_treatment(request): + return request.param + + +@pytest.fixture(scope="function", params=[1, 2]) +def n_x(request): + return request.param + + +@pytest.mark.ci +def test_make_heterogeneous_data_return_types(binary_treatment, n_x): + np.random.seed(3141) + res = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=binary_treatment) + assert isinstance(res, dict) + assert isinstance(res["data"], pd.DataFrame) + assert isinstance(res["effects"], np.ndarray) + assert callable(res["treatment_effect"]) + + # test input checks + msg = "n_x must be either 1 or 2." + with pytest.raises(AssertionError, match=msg): + _ = make_heterogeneous_data(n_obs=100, n_x=0, binary_treatment=binary_treatment) + msg = "support_size must be smaller than p." + with pytest.raises(AssertionError, match=msg): + _ = make_heterogeneous_data(n_obs=100, n_x=n_x, support_size=31, binary_treatment=binary_treatment) + msg = "binary_treatment must be a boolean." 
+ with pytest.raises(AssertionError, match=msg): + _ = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=2) + + +@pytest.mark.ci +def test_make_ssm_data_return_types(): + np.random.seed(3141) + res = make_ssm_data(n_obs=100) + assert isinstance(res, DoubleMLData) + res = make_ssm_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z, s = make_ssm_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + assert isinstance(s, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_ssm_data(n_obs=100, return_type="matrix") + + +@pytest.fixture(scope="function", params=[3, 5]) +def n_levels(request): + return request.param + + +def test_make_data_discrete_treatments(n_levels): + np.random.seed(3141) + n = 100 + data_apo = make_irm_data_discrete_treatments(n_obs=n, n_levels=3) + assert isinstance(data_apo, dict) + assert isinstance(data_apo["y"], np.ndarray) + assert isinstance(data_apo["d"], np.ndarray) + assert isinstance(data_apo["x"], np.ndarray) + assert isinstance(data_apo["oracle_values"], dict) + + assert isinstance(data_apo["oracle_values"]["cont_d"], np.ndarray) + assert isinstance(data_apo["oracle_values"]["level_bounds"], np.ndarray) + assert isinstance(data_apo["oracle_values"]["potential_level"], np.ndarray) + assert isinstance(data_apo["oracle_values"]["ite"], np.ndarray) + assert isinstance(data_apo["oracle_values"]["y0"], np.ndarray) + + msg = "n_levels must be at least 2." + with pytest.raises(ValueError, match=msg): + _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1) + + msg = "n_levels must be an integer." 
+ with pytest.raises(ValueError, match=msg): + _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1.1) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 67f907a40..e7ff1c35f 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -41,7 +41,7 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): Number of inner folds for nested resampling used internally. n_rep : int, default=1 Number of repetitions for sample splitting. - score : {'nuisance_space', 'instrument'} or callable, default='nuisance_space' + score : {'nuisance_space', 'instrument'}, default='nuisance_space' Score to use. 'nuisance_space' estimates m on subsamples with y=0; 'instrument' uses an instrument-type score. draw_sample_splitting : bool, default=True Whether to draw sample splitting during initialization. @@ -95,7 +95,6 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=True) self._error_on_convergence_failure = error_on_convergence_failure - self._coef_start_val = 1.0 self._check_data(self._dml_data) valid_scores = ["nuisance_space", "instrument"] @@ -167,9 +166,6 @@ def _initialize_ml_nuisance_params(self): self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} def _check_data(self, obj_dml_data): - # Ensure outcome only contains 0 and 1 (validate early in constructor) - if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): - raise TypeError("The outcome variable y must be binary with values 0 and 1.") if not isinstance(obj_dml_data, DoubleMLData): raise TypeError( f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." 
@@ -234,7 +230,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa if M_external: # expect per-inner-fold keys ml_M_inner_i - missing = [i for i in range(self.n_folds_inner) if f"ml_M_inner_{i}" not in external_predictions.keys()] + missing = [ + i + for i in range(self.n_folds_inner) + if f"ml_M_inner_{i}" not in external_predictions.keys() or external_predictions[f"ml_M_inner_{i}"] is None + ] if len(missing) > 0: raise ValueError( "When providing external predictions for ml_M, also inner predictions for all inner folds " @@ -299,7 +299,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa if a_external: # expect per-inner-fold keys ml_a_inner_i - missing = [i for i in range(self.n_folds_inner) if f"ml_a_inner_{i}" not in external_predictions.keys()] + missing = [ + i + for i in range(self.n_folds_inner) + if f"ml_a_inner_{i}" not in external_predictions.keys() or external_predictions[f"ml_a_inner_{i}"] is None + ] if len(missing) > 0: raise ValueError( "When providing external predictions for ml_a, also inner predictions for all inner folds " diff --git a/doubleml/plm/tests/test_datasets.py b/doubleml/plm/tests/test_datasets.py new file mode 100644 index 000000000..5e16b9acf --- /dev/null +++ b/doubleml/plm/tests/test_datasets.py @@ -0,0 +1,151 @@ +import numpy as np +import pandas as pd +import pytest + +from doubleml import DoubleMLData +from doubleml.plm.datasets import ( + _make_pliv_data, + make_confounded_plr_data, + make_lplr_LZZ2020, + make_pliv_CHS2015, + make_pliv_multiway_cluster_CKMS2021, + make_plr_CCDDHNR2018, + make_plr_turrell2018, +) + +msg_inv_return_type = "Invalid return_type." 
+ + +@pytest.mark.ci +def test_make_plr_CCDDHNR2018_return_types(): + np.random.seed(3141) + res = make_plr_CCDDHNR2018(n_obs=100, return_type=DoubleMLData) + assert isinstance(res, DoubleMLData) + res = make_plr_CCDDHNR2018(n_obs=100, return_type=pd.DataFrame) + assert isinstance(res, pd.DataFrame) + x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type=np.ndarray) + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_plr_CCDDHNR2018(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_plr_turrell2018_return_types(): + np.random.seed(3141) + res = make_plr_turrell2018(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_plr_turrell2018(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d = make_plr_turrell2018(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_plr_turrell2018(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_confounded_plr_data_return_types(): + np.random.seed(3141) + res = make_confounded_plr_data(theta=5.0) + assert isinstance(res, dict) + assert isinstance(res["x"], np.ndarray) + assert isinstance(res["y"], np.ndarray) + assert isinstance(res["d"], np.ndarray) + + assert isinstance(res["oracle_values"], dict) + assert isinstance(res["oracle_values"]["g_long"], np.ndarray) + assert isinstance(res["oracle_values"]["g_short"], np.ndarray) + assert isinstance(res["oracle_values"]["m_long"], np.ndarray) + assert isinstance(res["oracle_values"]["m_short"], np.ndarray) + assert isinstance(res["oracle_values"]["theta"], float) + assert isinstance(res["oracle_values"]["gamma_a"], float) + assert isinstance(res["oracle_values"]["beta_a"], float) + assert 
isinstance(res["oracle_values"]["a"], np.ndarray) + assert isinstance(res["oracle_values"]["z"], np.ndarray) + + +@pytest.mark.ci +def test_make_pliv_data_return_types(): + np.random.seed(3141) + res = _make_pliv_data(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = _make_pliv_data(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = _make_pliv_data(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = _make_pliv_data(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_pliv_CHS2015_return_types(): + np.random.seed(3141) + res = make_pliv_CHS2015(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_pliv_CHS2015(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_pliv_CHS2015(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_pliv_CHS2015(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_pliv_multiway_cluster_CKMS2021_return_types(): + np.random.seed(3141) + res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(cluster_vars, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, 
match=msg_inv_return_type): + _ = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="matrix") + + +@pytest.mark.ci +def test_make_lplr_LZZ2020_return_types(): + np.random.seed(3141) + res = make_lplr_LZZ2020(n_obs=100, return_type="DoubleMLData") + assert isinstance(res, DoubleMLData) + res = make_lplr_LZZ2020(n_obs=100, return_type="DataFrame") + assert isinstance(res, pd.DataFrame) + x, y, d, z = make_lplr_LZZ2020(n_obs=100, return_type="array") + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + assert isinstance(z, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_lplr_LZZ2020(n_obs=100, return_type="matrix") + + +@pytest.mark.ci +def test_make_lplr_LZZ2020_variants(): + np.random.seed(3141) + res = make_lplr_LZZ2020(n_obs=100, treatment="binary") + assert np.array_equal(np.unique(res.d), np.array([0, 1])) + res = make_lplr_LZZ2020(n_obs=100, treatment="binary_unbalanced") + assert np.array_equal(np.unique(res.d), np.array([0, 1])) + res = make_lplr_LZZ2020(n_obs=100, treatment="continuous") + assert len(np.unique(res.d)) == 100 + + msg = "Invalid treatment type." 
+ with pytest.raises(ValueError, match=msg): + _ = make_lplr_LZZ2020(n_obs=100, treatment="colors") + + res = make_lplr_LZZ2020(n_obs=100, balanced_r0=False) + _, y_unique = np.unique(res.y, return_counts=True) + assert np.abs(y_unique[0] - y_unique[1]) > 10 diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 03cb7158a..f01cd8855 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -4,6 +4,7 @@ from sklearn.base import BaseEstimator from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import Lasso, LogisticRegression +from sklearn.semi_supervised import LabelSpreading from doubleml import DoubleMLLPLR from doubleml.plm.datasets import make_lplr_LZZ2020 @@ -45,7 +46,7 @@ def test_lplr_exception_scores(): @pytest.mark.ci -def test_ssm_exception_resampling(): +def test_lplr_exception_resampling(): msg = "The number of folds must be of int type. 1.5 of type was passed." with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=1.5) @@ -208,10 +209,22 @@ def test_lplr_exception_learner(): with pytest.raises(ValueError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, LogisticRegression()) + # ml_m may not be a classifier when treatment is not binary + msg = ( + r"The ml_a learner LogisticRegression\(\) was identified as classifier " + r"but at least one treatment variable is not binary with values 0 and 1\." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, ml_a=LogisticRegression()) + + # ml_m may not be a classifier when treatment is not binary + dml_data_binary = make_lplr_LZZ2020(treatment="binary") + msg = 'Learner "ml_a" who supports sample_weight is required for score type "instrument"' + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_m, ml_a=LabelSpreading(), score="instrument") + # construct a classifier which is not identifiable as classifier via is_classifier by sklearn log_reg = LogisticRegressionManipulatedType() - # TODO(0.11) can be removed if the sklearn dependency is bumped to 1.6.0 - log_reg._estimator_type = None msg = ( r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) " r"neither a regressor nor a classifier. Method predict is used for prediction\." @@ -262,6 +275,13 @@ def predict(self, X): return preds +# Classifier that returns hard labels (0/1) via predict_proba to trigger the binary-predictions error +class HardLabelPredictProba(LogisticRegression): + def predict_proba(self, X): + labels = super().predict(X).astype(int) + return np.column_stack((1 - labels, labels)) + + @pytest.mark.ci def test_lplr_nan_prediction(): msg = r"Predictions from learner LassoWithNanPred\(\) for ml_t are not finite." @@ -304,3 +324,15 @@ def eval_fct(y_pred, y_true): with pytest.raises(ValueError): dml_lplr_obj.evaluate_learners(metric=eval_fct) + + +@pytest.mark.ci +def test_lplr_exception_binary_predictions_from_classifier(): + # Expect error because ml_m returns binary labels instead of probabilities for a binary treatment + msg = ( + r"For the binary treatment variable d, predictions obtained with the ml_m learner " + r"HardLabelPredictProba\(\) are also observed to be binary with values 0 and 1\. " + r"Make sure that for classifiers probabilities and not labels are predicted\." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, HardLabelPredictProba()).fit() diff --git a/doubleml/plm/tests/test_lplr_external_predictions.py b/doubleml/plm/tests/test_lplr_external_predictions.py index 5e9b66e87..cc8546a8f 100644 --- a/doubleml/plm/tests/test_lplr_external_predictions.py +++ b/doubleml/plm/tests/test_lplr_external_predictions.py @@ -84,10 +84,65 @@ def doubleml_lplr_fixture(lplr_score, n_rep, set_ml_m_ext, set_ml_t_ext, set_ml_ np.random.seed(3141) dml_lplr_ext.fit(external_predictions=ext_predictions) - res_dict = {"coef_normal": dml_lplr.coef[0], "coef_ext": dml_lplr_ext.coef[0]} + res_dict = { + "coef_normal": dml_lplr.coef[0], + "coef_ext": dml_lplr_ext.coef[0], + "se_normal": dml_lplr.se[0], + "se_ext": dml_lplr_ext.se[0], + } return res_dict @pytest.mark.ci def test_doubleml_lplr_coef(doubleml_lplr_fixture): assert math.isclose(doubleml_lplr_fixture["coef_normal"], doubleml_lplr_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_doubleml_lplr_se(doubleml_lplr_fixture): + assert math.isclose(doubleml_lplr_fixture["se_normal"], doubleml_lplr_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_doubleml_lplr_exceptions(): + ext_predictions = {"d": {}} + + x, y, d, _ = make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type="np.array", treatment="continuous") + + np.random.seed(3141) + dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d) + + kwargs = {"obj_dml_data": dml_data} + + dml_lplr = DoubleMLLPLR(ml_M=LogisticRegression(max_iter=1000), ml_t=LinearRegression(), ml_m=LinearRegression(), **kwargs) + np.random.seed(3141) + dml_lplr.fit(store_predictions=True) + + # prepare external predictions and dummy learners + + ml_M = LogisticRegression(max_iter=1000) + ml_t = LinearRegression() + ml_m = LinearRegression() + + # build second model with external predictions + dml_lplr_ext = DoubleMLLPLR(ml_M=ml_M, ml_t=ml_t, ml_m=ml_m, **kwargs) 
+ + ext_predictions["d"]["ml_M"] = dml_lplr.predictions["ml_M"][:, :, 0] + # provide inner predictions per inner fold index + for i in range(dml_lplr.n_folds_inner - 1): + ext_predictions["d"][f"ml_M_inner_{i}"] = dml_lplr.predictions[f"ml_M_inner_{i}"][:, :, 0] + + msg = r"When providing external predictions for ml_M, also inner predictions for all inner folds" + with pytest.raises(ValueError, match=msg): + dml_lplr_ext.fit(external_predictions=ext_predictions) + + ext_predictions["d"][f"ml_M_inner_{dml_lplr.n_folds_inner-1}"] = (dml_lplr.predictions)[ + f"ml_M_inner_{dml_lplr.n_folds_inner-1}" + ][:, :, 0] + ext_predictions["d"]["ml_a"] = dml_lplr.predictions["ml_a"][:, :, 0] + for i in range(dml_lplr.n_folds_inner - 1): + ext_predictions["d"][f"ml_a_inner_{i}"] = dml_lplr.predictions[f"ml_a_inner_{i}"][:, :, 0] + + msg = r"When providing external predictions for ml_a, also inner predictions for all inner folds" + with pytest.raises(ValueError, match=msg): + dml_lplr_ext.fit(external_predictions=ext_predictions) diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index 05c75d00a..95b6ea53b 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -1,26 +1,8 @@ -import numpy as np import pandas as pd import pytest from doubleml import DoubleMLData from doubleml.datasets import fetch_401K, fetch_bonus -from doubleml.irm.datasets import ( - make_confounded_irm_data, - make_heterogeneous_data, - make_iivm_data, - make_irm_data, - make_irm_data_discrete_treatments, - make_ssm_data, -) -from doubleml.plm.datasets import ( - _make_pliv_data, - make_confounded_plr_data, - make_lplr_LZZ2020, - make_pliv_CHS2015, - make_pliv_multiway_cluster_CKMS2021, - make_plr_CCDDHNR2018, - make_plr_turrell2018, -) msg_inv_return_type = "Invalid return_type." 
@@ -54,279 +36,3 @@ def test_fetch_bonus_poly(): n_x = len(data_bonus_wo_poly.x_cols) data_bonus_w_poly = fetch_bonus(polynomial_features=True) assert len(data_bonus_w_poly.x_cols) == ((n_x + 1) * n_x / 2 + n_x) - - -@pytest.mark.ci -def test_make_plr_CCDDHNR2018_return_types(): - np.random.seed(3141) - res = make_plr_CCDDHNR2018(n_obs=100, return_type=DoubleMLData) - assert isinstance(res, DoubleMLData) - res = make_plr_CCDDHNR2018(n_obs=100, return_type=pd.DataFrame) - assert isinstance(res, pd.DataFrame) - x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type=np.ndarray) - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_plr_CCDDHNR2018(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_plr_turrell2018_return_types(): - np.random.seed(3141) - res = make_plr_turrell2018(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_plr_turrell2018(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d = make_plr_turrell2018(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_plr_turrell2018(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_irm_data_return_types(): - np.random.seed(3141) - res = make_irm_data(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_irm_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d = make_irm_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_irm_data(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def 
test_make_iivm_data_return_types(): - np.random.seed(3141) - res = make_iivm_data(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_iivm_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = make_iivm_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_iivm_data(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_pliv_data_return_types(): - np.random.seed(3141) - res = _make_pliv_data(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = _make_pliv_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = _make_pliv_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = _make_pliv_data(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_pliv_CHS2015_return_types(): - np.random.seed(3141) - res = make_pliv_CHS2015(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_pliv_CHS2015(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = make_pliv_CHS2015(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_pliv_CHS2015(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_pliv_multiway_cluster_CKMS2021_return_types(): - np.random.seed(3141) - res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLData") - assert 
isinstance(res, DoubleMLData) - res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(cluster_vars, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="matrix") - - -@pytest.fixture(scope="function", params=[True, False]) -def linear(request): - return request.param - - -@pytest.mark.ci -def test_make_confounded_irm_data_return_types(linear): - np.random.seed(3141) - res = make_confounded_irm_data(linear=linear) - assert isinstance(res, dict) - assert isinstance(res["x"], np.ndarray) - assert isinstance(res["y"], np.ndarray) - assert isinstance(res["d"], np.ndarray) - - assert isinstance(res["oracle_values"], dict) - assert isinstance(res["oracle_values"]["g_long"], np.ndarray) - assert isinstance(res["oracle_values"]["g_short"], np.ndarray) - assert isinstance(res["oracle_values"]["m_long"], np.ndarray) - assert isinstance(res["oracle_values"]["m_short"], np.ndarray) - assert isinstance(res["oracle_values"]["gamma_a"], float) - assert isinstance(res["oracle_values"]["beta_a"], float) - assert isinstance(res["oracle_values"]["a"], np.ndarray) - assert isinstance(res["oracle_values"]["y_0"], np.ndarray) - assert isinstance(res["oracle_values"]["y_1"], np.ndarray) - assert isinstance(res["oracle_values"]["z"], np.ndarray) - assert isinstance(res["oracle_values"]["cf_y"], float) - assert isinstance(res["oracle_values"]["cf_d_ate"], float) - assert isinstance(res["oracle_values"]["cf_d_atte"], float) - assert isinstance(res["oracle_values"]["rho_ate"], float) - assert isinstance(res["oracle_values"]["rho_atte"], float) - - -@pytest.mark.ci -def 
test_make_confounded_plr_data_return_types(): - np.random.seed(3141) - res = make_confounded_plr_data(theta=5.0) - assert isinstance(res, dict) - assert isinstance(res["x"], np.ndarray) - assert isinstance(res["y"], np.ndarray) - assert isinstance(res["d"], np.ndarray) - - assert isinstance(res["oracle_values"], dict) - assert isinstance(res["oracle_values"]["g_long"], np.ndarray) - assert isinstance(res["oracle_values"]["g_short"], np.ndarray) - assert isinstance(res["oracle_values"]["m_long"], np.ndarray) - assert isinstance(res["oracle_values"]["m_short"], np.ndarray) - assert isinstance(res["oracle_values"]["theta"], float) - assert isinstance(res["oracle_values"]["gamma_a"], float) - assert isinstance(res["oracle_values"]["beta_a"], float) - assert isinstance(res["oracle_values"]["a"], np.ndarray) - assert isinstance(res["oracle_values"]["z"], np.ndarray) - - -@pytest.fixture(scope="function", params=[False, True]) -def binary_treatment(request): - return request.param - - -@pytest.fixture(scope="function", params=[1, 2]) -def n_x(request): - return request.param - - -@pytest.mark.ci -def test_make_heterogeneous_data_return_types(binary_treatment, n_x): - np.random.seed(3141) - res = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=binary_treatment) - assert isinstance(res, dict) - assert isinstance(res["data"], pd.DataFrame) - assert isinstance(res["effects"], np.ndarray) - assert callable(res["treatment_effect"]) - - # test input checks - msg = "n_x must be either 1 or 2." - with pytest.raises(AssertionError, match=msg): - _ = make_heterogeneous_data(n_obs=100, n_x=0, binary_treatment=binary_treatment) - msg = "support_size must be smaller than p." - with pytest.raises(AssertionError, match=msg): - _ = make_heterogeneous_data(n_obs=100, n_x=n_x, support_size=31, binary_treatment=binary_treatment) - msg = "binary_treatment must be a boolean." 
- with pytest.raises(AssertionError, match=msg): - _ = make_heterogeneous_data(n_obs=100, n_x=n_x, binary_treatment=2) - - -@pytest.mark.ci -def test_make_ssm_data_return_types(): - np.random.seed(3141) - res = make_ssm_data(n_obs=100) - assert isinstance(res, DoubleMLData) - res = make_ssm_data(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z, s = make_ssm_data(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - assert isinstance(s, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_ssm_data(n_obs=100, return_type="matrix") - - -@pytest.fixture(scope="function", params=[3, 5]) -def n_levels(request): - return request.param - - -def test_make_data_discrete_treatments(n_levels): - np.random.seed(3141) - n = 100 - data_apo = make_irm_data_discrete_treatments(n_obs=n, n_levels=3) - assert isinstance(data_apo, dict) - assert isinstance(data_apo["y"], np.ndarray) - assert isinstance(data_apo["d"], np.ndarray) - assert isinstance(data_apo["x"], np.ndarray) - assert isinstance(data_apo["oracle_values"], dict) - - assert isinstance(data_apo["oracle_values"]["cont_d"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["level_bounds"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["potential_level"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["ite"], np.ndarray) - assert isinstance(data_apo["oracle_values"]["y0"], np.ndarray) - - msg = "n_levels must be at least 2." - with pytest.raises(ValueError, match=msg): - _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1) - - msg = "n_levels must be an integer." 
- with pytest.raises(ValueError, match=msg): - _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1.1) - - -@pytest.mark.ci -def test_make_lplr_LZZ2020_return_types(): - np.random.seed(3141) - res = make_lplr_LZZ2020(n_obs=100, return_type="DoubleMLData") - assert isinstance(res, DoubleMLData) - res = make_lplr_LZZ2020(n_obs=100, return_type="DataFrame") - assert isinstance(res, pd.DataFrame) - x, y, d, z = make_lplr_LZZ2020(n_obs=100, return_type="array") - assert isinstance(x, np.ndarray) - assert isinstance(y, np.ndarray) - assert isinstance(d, np.ndarray) - assert isinstance(z, np.ndarray) - with pytest.raises(ValueError, match=msg_inv_return_type): - _ = make_lplr_LZZ2020(n_obs=100, return_type="matrix") - - -@pytest.mark.ci -def test_make_lplr_LZZ2020_variants(): - np.random.seed(3141) - res = make_lplr_LZZ2020(n_obs=100, treatment="binary") - assert np.array_equal(np.unique(res.d), np.array([0, 1])) - res = make_lplr_LZZ2020(n_obs=100, treatment="binary_unbalanced") - assert np.array_equal(np.unique(res.d), np.array([0, 1])) - res = make_lplr_LZZ2020(n_obs=100, treatment="continuous") - assert len(np.unique(res.d)) == 100 - - msg = "Invalid treatment type." 
- with pytest.raises(ValueError, match=msg): - _ = make_lplr_LZZ2020(n_obs=100, treatment="colors") - - res = make_lplr_LZZ2020(n_obs=100, balanced_r0=False) - _, y_unique = np.unique(res.y, return_counts=True) - assert np.abs(y_unique[0] - y_unique[1]) > 10 diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index e725a562e..13a1fdc89 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -15,6 +15,7 @@ DoubleMLDIDData, DoubleMLIIVM, DoubleMLIRM, + DoubleMLLPLR, DoubleMLLPQ, DoubleMLPLIV, DoubleMLPLR, @@ -23,7 +24,12 @@ ) from doubleml.did.datasets import make_did_SZ2020 from doubleml.irm.datasets import make_iivm_data, make_irm_data -from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 +from doubleml.plm.datasets import ( + make_lplr_LZZ2020, + make_pliv_CHS2015, + make_pliv_multiway_cluster_CKMS2021, + make_plr_CCDDHNR2018, +) from ._utils import DummyDataClass @@ -796,6 +802,26 @@ def test_doubleml_exception_smpls(): _ = dml_pliv_cluster.set_sample_splitting(all_smpls=dml_pliv_cluster.smpls, all_smpls_cluster=all_smpls_cluster) +@pytest.mark.ci +def test_doubleml_exception_smpls_inner(): + dml_plr_no_inner = DoubleMLPLR(dml_data, ml_l, ml_m) + msg = "smpls_inner is only available for double sample splitting." + with pytest.raises(ValueError, match=msg): + _ = dml_plr_no_inner.smpls_inner + with pytest.raises(ValueError, match=msg): + _ = dml_plr_no_inner._DoubleML__smpls__inner + + dml_data_lplr = make_lplr_LZZ2020() + ml_M = LogisticRegression() + dml_plr_inner_no_smpls = DoubleMLLPLR(dml_data_lplr, ml_M, ml_m, ml_m, draw_sample_splitting=False) + msg = ( + "Sample splitting not specified. " + r"Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\)." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = dml_plr_inner_no_smpls.smpls_inner + + @pytest.mark.ci def test_doubleml_exception_fit(): msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." diff --git a/doubleml/tests/test_set_sample_splitting.py b/doubleml/tests/test_set_sample_splitting.py index fa0a43945..fd44289b2 100644 --- a/doubleml/tests/test_set_sample_splitting.py +++ b/doubleml/tests/test_set_sample_splitting.py @@ -276,3 +276,11 @@ def test_doubleml_set_sample_splitting_shuffled_indices(): # Since predictions are stored by observation index, they should be identical np.testing.assert_allclose(sorted_preds_l, shuffled_preds_l, rtol=1e-10) np.testing.assert_allclose(sorted_preds_m, shuffled_preds_m, rtol=1e-10) + + +@pytest.mark.ci +def test_doubleml_exceptions_double_sample_splitting(): + smpls = (np.arange(n_obs), np.arange(n_obs)) + msg = "set_sample_splitting not supported for double sample splitting." + with pytest.raises(ValueError, match=msg): + dml_plr.set_sample_splitting(smpls) From 74b1caaf74f0b5f38979151970cbfe608b83ea5b Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 12 Nov 2025 13:46:58 -0800 Subject: [PATCH 40/48] Sample splitting exceptions --- doubleml/tests/test_set_sample_splitting.py | 8 ------- .../test_set_sample_splitting_exceptions.py | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) create mode 100644 doubleml/tests/test_set_sample_splitting_exceptions.py diff --git a/doubleml/tests/test_set_sample_splitting.py b/doubleml/tests/test_set_sample_splitting.py index fd44289b2..fa0a43945 100644 --- a/doubleml/tests/test_set_sample_splitting.py +++ b/doubleml/tests/test_set_sample_splitting.py @@ -276,11 +276,3 @@ def test_doubleml_set_sample_splitting_shuffled_indices(): # Since predictions are stored by observation index, they should be identical np.testing.assert_allclose(sorted_preds_l, shuffled_preds_l, rtol=1e-10) 
np.testing.assert_allclose(sorted_preds_m, shuffled_preds_m, rtol=1e-10) - - -@pytest.mark.ci -def test_doubleml_exceptions_double_sample_splitting(): - smpls = (np.arange(n_obs), np.arange(n_obs)) - msg = "set_sample_splitting not supported for double sample splitting." - with pytest.raises(ValueError, match=msg): - dml_plr.set_sample_splitting(smpls) diff --git a/doubleml/tests/test_set_sample_splitting_exceptions.py b/doubleml/tests/test_set_sample_splitting_exceptions.py new file mode 100644 index 000000000..b58f6bdda --- /dev/null +++ b/doubleml/tests/test_set_sample_splitting_exceptions.py @@ -0,0 +1,23 @@ +import numpy as np +import pytest +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml import DoubleMLLPLR +from doubleml.plm.datasets import make_lplr_LZZ2020 + +np.random.seed(3141) + +dml_data_lplr = make_lplr_LZZ2020(n_obs=10) +n_obs = dml_data_lplr.n_obs +ml_M = LogisticRegression() +ml_t = Lasso() +ml_m = Lasso() +dml_lplr = DoubleMLLPLR(dml_data_lplr, ml_M, ml_t, ml_m, n_folds=7, n_rep=8, draw_sample_splitting=False) + + +@pytest.mark.ci +def test_doubleml_exceptions_double_sample_splitting(): + smpls = (np.arange(n_obs), np.arange(n_obs)) + msg = "set_sample_splitting not supported for double sample splitting." 
+ with pytest.raises(ValueError, match=msg): + dml_lplr.set_sample_splitting(smpls) From 72be054067b98a85048b36266164cd0b7df53099 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 12 Nov 2025 15:25:39 -0800 Subject: [PATCH 41/48] Test coverage increase --- doubleml/double_ml.py | 2 +- doubleml/plm/lplr.py | 8 -------- doubleml/tests/test_set_sample_splitting_exceptions.py | 5 +++++ doubleml/utils/_estimation.py | 8 ++------ 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 638cb31ec..d084b4561 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -526,7 +526,7 @@ def __smpls(self): @property def __smpls__inner(self): - if not self.smpls_inner[self._i_rep]: + if self._smpls_inner is None: raise ValueError("smpls_inner is only available for double sample splitting.") return self._smpls_inner[self._i_rep] diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index e7ff1c35f..f452e02d4 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -7,7 +7,6 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target -from doubleml import DoubleMLData from doubleml.double_ml import DoubleML from doubleml.double_ml_score_mixins import NonLinearScoreMixin from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score @@ -166,13 +165,8 @@ def _initialize_ml_nuisance_params(self): self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} def _check_data(self, obj_dml_data): - if not isinstance(obj_dml_data, DoubleMLData): - raise TypeError( - f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." 
- ) if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): raise TypeError("The outcome variable y must be binary with values 0 and 1.") - return def _double_dml_cv_predict( self, @@ -202,7 +196,6 @@ def _double_dml_cv_predict( est_params=est_params, method=method, return_models=True, - smpls_is_partition_manual_set=True, sample_weights=sample_weights, ) _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) @@ -447,7 +440,6 @@ def _nuisance_tuning( ): if self._i_rep is None: raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.") - # TODO: test x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) x_d_concat = np.hstack((d.reshape(-1, 1), x)) diff --git a/doubleml/tests/test_set_sample_splitting_exceptions.py b/doubleml/tests/test_set_sample_splitting_exceptions.py index b58f6bdda..eba513ca3 100644 --- a/doubleml/tests/test_set_sample_splitting_exceptions.py +++ b/doubleml/tests/test_set_sample_splitting_exceptions.py @@ -21,3 +21,8 @@ def test_doubleml_exceptions_double_sample_splitting(): msg = "set_sample_splitting not supported for double sample splitting." with pytest.raises(ValueError, match=msg): dml_lplr.set_sample_splitting(smpls) + + dml_lplr._is_cluster_data = True + msg = "Cluster data not supported for double sample splitting." 
+ with pytest.raises(ValueError, match=msg): + dml_lplr.draw_sample_splitting() diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index d4e8abc4d..b79c7618a 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -56,15 +56,11 @@ def _dml_cv_predict( method="predict", return_train_preds=False, return_models=False, - smpls_is_partition_manual_set=None, sample_weights=None, ): n_obs = x.shape[0] - if smpls_is_partition_manual_set is None: - smpls_is_partition = _check_is_partition(smpls, n_obs) - else: - smpls_is_partition = smpls_is_partition_manual_set + smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) manual_cv_predict = ( @@ -108,7 +104,7 @@ def _dml_cv_predict( else: if not smpls_is_partition: assert not fold_specific_target, "combination of fold-specific y and no cross-fitting not implemented yet" - assert len(smpls) == 1 + # assert len(smpls) == 1 if method == "predict_proba": assert not fold_specific_target # fold_specific_target only needed for PLIV.partialXZ From 5d9e0eb7b3f43d2f1fbfa0f1e966d3b5fe471b0f Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 12 Nov 2025 16:20:32 -0800 Subject: [PATCH 42/48] Exception fixed --- doubleml/double_ml.py | 8 +++++++- doubleml/tests/test_exceptions.py | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index d084b4561..4e11b13c8 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -526,8 +526,14 @@ def __smpls(self): @property def __smpls__inner(self): - if self._smpls_inner is None: + if not self._double_sample_splitting: raise ValueError("smpls_inner is only available for double sample splitting.") + if self._smpls_inner is None: + err_msg = ( + "Sample splitting not specified. 
Either draw samples via .draw_sample splitting() " + + "or set external samples via .set_sample_splitting()." + ) + raise ValueError(err_msg) return self._smpls_inner[self._i_rep] @property diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index f4a267170..4fca5318b 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -706,13 +706,15 @@ def test_doubleml_exception_smpls_inner(): dml_data_lplr = make_lplr_LZZ2020() ml_M = LogisticRegression() - dml_plr_inner_no_smpls = DoubleMLLPLR(dml_data_lplr, ml_M, ml_m, ml_m, draw_sample_splitting=False) + dml_lplr_inner_no_smpls = DoubleMLLPLR(dml_data_lplr, ml_M, ml_m, ml_m, draw_sample_splitting=False) msg = ( "Sample splitting not specified. " r"Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\)." ) with pytest.raises(ValueError, match=msg): - _ = dml_plr_inner_no_smpls.smpls_inner + _ = dml_lplr_inner_no_smpls.smpls_inner + with pytest.raises(ValueError, match=msg): + _ = dml_lplr_inner_no_smpls._DoubleML__smpls__inner @pytest.mark.ci From 99e78bf77015bff71a9bf90f62b2ecb66c0cf01c Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 13 Nov 2025 19:56:57 -0800 Subject: [PATCH 43/48] PR Review --- doubleml/double_ml.py | 17 +-- doubleml/plm/lplr.py | 114 +++++---------------- doubleml/plm/tests/test_lplr.py | 9 +- doubleml/plm/tests/test_lplr_exceptions.py | 6 +- doubleml/plm/tests/test_lplr_tune.py | 27 +---- doubleml/plm/tests/test_model_defaults.py | 51 +++++++++ doubleml/plm/tests/test_return_types.py | 73 +++++++++++++ doubleml/utils/_check_defaults.py | 3 +- doubleml/utils/_check_return_types.py | 4 +- doubleml/utils/_estimation.py | 46 ++++++++- 10 files changed, 212 insertions(+), 138 deletions(-) create mode 100644 doubleml/plm/tests/test_model_defaults.py create mode 100644 doubleml/plm/tests/test_return_types.py diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 
4e11b13c8..df35bcb5e 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -258,13 +258,6 @@ def learner(self): """ return self._learner - @property - def predictions_names(self): - """ - The names of predictions for the nuisance functions. - """ - return list(self.params_names) - @property def learner_names(self): """ @@ -1088,7 +1081,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m _check_external_predictions( external_predictions=external_predictions, valid_treatments=self._dml_data.d_cols, - valid_learners=self.predictions_names, + valid_learners=self.params_names, n_obs=self.n_obs, n_rep=self.n_rep, ) @@ -1111,7 +1104,7 @@ def _initalize_fit(self, store_predictions, store_models): def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models): ext_prediction_dict = _set_external_predictions( external_predictions, - learners=self.predictions_names, + learners=self.params_names, treatment=self._dml_data.d_cols[self._i_treat], i_rep=self._i_rep, ) @@ -1178,8 +1171,8 @@ def _initialize_arrays(self): self._all_se = np.full((n_thetas, n_rep), np.nan) def _initialize_predictions_and_targets(self): - self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names} - self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names} + self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} + self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} def _initialize_nuisance_loss(self): self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names} @@ -1190,7 +1183,7 @@ def _initialize_models(self): } def _store_predictions_and_targets(self, preds, targets): - for learner in self.predictions_names: + for learner in self.params_names: self._predictions[learner][:, 
self._i_rep, self._i_treat] = preds[learner] self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner] diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index f452e02d4..c3f6d5b56 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -13,6 +13,7 @@ from doubleml.utils._estimation import ( _dml_cv_predict, _dml_tune, + _double_dml_cv_predict, ) @@ -104,10 +105,6 @@ def __init__( ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} - # replace aggregated inner names with per-inner-fold names - inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds_inner)] - inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds_inner)] - self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True) @@ -162,56 +159,15 @@ def __init__( self._sensitivity_implemented = False def _initialize_ml_nuisance_params(self): - self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} + inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds)] + inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds)] + params_names = ["ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in params_names} def _check_data(self, obj_dml_data): if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): raise TypeError("The outcome variable y must be binary with values 0 and 1.") - def _double_dml_cv_predict( - self, - estimator, - estimator_name, - x, - y, - smpls=None, - smpls_inner=None, - n_jobs=None, - est_params=None, - method="predict", - sample_weights=None, - ): - res = {} - res["preds"] = np.zeros(y.shape, dtype=float) - res["preds_inner"] 
= [] - res["targets_inner"] = [] - res["models"] = [] - for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - res_inner = _dml_cv_predict( - estimator, - x, - y, - smpls=smpls_double_split, - n_jobs=n_jobs, - est_params=est_params, - method=method, - return_models=True, - sample_weights=sample_weights, - ) - _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) - - res["preds_inner"].append(res_inner["preds"]) - res["targets_inner"].append(res_inner["targets"]) - for model in res_inner["models"]: - res["models"].append(model) - if method == "predict_proba": - res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] - else: - res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) - res["preds"] /= len(smpls) - res["targets"] = np.copy(y) - return res - def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) @@ -234,9 +190,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa f"have to be provided (missing: {', '.join([str(i) for i in missing])})." 
) M_hat_inner = [external_predictions[f"ml_M_inner_{i}"] for i in range(self.n_folds_inner)] - M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None} + M_hat = { + "preds": external_predictions["ml_M"], + "preds_inner": M_hat_inner, + "targets": self._dml_data.y, + "models": None, + } else: - M_hat = self._double_dml_cv_predict( + M_hat = _double_dml_cv_predict( self._learner["ml_M"], "ml_M", x_d_concat, @@ -250,7 +211,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # nuisance m if m_external: - m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + m_hat = {"preds": external_predictions["ml_m"], "targets": self._dml_data.d, "models": None} else: if self.score == "instrument": weights = M_hat["preds"] * (1 - M_hat["preds"]) @@ -303,9 +264,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa f"have to be provided (missing: {', '.join([str(i) for i in missing])})." ) a_hat_inner = [external_predictions[f"ml_a_inner_{i}"] for i in range(self.n_folds_inner)] - a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None} + a_hat = { + "preds": external_predictions["ml_a"], + "preds_inner": a_hat_inner, + "targets": self._dml_data.d, + "models": None, + } else: - a_hat = self._double_dml_cv_predict( + a_hat = _double_dml_cv_predict( self._learner["ml_a"], "ml_a", x, @@ -404,13 +370,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds - @property - def predictions_names(self): - """ - The names of predictions for the nuisance functions. 
- """ - return self._predictions_names - def _score_elements(self, y, d, r_hat, m_hat): # compute residual d_tilde = d - m_hat @@ -438,8 +397,6 @@ def _sensitivity_element_est(self, preds): def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search ): - if self._i_rep is None: - raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.") x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) x_d_concat = np.hstack((d.reshape(-1, 1), x)) @@ -500,34 +457,16 @@ def _nuisance_tuning( a_best_params = [xx.best_params_ for xx in a_tune_res] # Create targets for tuning ml_t - M_hat = self._double_dml_cv_predict( - self._learner["ml_M"], - "ml_M", - x_d_concat, - y, - smpls=smpls, - smpls_inner=self._DoubleML__smpls__inner, - n_jobs=n_jobs_cv, - est_params=M_best_params, - method=self._predict_method["ml_M"], - ) - W_inner = [] - for i, (train, _) in enumerate(smpls): - M_iteration = M_hat["preds_inner"][i][train] - M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) - w = scipy.special.logit(M_iteration) - W_inner.append(w) + M_hat = np.full_like(y, np.nan) + for idx, (train_index, _) in enumerate(smpls): + M_hat[train_index] = M_tune_res[idx].predict_proba(x_d_concat[train_index, :])[:, 1] - # Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN - W_targets = [] - for i, train in enumerate(train_inds): - wt = np.full(x.shape[0], np.nan, dtype=float) - wt[train] = W_inner[i] - W_targets.append(wt) + M_hat = np.clip(M_hat, 1e-8, 1 - 1e-8) + W_hat = scipy.special.logit(M_hat) t_tune_res = _dml_tune( - W_inner, + W_hat, x, train_inds, self._learner["ml_t"], @@ -537,7 +476,6 @@ def _nuisance_tuning( n_jobs_cv, search_mode, n_iter_randomized_search, - fold_specific_target=True, ) t_best_params = [xx.best_params_ for xx in t_tune_res] diff --git 
a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index abd7adf55..6ddbba6bc 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -7,22 +7,22 @@ from doubleml.plm.datasets import make_lplr_LZZ2020 -@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)]) def learner_M(request): return request.param -@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)]) def learner_t(request): return request.param -@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)]) def learner_m(request): return request.param -@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)]) def learner_m_classifier(request): return request.param @@ -33,7 +33,6 @@ def score(request): @pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"]) -# TODO: Error for continuous treatment? 
def treatment(request): return request.param diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index f01cd8855..32a8103e9 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -14,9 +14,9 @@ # create test data and basic learners dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20) dml_data_binary = make_lplr_LZZ2020(alpha=0.5, n_obs=n, treatment="binary", dim_x=20) -ml_M = RandomForestClassifier() -ml_t = RandomForestRegressor() -ml_m = RandomForestRegressor() +ml_M = RandomForestClassifier(max_depth=2, n_estimators=10) +ml_t = RandomForestRegressor(max_depth=2, n_estimators=10) +ml_m = RandomForestRegressor(max_depth=2, n_estimators=10) dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument") diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 7c7c4aebb..78ad050ee 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -44,7 +44,7 @@ def dml_lplr_fixture( learner_m, learner_a, score, - tune_on_folds=True, + tune_on_folds=False, ): par_grid = { "ml_M": get_par_grid(), @@ -94,28 +94,3 @@ def test_dml_selection_coef(dml_lplr_fixture): se = dml_lplr_fixture["se"] true_coef = dml_lplr_fixture["true_coef"] assert abs(coef - true_coef) <= 3.0 * np.sqrt(se) - - -@pytest.mark.ci -def test_lplr_exception_tuning( - learner_M, - learner_t, - learner_m, - learner_a, -): - # LPLR valid scores are 'nuisance_space' and 'instrument' - obj_dml_data = make_lplr_LZZ2020(alpha=0.5) - ml_M = clone(learner_M) - ml_t = clone(learner_t) - ml_m = clone(learner_m) - - dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) - par_grid = { - "ml_M": get_par_grid(), - "ml_t": get_par_grid(), - "ml_m": get_par_grid(), - "ml_a": get_par_grid(), - } - msg = "tune_on_folds must be True as targets have to be created for 
ml_t on folds." - with pytest.raises(ValueError, match=msg): - dml_lplr_obj.tune(par_grid, tune_on_folds=False) diff --git a/doubleml/plm/tests/test_model_defaults.py b/doubleml/plm/tests/test_model_defaults.py new file mode 100644 index 000000000..3e9bc430d --- /dev/null +++ b/doubleml/plm/tests/test_model_defaults.py @@ -0,0 +1,51 @@ +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml import DoubleMLLPLR +from doubleml.plm.datasets import make_lplr_LZZ2020 +from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap + +dml_data_lplr = make_lplr_LZZ2020(n_obs=100) + +dml_lplr_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression()) + + +@pytest.mark.ci +def test_lplr_defaults(): + _check_basic_defaults_before_fit(dml_lplr_obj) + + _fit_bootstrap(dml_lplr_obj) + + _check_basic_defaults_after_fit(dml_lplr_obj) + + +@pytest.mark.ci +def test_did_multi_str(): + # Test the string representation before fitting + dml_str = str(dml_lplr_obj) + + # Check that all important sections are present + assert "================== DoubleMLLPLR Object ==================" in dml_str + assert "------------------ Data Summary ------------------" in dml_str + assert "------------------ Score & Algorithm ------------------" in dml_str + assert "------------------ Machine Learner ------------------" in dml_str + assert "------------------ Resampling ------------------" in dml_str + assert "------------------ Fit Summary ------------------" in dml_str + + # Check specific content before fitting + assert "No. folds: 5" in dml_str + assert "No. 
repeated sample splits: 1" in dml_str + assert "Learner ml_M:" in dml_str + assert "Learner ml_m:" in dml_str + assert "Learner ml_t:" in dml_str + + # Fit the model + dml_lplr_obj_fit = dml_lplr_obj.fit() + dml_str_after_fit = str(dml_lplr_obj_fit) + + # Check that additional information is present after fitting + assert "coef" in dml_str_after_fit + assert "std err" in dml_str_after_fit + assert "t" in dml_str_after_fit + assert "P>|t|" in dml_str_after_fit + assert "Out-of-sample Performance:" in dml_str_after_fit diff --git a/doubleml/plm/tests/test_return_types.py b/doubleml/plm/tests/test_return_types.py new file mode 100644 index 000000000..cb32f5433 --- /dev/null +++ b/doubleml/plm/tests/test_return_types.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml import DoubleMLLPLR +from doubleml.plm.datasets import make_lplr_LZZ2020 +from doubleml.utils._check_return_types import ( + check_basic_predictions_and_targets, + check_basic_property_types_and_shapes, + check_basic_return_types, + check_sensitivity_return_types, +) + +# Test constants +N_OBS = 200 +N_TREAT = 1 +N_REP = 1 +N_FOLDS = 3 +N_REP_BOOT = 314 + +dml_args = { + "n_rep": N_REP, + "n_folds": N_FOLDS, +} + + +# create all datasets +np.random.seed(3141) +datasets = {} + + +datasets["lplr"] = make_lplr_LZZ2020(n_obs=N_OBS) +datasets["lplr_binary"] = make_lplr_LZZ2020(n_obs=N_OBS, treatment="binary") + +dml_lplr_obj = DoubleMLLPLR(datasets["lplr"], LogisticRegression(), LinearRegression(), LinearRegression(), **dml_args) +dml_lplr_obj_binary = DoubleMLLPLR( + datasets["lplr_binary"], LogisticRegression(), LinearRegression(), LogisticRegression(), **dml_args +) + +dml_objs = [ + (dml_lplr_obj, DoubleMLLPLR), + (dml_lplr_obj_binary, DoubleMLLPLR), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs) +def test_return_types(dml_obj, cls): + check_basic_return_types(dml_obj, cls) + + # further 
return type tests + assert isinstance(dml_obj.get_params("ml_m"), dict) + + +@pytest.fixture(params=dml_objs) +def fitted_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_property_types_and_shapes(fitted_dml_obj): + check_basic_property_types_and_shapes(fitted_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) + check_basic_predictions_and_targets(fitted_dml_obj, N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_sensitivity_return_types(fitted_dml_obj): + if fitted_dml_obj._sensitivity_implemented: + benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] + check_sensitivity_return_types(fitted_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) diff --git a/doubleml/utils/_check_defaults.py b/doubleml/utils/_check_defaults.py index 5f376000a..d374ff313 100644 --- a/doubleml/utils/_check_defaults.py +++ b/doubleml/utils/_check_defaults.py @@ -47,7 +47,8 @@ def _check_basic_defaults_after_fit(dml_obj): # sensitivity assert dml_obj.sensitivity_params is None - assert isinstance(dml_obj.sensitivity_elements, dict) + if dml_obj._sensitivity_implemented: + assert isinstance(dml_obj.sensitivity_elements, dict) # fit method if isinstance(dml_obj, DoubleML): diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py index b73e2e04e..633eb1c67 100644 --- a/doubleml/utils/_check_return_types.py +++ b/doubleml/utils/_check_return_types.py @@ -11,9 +11,9 @@ def check_basic_return_types(dml_obj, cls): assert isinstance(dml_obj.__str__(), str) assert isinstance(dml_obj.summary, pd.DataFrame) assert isinstance(dml_obj.draw_sample_splitting(), cls) - if not dml_obj._is_cluster_data: + if not dml_obj._is_cluster_data and not hasattr(dml_obj, "n_folds_inner"): assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls) - else: + elif dml_obj._is_cluster_data: assert dml_obj._dml_data.is_cluster_data assert 
isinstance(dml_obj.fit(), cls) assert isinstance(dml_obj.__str__(), str) # called again after fit, now with numbers diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index b79c7618a..d548ca14a 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -9,7 +9,7 @@ from sklearn.preprocessing import LabelEncoder from statsmodels.nonparametric.kde import KDEUnivariate -from ._checks import _check_is_partition +from ._checks import _check_finite_predictions, _check_is_partition def _assure_2d_array(x): @@ -187,6 +187,50 @@ def _dml_cv_predict( return res +def _double_dml_cv_predict( + estimator, + estimator_name, + x, + y, + smpls=None, + smpls_inner=None, + n_jobs=None, + est_params=None, + method="predict", + sample_weights=None, +): + res = {} + res["preds"] = np.zeros(y.shape, dtype=float) + res["preds_inner"] = [] + res["targets_inner"] = [] + res["models"] = [] + for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + res_inner = _dml_cv_predict( + estimator, + x, + y, + smpls=smpls_double_split, + n_jobs=n_jobs, + est_params=est_params, + method=method, + return_models=True, + sample_weights=sample_weights, + ) + _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) + + res["preds_inner"].append(res_inner["preds"]) + res["targets_inner"].append(res_inner["targets"]) + for model in res_inner["models"]: + res["models"].append(model) + if method == "predict_proba": + res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] + else: + res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + res["preds"] /= len(smpls) + res["targets"] = np.copy(y) + return res + + def _dml_tune( y, x, From 8f7125f7794fb7e27c98454588c04c9b66465313 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 13 Nov 2025 21:10:57 -0800 Subject: [PATCH 44/48] Exceptions fixed --- doubleml/plm/tests/test_lplr_exceptions.py | 11 
++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 32a8103e9..404770fa0 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -70,7 +70,7 @@ def test_lplr_exception_resampling(): @pytest.mark.ci def test_lplr_exception_get_params(): - msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." + msg = r"Invalid nuisance learner ml_x. Valid nuisance learner ml_m or ml_a or ml_t or ml_M.*" with pytest.raises(ValueError, match=msg): dml_lplr.get_params("ml_x") @@ -147,7 +147,7 @@ def test_lplr_exception_confint(): @pytest.mark.ci def test_lplr_exception_set_ml_nuisance_params(): # invalid learner name - msg = "Invalid nuisance learner g. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." + msg = "Invalid nuisance learner g. Valid nuisance learner ml_m or ml_a or ml_t or ml_M.*" with pytest.raises(ValueError, match=msg): dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1}) # invalid treatment variable @@ -246,13 +246,13 @@ def test_lplr_exception_and_warning_learner(): with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, Lasso(), ml_t, ml_m) msg = ( - r"The ml_m learner RandomForestRegressor\(\) was identified as regressor but at least one treatment " + r"The ml_m learner RandomForestRegressor\(.*\) was identified as regressor but at least one treatment " r"variable is binary with values 0 and 1." ) with pytest.warns(match=msg): _ = DoubleMLLPLR(dml_data_binary, ml_M, ml_t, ml_m) msg = ( - r"The ml_a learner RandomForestRegressor\(\) was identified as regressor but at least one treatment " + r"The ml_a learner RandomForestRegressor\(.*\) was identified as regressor but at least one treatment " r"variable is binary with values 0 and 1." 
) with pytest.warns(match=msg): @@ -314,7 +314,8 @@ def test_double_ml_exception_evaluate_learner(): dml_lplr_obj.evaluate_learners(metric="mse") msg = ( - r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " r"Learners \['ml_mu', 'ml_p'\] provided." + r"The learners have to be a subset of \['ml_m', 'ml_a', 'ml_t', 'ml_M'.*\]\. " + r"Learners \['ml_mu', 'ml_p'\] provided." ) with pytest.raises(ValueError, match=msg): dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"]) From 03fd19179bcc909d3199ddc0c3e3fe376c86b0b1 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 13 Nov 2025 22:23:07 -0800 Subject: [PATCH 45/48] Test fixed --- doubleml/utils/_check_defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/utils/_check_defaults.py b/doubleml/utils/_check_defaults.py index d374ff313..bb3f8ba49 100644 --- a/doubleml/utils/_check_defaults.py +++ b/doubleml/utils/_check_defaults.py @@ -47,7 +47,7 @@ def _check_basic_defaults_after_fit(dml_obj): # sensitivity assert dml_obj.sensitivity_params is None - if dml_obj._sensitivity_implemented: + if dml_obj.sensitivity_params is not None: assert isinstance(dml_obj.sensitivity_elements, dict) # fit method From 33a86d0f70d7c5775ba51d897a32696bd37247eb Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 14 Nov 2025 08:53:39 +0100 Subject: [PATCH 46/48] Skip doctests for summary outputs --- doubleml/did/did.py | 2 +- doubleml/did/did_cs.py | 2 +- doubleml/did/did_multi.py | 2 +- doubleml/double_ml.py | 2 +- doubleml/irm/cvar.py | 2 +- doubleml/irm/iivm.py | 2 +- doubleml/irm/irm.py | 2 +- doubleml/irm/lpq.py | 2 +- doubleml/irm/pq.py | 2 +- doubleml/irm/qte.py | 2 +- doubleml/irm/ssm.py | 2 +- doubleml/plm/lplr.py | 2 +- doubleml/plm/pliv.py | 2 +- doubleml/plm/plr.py | 2 +- doubleml/rdd/rdd.py | 1 + doubleml/utils/_check_return_types.py | 2 +- 16 files changed, 16 insertions(+), 15 deletions(-) diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 
87eb4aaa8..5e86d52eb 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -70,7 +70,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): >>> data = make_did_SZ2020(n_obs=500, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd') >>> dml_did_obj = dml.DoubleMLDID(obj_dml_data, ml_g, ml_m) - >>> dml_did_obj.fit().summary + >>> dml_did_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d -2.840718 1.760386 -1.613691 0.106595 -6.291011 0.609575 diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index da833fd56..f2f2b0543 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -69,7 +69,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): >>> data = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd', t_col='t') >>> dml_did_obj = dml.DoubleMLDIDCS(obj_dml_data, ml_g, ml_m) - >>> dml_did_obj.fit().summary + >>> dml_did_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d -4.9944 7.561785 -0.660479 0.508947 -19.815226 9.826426 """ diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py index a9e9e7908..2b7aa9064 100644 --- a/doubleml/did/did_multi.py +++ b/doubleml/did/did_multi.py @@ -140,7 +140,7 @@ class DoubleMLDIDMulti: ... gt_combinations="standard", ... control_group="never_treated", ... ) - >>> print(dml_did_obj.fit().summary) + >>> print(dml_did_obj.fit().summary) # doctest: +SKIP coef std err ... 2.5 % 97.5 % ATT(2025-03,2025-01,2025-02) -0.797617 0.459617 ... -1.698450 0.103215 ATT(2025-03,2025-02,2025-03) 0.270311 0.456453 ... -0.624320 1.164941 diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index df35bcb5e..3b94a2cae 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1255,7 +1255,7 @@ def evaluate_learners(self, learners=None, metric=_rmse): >>> def mae(y_true, y_pred): ... subset = np.logical_not(np.isnan(y_true)) ... 
return mean_absolute_error(y_true[subset], y_pred[subset]) - >>> dml_irm_obj.evaluate_learners(metric=mae) + >>> dml_irm_obj.evaluate_learners(metric=mae) # doctest: +SKIP {'ml_g0': array([[0.88173585]]), 'ml_g1': array([[0.83854057]]), 'ml_m': array([[0.35871235]])} """ # if no learners are provided try to evaluate all learners diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index 64e82ad85..6c6982933 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -97,7 +97,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML): >>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd') >>> dml_cvar_obj = dml.DoubleMLCVAR(obj_dml_data, ml_g, ml_m, treatment=1, quantile=0.5) - >>> dml_cvar_obj.fit().summary + >>> dml_cvar_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 1.588364 0.096616 16.43989 9.909942e-61 1.398999 1.777728 diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index 7f330cfb2..50513c0f4 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -95,7 +95,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML): >>> data = make_iivm_data(theta=0.5, n_obs=1000, dim_x=20, alpha_x=1.0, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd', z_cols='z') >>> dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data, ml_g, ml_m, ml_r) - >>> dml_iivm_obj.fit().summary + >>> dml_iivm_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.362398 0.191578 1.891649 0.058538 -0.013088 0.737884 diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 01d288bdf..f5abdbd95 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -96,7 +96,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML): >>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd') >>> dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m) - >>> dml_irm_obj.fit().summary + >>> 
dml_irm_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.371972 0.206802 1.798685 0.072069 -0.033353 0.777297 diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index bd62794cd..5dd8ff379 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -99,7 +99,7 @@ class DoubleMLLPQ(NonLinearScoreMixin, DoubleML): >>> data = make_iivm_data(theta=0.5, n_obs=1000, dim_x=20, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd', z_cols='z') >>> dml_lpq_obj = dml.DoubleMLLPQ(obj_dml_data, ml_g, ml_m, treatment=1, quantile=0.5) - >>> dml_lpq_obj.fit().summary + >>> dml_lpq_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.217244 0.636453 0.341336 0.73285 -1.03018 1.464668 """ diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py index f3b72e2c8..901c07b7d 100644 --- a/doubleml/irm/pq.py +++ b/doubleml/irm/pq.py @@ -105,7 +105,7 @@ class DoubleMLPQ(NonLinearScoreMixin, DoubleML): >>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd') >>> dml_pq_obj = dml.DoubleMLPQ(obj_dml_data, ml_g, ml_m, treatment=1, quantile=0.5) - >>> dml_pq_obj.fit().summary + >>> dml_pq_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.553878 0.149858 3.696011 0.000219 0.260161 0.847595 """ diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 46c8f3165..c3325e08d 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -88,7 +88,7 @@ class DoubleMLQTE(SampleSplittingMixin): >>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd') >>> dml_qte_obj = dml.DoubleMLQTE(obj_dml_data, ml_g, ml_m, quantiles=[0.25, 0.5, 0.75]) - >>> dml_qte_obj.fit().summary + >>> dml_qte_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % 0.25 0.274825 0.347310 0.791297 0.428771 -0.405890 0.955541 0.50 0.449150 0.192539 2.332782 0.019660 0.071782 
0.826519 diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py index fdc2ab6ef..bc6cd739d 100644 --- a/doubleml/irm/ssm.py +++ b/doubleml/irm/ssm.py @@ -94,7 +94,7 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML): >>> ml_pi_sim = clone(learner_class) >>> ml_m_sim = clone(learner_class) >>> obj_dml_sim = DoubleMLSSM(simul_data, ml_g_sim, ml_pi_sim, ml_m_sim) - >>> obj_dml_sim.fit().summary + >>> obj_dml_sim.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.518517 0.065535 7.912033 2.532202e-15 0.39007 0.646963 diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index c3f6d5b56..0e5cb9965 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -61,7 +61,7 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20) >>> dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) - >>> dml_lplr_obj.fit().summary # doctest: +SKIP + >>> dml_lplr_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.661166 0.172672 3.829038 0.000129 0.322736 0.999596 diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py index d2b348c58..e8fd0ed67 100644 --- a/doubleml/plm/pliv.py +++ b/doubleml/plm/pliv.py @@ -73,7 +73,7 @@ class DoubleMLPLIV(LinearScoreMixin, DoubleML): >>> data = make_pliv_CHS2015(alpha=0.5, n_obs=500, dim_x=20, dim_z=1, return_type='DataFrame') >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd', z_cols='Z1') >>> dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data, ml_l, ml_m, ml_r) - >>> dml_pliv_obj.fit().summary + >>> dml_pliv_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.511722 0.087184 5.869427 4.373034e-09 0.340844 0.6826 diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index 19ee9199b..9b28231f2 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -68,7 +68,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML): 
>>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) >>> obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20) >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> dml_plr_obj.fit().summary + >>> dml_plr_obj.fit().summary # doctest: +SKIP coef std err t P>|t| 2.5 % 97.5 % d 0.480691 0.040533 11.859129 1.929729e-32 0.401247 0.560135 diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index fffa9a0aa..d854a8bec 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -91,6 +91,7 @@ class RDFlex: >>> ml_g = RandomForestRegressor() >>> ml_m = RandomForestClassifier() >>> rdflex_obj = dml.rdd.RDFlex(obj_dml_data, ml_g, ml_m, fuzzy=True) + >>> print(rdflex_obj.fit()) # doctest: +SKIP """ def __init__( diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py index 633eb1c67..5e6b207c6 100644 --- a/doubleml/utils/_check_return_types.py +++ b/doubleml/utils/_check_return_types.py @@ -11,7 +11,7 @@ def check_basic_return_types(dml_obj, cls): assert isinstance(dml_obj.__str__(), str) assert isinstance(dml_obj.summary, pd.DataFrame) assert isinstance(dml_obj.draw_sample_splitting(), cls) - if not dml_obj._is_cluster_data and not hasattr(dml_obj, "n_folds_inner"): + if not dml_obj._is_cluster_data and not hasattr(dml_obj, "n_folds_inner"): # set_sample_splitting is not available assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls) elif dml_obj._is_cluster_data: assert dml_obj._dml_data.is_cluster_data From 96f33ae7dd91bcf3265adb3544238ece44370f48 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 14 Nov 2025 09:09:13 +0100 Subject: [PATCH 47/48] Enhance learner evaluation checks and handle NaN targets in DoubleML class --- doubleml/double_ml.py | 19 +++++++++++++------ doubleml/utils/_check_return_types.py | 6 ++++++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 
3b94a2cae..6293731a3 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1274,12 +1274,19 @@ def evaluate_learners(self, learners=None, metric=_rmse): for learner in learners: for rep in range(self.n_rep): for coef_idx in range(self._dml_data.n_coefs): - res = metric( - y_pred=self.predictions[learner][:, rep, coef_idx].reshape(1, -1), - y_true=self.nuisance_targets[learner][:, rep, coef_idx].reshape(1, -1), - ) - if not np.isfinite(res): - raise ValueError(f"Evaluation from learner {str(learner)} is not finite.") + targets = self.nuisance_targets[learner][:, rep, coef_idx].reshape(1, -1) + + if np.all(np.isnan(targets)): + res = np.nan + else: + predictions = self.predictions[learner][:, rep, coef_idx].reshape(1, -1) + res = metric( + y_pred=predictions, + y_true=targets, + ) + if not np.isfinite(res): + raise ValueError(f"Evaluation from learner {str(learner)} is not finite.") + dist[learner][rep, coef_idx] = res return dist else: diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py index 5e6b207c6..fc7aca0ec 100644 --- a/doubleml/utils/_check_return_types.py +++ b/doubleml/utils/_check_return_types.py @@ -113,6 +113,12 @@ def check_basic_predictions_and_targets(dml_obj, n_obs, n_treat, n_rep): assert isinstance(dml_obj.nuisance_loss[key], np.ndarray) assert dml_obj.nuisance_loss[key].shape == (n_rep, n_treat) + learner_eval = dml_obj.evaluate_learners() + assert isinstance(learner_eval, dict) + for key in expected_keys: + assert key in learner_eval + assert isinstance(learner_eval[key], np.ndarray) + assert learner_eval[key].shape == (n_rep, n_treat) return From 3d362aad41d1cffc1e33b11e07e2ba770e3382bb Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 14 Nov 2025 09:09:25 +0100 Subject: [PATCH 48/48] removed unnecessary test --- doubleml/plm/tests/test_model_defaults.py | 32 ----------------------- 1 file changed, 32 deletions(-) diff --git a/doubleml/plm/tests/test_model_defaults.py 
b/doubleml/plm/tests/test_model_defaults.py index 3e9bc430d..b555f5ad5 100644 --- a/doubleml/plm/tests/test_model_defaults.py +++ b/doubleml/plm/tests/test_model_defaults.py @@ -17,35 +17,3 @@ def test_lplr_defaults(): _fit_bootstrap(dml_lplr_obj) _check_basic_defaults_after_fit(dml_lplr_obj) - - -@pytest.mark.ci -def test_did_multi_str(): - # Test the string representation before fitting - dml_str = str(dml_lplr_obj) - - # Check that all important sections are present - assert "================== DoubleMLLPLR Object ==================" in dml_str - assert "------------------ Data Summary ------------------" in dml_str - assert "------------------ Score & Algorithm ------------------" in dml_str - assert "------------------ Machine Learner ------------------" in dml_str - assert "------------------ Resampling ------------------" in dml_str - assert "------------------ Fit Summary ------------------" in dml_str - - # Check specific content before fitting - assert "No. folds: 5" in dml_str - assert "No. repeated sample splits: 1" in dml_str - assert "Learner ml_M:" in dml_str - assert "Learner ml_m:" in dml_str - assert "Learner ml_t:" in dml_str - - # Fit the model - dml_lplr_obj_fit = dml_lplr_obj.fit() - dml_str_after_fit = str(dml_lplr_obj_fit) - - # Check that additional information is present after fitting - assert "coef" in dml_str_after_fit - assert "std err" in dml_str_after_fit - assert "t" in dml_str_after_fit - assert "P>|t|" in dml_str_after_fit - assert "Out-of-sample Performance:" in dml_str_after_fit