
Commit f64630c

Merge pull request #200 from DoubleML/s-did

Add Difference-in-Differences models to DoubleML

2 parents 59b5690 + 7f61c22

26 files changed: +2920 -133 lines

doubleml/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -6,6 +6,8 @@
 from .double_ml_iivm import DoubleMLIIVM
 from .double_ml_data import DoubleMLData, DoubleMLClusterData
 from .double_ml_blp import DoubleMLBLP
+from .double_ml_did import DoubleMLDID
+from .double_ml_did_cs import DoubleMLDIDCS
 from .double_ml_qte import DoubleMLQTE
 from .double_ml_pq import DoubleMLPQ
 from .double_ml_lpq import DoubleMLLPQ
@@ -18,6 +20,8 @@
     'DoubleMLData',
     'DoubleMLClusterData',
     'DoubleMLBLP',
+    'DoubleMLDID',
+    'DoubleMLDIDCS',
     'DoubleMLPQ',
     'DoubleMLQTE',
     'DoubleMLLPQ',
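With these exports in place, the new estimators are importable from the package root (a quick check, assuming a standard install of this branch):

from doubleml import DoubleMLDID, DoubleMLDIDCS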

doubleml/_utils.py

Lines changed: 20 additions & 3 deletions
@@ -28,6 +28,18 @@ def _get_cond_smpls(smpls, bin_var):
     return smpls_0, smpls_1


+def _get_cond_smpls_2d(smpls, bin_var1, bin_var2):
+    subset_00 = (bin_var1 == 0) & (bin_var2 == 0)
+    smpls_00 = [(np.intersect1d(np.where(subset_00)[0], train), test) for train, test in smpls]
+    subset_01 = (bin_var1 == 0) & (bin_var2 == 1)
+    smpls_01 = [(np.intersect1d(np.where(subset_01)[0], train), test) for train, test in smpls]
+    subset_10 = (bin_var1 == 1) & (bin_var2 == 0)
+    smpls_10 = [(np.intersect1d(np.where(subset_10)[0], train), test) for train, test in smpls]
+    subset_11 = (bin_var1 == 1) & (bin_var2 == 1)
+    smpls_11 = [(np.intersect1d(np.where(subset_11)[0], train), test) for train, test in smpls]
+    return smpls_00, smpls_01, smpls_10, smpls_11
+
+
 def _check_is_partition(smpls, n_obs):
     test_indices = np.concatenate([test_index for _, test_index in smpls])
     if len(test_indices) != n_obs:
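The new helper restricts each training set of a cross-fitting scheme to one of the four cells spanned by two binary variables (for DiD: treatment and time period), while leaving the test sets untouched. A minimal sketch of the underlying logic on toy data; the KFold setup is illustrative and not part of the commit:

import numpy as np
from sklearn.model_selection import KFold

# toy data: two binary flags per observation, e.g. treatment d and period t
d = np.array([0, 0, 1, 1, 0, 1, 0, 1])
t = np.array([0, 1, 0, 1, 1, 0, 0, 1])
smpls = list(KFold(n_splits=2).split(d))

# the cell d == 1, t == 1, exactly as in _get_cond_smpls_2d above
subset_11 = (d == 1) & (t == 1)
smpls_11 = [(np.intersect1d(np.where(subset_11)[0], train), test)
            for train, test in smpls]
# training indices now contain only observations with d == 1 and t == 1;
# test indices are unchanged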
@@ -328,14 +340,19 @@ def _check_trimming(trimming_rule, trimming_threshold):
     return


-def _check_score(score, valid_score):
+def _check_score(score, valid_score, allow_callable=True):
     if isinstance(score, str):
         if score not in valid_score:
             raise ValueError('Invalid score ' + score + '. ' +
                              'Valid score ' + ' or '.join(valid_score) + '.')
     else:
-        raise TypeError('Invalid score. ' +
-                        'Valid score ' + ' or '.join(valid_score) + '.')
+        if allow_callable:
+            if not callable(score):
+                raise TypeError('score should be either a string or a callable. '
+                                '%r was passed.' % score)
+        else:
+            raise TypeError('score should be a string. '
+                            '%r was passed.' % score)
     return

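_check_score keeps its old behavior for strings; the new allow_callable flag lets estimators that only support built-in scores reject callables with a precise error. A short sketch, importing from the private module path shown above:

from doubleml._utils import _check_score

_check_score('ATE', ['ATE', 'ATTE'])           # valid string: passes
_check_score(lambda *args: 0.0, ['ATE'])       # callable: allowed by default
try:
    _check_score(lambda *args: 0.0, ['CVaR'], allow_callable=False)
except TypeError as err:
    print(err)  # score should be a string. <function <lambda> at 0x...> was passed.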
doubleml/datasets.py

Lines changed: 189 additions & 1 deletion
@@ -378,7 +378,7 @@ def make_iivm_data(n_obs=500, dim_x=20, theta=1., alpha_x=0.2, return_type='Doub
     :math:`\\beta_j=\\frac{1}{j^2}`.

     The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and
-    Klaaßen (2020).
+    Klaassen (2020).

     Parameters
     ----------
@@ -705,3 +705,191 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1., return_
         return DoubleMLClusterData(data, 'Y', 'D', cluster_cols, x_cols, 'Z')
     else:
         raise ValueError('Invalid return_type.')
+
+
+def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type='DoubleMLData', **kwargs):
+    """
+    Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020).
+    The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let
+
+    .. math::
+
+        f_{reg}(W) &= 210 + 27.4 \\cdot W_1 + 13.7 \\cdot (W_2 + W_3 + W_4),
+
+        f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.1 \\cdot W_4).
+
+    Let :math:`X = (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+    :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
+    Further, define :math:`Z_j = (\\tilde{Z}_j - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
+    where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
+    :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.
+    First, define
+
+    .. math::
+
+        Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0,
+
+        Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d),
+
+        p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))},
+
+        D &= 1\\{p(W_{ps}) \\ge U\\},
+
+    where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables,
+    :math:`U \\sim \\mathcal{U}[0, 1]` is an independent standard uniform random variable
+    and :math:`\\nu(W_{reg}, D) \\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}), 1)`.
+    The different data generating processes are defined via
+
+    .. math::
+
+        DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z
+
+        DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X
+
+        DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z
+
+        DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X
+
+        DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0
+
+        DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0,
+
+    such that the last two settings correspond to an experimental setting with treatment probability
+    :math:`P(D=1) = \\frac{1}{2}`.
+    For panel data, the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`.
+    For cross-sectional data, the flag ``cross_sectional_data`` has to be set to ``True``.
+    The outcome is then defined as
+
+    .. math::
+
+        Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0),
+
+    where :math:`T = 1\\{U_T \\le \\lambda_T\\}` with :math:`U_T \\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T = 0.5`.
+    The true average treatment effect on the treated is zero for all data generating processes.
+
+    Parameters
+    ----------
+    n_obs :
+        The number of observations to simulate.
+    dgp_type :
+        The DGP to be used. Default value is ``1`` (integer).
+    cross_sectional_data :
+        Indicates whether the setting uses cross-sectional or panel data. Default value is ``False``.
+    return_type :
+        If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
+
+        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
+
+        If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``
+        or ``(x, y, d, t)``.
+    **kwargs
+        Additional keyword arguments to set non-default values for the parameters
+        :math:`\\xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`.
+
+    References
+    ----------
+    Sant'Anna, P. H. and Zhao, J. (2020),
+    Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122.
+    doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
+    """
+    xi = kwargs.get('xi', 0.75)
+    c = kwargs.get('c', 0.0)
+    lambda_t = kwargs.get('lambda_t', 0.5)
+
+    def f_reg(w):
+        res = 210 + 27.4*w[:, 0] + 13.7*(w[:, 1] + w[:, 2] + w[:, 3])
+        return res
+
+    def f_ps(w, xi):
+        res = xi*(-w[:, 0] + 0.5*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3])
+        return res
+
+    dim_x = 4
+    cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
+    x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])
+
+    z_tilde_1 = np.exp(0.5*x[:, 0])
+    z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
+    z_tilde_3 = (0.6 + x[:, 0]*x[:, 2]/25)**3
+    z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2
+
+    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4))
+    z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)
+
+    # error terms
+    epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs)
+    epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2])
+
+    if dgp_type == 1:
+        features_ps = z
+        features_reg = z
+    elif dgp_type == 2:
+        features_ps = x
+        features_reg = z
+    elif dgp_type == 3:
+        features_ps = z
+        features_reg = x
+    elif dgp_type == 4:
+        features_ps = x
+        features_reg = x
+    elif dgp_type == 5:
+        features_ps = None
+        features_reg = z
+    elif dgp_type == 6:
+        features_ps = None
+        features_reg = x
+    else:
+        raise ValueError('The dgp_type is not valid.')
+
+    # treatment and propensities
+    is_experimental = (dgp_type == 5) or (dgp_type == 6)
+    if is_experimental:
+        # set D to be experimental with P(D=1) = 0.5
+        p = 0.5 * np.ones(n_obs)
+    else:
+        p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
+    u = np.random.uniform(low=0, high=1, size=n_obs)
+    d = 1.0 * (p >= u)
+
+    # potential outcomes
+    nu = np.random.normal(loc=d*f_reg(features_reg), scale=1, size=n_obs)
+    y0 = f_reg(features_reg) + nu + epsilon_0
+    y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0]
+    y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1]
+    y1 = d * y1_d1 + (1-d) * y1_d0
+
+    if not cross_sectional_data:
+        y = y1 - y0
+
+        if return_type in _array_alias:
+            return z, y, d
+        elif return_type in _data_frame_alias + _dml_data_alias:
+            z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)]
+            data = pd.DataFrame(np.column_stack((z, y, d)),
+                                columns=z_cols + ['y', 'd'])
+            if return_type in _data_frame_alias:
+                return data
+            else:
+                return DoubleMLData(data, 'y', 'd', z_cols)
+        else:
+            raise ValueError('Invalid return_type.')
+    else:
+        u_t = np.random.uniform(low=0, high=1, size=n_obs)
+        t = 1.0 * (u_t <= lambda_t)
+        y = t * y1 + (1-t)*y0
+
+        if return_type in _array_alias:
+            return z, y, d, t
+        elif return_type in _data_frame_alias + _dml_data_alias:
+            z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)]
+            data = pd.DataFrame(np.column_stack((z, y, d, t)),
+                                columns=z_cols + ['y', 'd', 't'])
+            if return_type in _data_frame_alias:
+                return data
+            else:
+                return DoubleMLData(data, 'y', 'd', z_cols, t_col='t')
+        else:
+            raise ValueError('Invalid return_type.')
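The generator plugs directly into the new estimators added by this PR. A minimal usage sketch; it assumes DoubleMLDID follows the package's usual (obj_dml_data, ml_g, ml_m) constructor pattern, and the learner choices are illustrative:

import numpy as np
import doubleml as dml
from doubleml.datasets import make_did_SZ2020
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

np.random.seed(42)
# panel data from DGP 1; the true ATT is zero by construction
dml_data = make_did_SZ2020(n_obs=500, dgp_type=1)

ml_g = RandomForestRegressor(n_estimators=100)   # outcome regression
ml_m = RandomForestClassifier(n_estimators=100)  # propensity score

dml_did = dml.DoubleMLDID(dml_data, ml_g=ml_g, ml_m=ml_m)
dml_did.fit()
print(dml_did.summary)  # the ATT estimate should be close to zero

For the cross-sectional variant, make_did_SZ2020(..., cross_sectional_data=True) returns data with a time column 't', which pairs with the new DoubleMLDIDCS class.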

doubleml/double_ml_blp.py

Lines changed: 16 additions & 15 deletions
@@ -7,21 +7,22 @@


 class DoubleMLBLP:
-    """Best linear predictor (BLP) for DoubleML with orthogonal signals. Mainly used for CATE and GATE estimation for IRM models.
-
-    Parameters
-    ----------
-    orth_signal : :class:`numpy.array`
-        The orthogonal signal to be predicted. Has to be of shape ``(n_obs,)``,
-        where ``n_obs`` is the number of observations.
-
-    basis : :class:`pandas.DataFrame`
-        The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
-        where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
-
-    is_gate : bool
-        Indicates whether the basis is constructed for GATEs (dummy-basis).
-        Default is ``False``.
+    """Best linear predictor (BLP) for DoubleML with orthogonal signals.
+    Mainly used for CATE and GATE estimation for IRM models.
+
+    Parameters
+    ----------
+    orth_signal : :class:`numpy.array`
+        The orthogonal signal to be predicted. Has to be of shape ``(n_obs,)``,
+        where ``n_obs`` is the number of observations.
+
+    basis : :class:`pandas.DataFrame`
+        The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
+        where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+
+    is_gate : bool
+        Indicates whether the basis is constructed for GATEs (dummy-basis).
+        Default is ``False``.
     """

     def __init__(self,
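For reference, a hedged sketch of how the documented parameters fit together for GATE estimation; the fit() call assumes the estimator exposes the package's usual fitting interface:

import numpy as np
import pandas as pd
from doubleml import DoubleMLBLP

# toy orthogonal signal and a dummy basis encoding three groups (GATE-style)
orth_signal = np.random.normal(size=100)
groups = np.random.randint(0, 3, size=100)
basis = pd.get_dummies(pd.Series(groups, name='group'), dtype=float)

blp = DoubleMLBLP(orth_signal, basis=basis, is_gate=True)
blp.fit()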

doubleml/double_ml_cvar.py

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ def __init__(self,

         self._check_data(self._dml_data)
         valid_score = ['CVaR']
-        _check_score(self.score, valid_score)
+        _check_score(self.score, valid_score, allow_callable=False)
         _check_quantile(self.quantile)
         _check_treatment(self.treatment)

