@@ -378,7 +378,7 @@ def make_iivm_data(n_obs=500, dim_x=20, theta=1., alpha_x=0.2, return_type='Doub
378378 :math:`\\ beta_j=\\ frac{1}{j^2}`.
379379
380380 The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and
381- Klaaßen (2020).
381+ Klaassen (2020).
382382
383383 Parameters
384384 ----------
@@ -705,3 +705,191 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1., return_
705705 return DoubleMLClusterData (data , 'Y' , 'D' , cluster_cols , x_cols , 'Z' )
706706 else :
707707 raise ValueError ('Invalid return_type.' )
708+
709+
710+ def make_did_SZ2020 (n_obs = 500 , dgp_type = 1 , cross_sectional_data = False , return_type = 'DoubleMLData' , ** kwargs ):
711+ """
712+ Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020).
713+ The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let
714+
715+ .. math::
716+
717+ f_{reg}(W) &= 210 + 27.4 \\ cdot W_1 +13.7 \\ cdot (W_2 + W_3 + W_4),
718+
719+ f_{ps}(W) &= 0.75 \\ cdot (-W_1 + 0.5 \\ cdot W_2 -0.25 \\ cdot W_3 - 0.1 \\ cdot W_4).
720+
721+
722+ Let :math:`X= (X_1, X_2, X_3, X_4)^T \\ sim \\ mathcal{N}(0, \\ Sigma)`, where :math:`\\ Sigma` is a matrix with entries
723+ :math:`\\ Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
724+ Further, define :math:`Z_j = (\\ tilde{Z_j} - \\ mathbb{E}[\\ tilde{Z}_j]) / \\ sqrt{\\ text{Var}(\\ tilde{Z}_j)}`,
725+ where :math:`\\ tilde{Z}_1 = \\ exp(0.5 \\ cdot X_1)`, :math:`\\ tilde{Z}_2 = 10 + X_2/(1 + \\ exp(X_1))`,
726+ :math:`\\ tilde{Z}_3 = (0.6 + X_1 \\ cdot X_3 / 25)^3` and :math:`\\ tilde{Z}_4 = (20 + X_2 + X_4)^2`.
727+ At first define
728+
729+ .. math::
730+
731+ Y_0(0) &= f_{reg}(W_{reg}) + \\ nu(W_{reg}, D) + \\ varepsilon_0,
732+
733+ Y_1(d) &= 2 \\ cdot f_{reg}(W_{reg}) + \\ nu(W_{reg}, D) + \\ varepsilon_1(d),
734+
735+ p(W_{ps}) &= \\ frac{\\ exp(f_{ps}(W_{ps}))}{1 + \\ exp(f_{ps}(W_{ps}))},
736+
737+ D &= 1\\ {p(W_{ps}) \\ ge U\\ },
738+
739+ where :math:`\\ varepsilon_0, \\ varepsilon_1(d), d=0, 1` are independent standard normal random variables,
740+ :math:`U \\ sim \\ mathcal{U}[0, 1]` is a independent standard uniform
741+ and :math:`\\ nu(W_{reg}, D)\\ sim \\ mathcal{N}(D \\ cdot f_{reg}(W_{reg}),1)`.
742+ The different data generating processes are defined via
743+
744+ .. math::
745+
746+ DGP1:\\ quad W_{reg} &= Z \\ quad W_{ps} = Z
747+
748+ DGP2:\\ quad W_{reg} &= Z \\ quad W_{ps} = X
749+
750+ DGP3:\\ quad W_{reg} &= X \\ quad W_{ps} = Z
751+
752+ DGP4:\\ quad W_{reg} &= X \\ quad W_{ps} = X
753+
754+ DGP5:\\ quad W_{reg} &= Z \\ quad W_{ps} = 0
755+
756+ DGP6:\\ quad W_{reg} &= X \\ quad W_{ps} = 0,
757+
758+ such that the last two settings correspond to an experimental setting with treatment probability
759+ of :math:`P(D=1) = \\ frac{1}{2}.`
760+ For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`.
761+ For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``.
762+ Then the outcome will be defined to be
763+
764+ .. math::
765+
766+ Y = T \\ cdot Y_1(D) + (1-T) \\ cdot Y_0(0),
767+
768+ where :math:`T = 1\\ {U_T\\ le \\ lambda_T \\ }` with :math:`U_T\\ sim \\ mathcal{U}[0, 1]` and :math:`\\ lambda_T=0.5`.
769+ The true average treatment effect on the treated is zero for all data generating processes.
770+
771+ Parameters
772+ ----------
773+ n_obs :
774+ The number of observations to simulate.
775+ dgp_type :
776+ The DGP to be used. Default value is ``1`` (integer).
777+ cross_sectional_data :
778+ Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``.
779+ return_type :
780+ If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
781+
782+ If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
783+
784+ If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``
785+ or ``(x, y, d, t)``.
786+ **kwargs
787+ Additional keyword arguments to set non-default values for the parameter
788+ :math:`xi=0.75`, :math:`c=0.0` and :math:`\\ lambda_T=0.5`.
789+
790+ References
791+ ----------
792+ Sant’Anna, P. H. and Zhao, J. (2020),
793+ Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122.
794+ doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
795+ """
796+ xi = kwargs .get ('xi' , 0.75 )
797+ c = kwargs .get ('c' , 0.0 )
798+ lambda_t = kwargs .get ('lambda_t' , 0.5 )
799+
800+ def f_reg (w ):
801+ res = 210 + 27.4 * w [:, 0 ] + 13.7 * (w [:, 1 ] + w [:, 2 ] + w [:, 3 ])
802+ return res
803+
804+ def f_ps (w , xi ):
805+ res = xi * (- w [:, 0 ] + 0.5 * w [:, 1 ] - 0.25 * w [:, 2 ] - 0.1 * w [:, 3 ])
806+ return res
807+
808+ dim_x = 4
809+ cov_mat = toeplitz ([np .power (c , k ) for k in range (dim_x )])
810+ x = np .random .multivariate_normal (np .zeros (dim_x ), cov_mat , size = [n_obs , ])
811+ # x = np.random.normal(loc=0, scale=1, size=[n_obs, 4])
812+
813+ z_tilde_1 = np .exp (0.5 * x [:, 0 ])
814+ z_tilde_2 = 10 + x [:, 1 ] / (1 + np .exp (x [:, 0 ]))
815+ z_tilde_3 = (0.6 + x [:, 0 ]* x [:, 2 ]/ 25 )** 3
816+ z_tilde_4 = (20 + x [:, 1 ] + x [:, 3 ])** 2
817+
818+ z_tilde = np .column_stack ((z_tilde_1 , z_tilde_2 , z_tilde_3 , z_tilde_4 ))
819+ z = (z_tilde - np .mean (z_tilde , axis = 0 )) / np .std (z_tilde , axis = 0 )
820+
821+ # error terms
822+ epsilon_0 = np .random .normal (loc = 0 , scale = 1 , size = n_obs )
823+ epsilon_1 = np .random .normal (loc = 0 , scale = 1 , size = [n_obs , 2 ])
824+
825+ if dgp_type == 1 :
826+ features_ps = z
827+ features_reg = z
828+ elif dgp_type == 2 :
829+ features_ps = x
830+ features_reg = z
831+ elif dgp_type == 3 :
832+ features_ps = z
833+ features_reg = x
834+ elif dgp_type == 4 :
835+ features_ps = x
836+ features_reg = x
837+ elif dgp_type == 5 :
838+ features_ps = None
839+ features_reg = z
840+ elif dgp_type == 6 :
841+ features_ps = None
842+ features_reg = x
843+ else :
844+ raise ValueError ('The dgp_type is not valid.' )
845+
846+ # treatment and propensities
847+ is_experimental = (dgp_type == 5 ) or (dgp_type == 6 )
848+ if is_experimental :
849+ # Set D to be experimental
850+ p = 0.5 * np .ones (n_obs )
851+ else :
852+ p = np .exp (f_ps (features_ps , xi )) / (1 + np .exp (f_ps (features_ps , xi )))
853+ u = np .random .uniform (low = 0 , high = 1 , size = n_obs )
854+ d = 1.0 * (p >= u )
855+
856+ # potential outcomes
857+ nu = np .random .normal (loc = d * f_reg (features_reg ), scale = 1 , size = n_obs )
858+ y0 = f_reg (features_reg ) + nu + epsilon_0
859+ y1_d0 = 2 * f_reg (features_reg ) + nu + epsilon_1 [:, 0 ]
860+ y1_d1 = 2 * f_reg (features_reg ) + nu + epsilon_1 [:, 1 ]
861+ y1 = d * y1_d1 + (1 - d ) * y1_d0
862+
863+ if not cross_sectional_data :
864+ y = y1 - y0
865+
866+ if return_type in _array_alias :
867+ return z , y , d
868+ elif return_type in _data_frame_alias + _dml_data_alias :
869+ z_cols = [f'Z{ i + 1 } ' for i in np .arange (dim_x )]
870+ data = pd .DataFrame (np .column_stack ((z , y , d )),
871+ columns = z_cols + ['y' , 'd' ])
872+ if return_type in _data_frame_alias :
873+ return data
874+ else :
875+ return DoubleMLData (data , 'y' , 'd' , z_cols )
876+ else :
877+ raise ValueError ('Invalid return_type.' )
878+
879+ else :
880+ u_t = np .random .uniform (low = 0 , high = 1 , size = n_obs )
881+ t = 1.0 * (u_t <= lambda_t )
882+ y = t * y1 + (1 - t )* y0
883+
884+ if return_type in _array_alias :
885+ return z , y , d , t
886+ elif return_type in _data_frame_alias + _dml_data_alias :
887+ z_cols = [f'Z{ i + 1 } ' for i in np .arange (dim_x )]
888+ data = pd .DataFrame (np .column_stack ((z , y , d , t )),
889+ columns = z_cols + ['y' , 'd' , 't' ])
890+ if return_type in _data_frame_alias :
891+ return data
892+ else :
893+ return DoubleMLData (data , 'y' , 'd' , z_cols , t_col = 't' )
894+ else :
895+ raise ValueError ('Invalid return_type.' )
0 commit comments