Commit cdb0cf8
Merge pull request #324 from DoubleML/s-update-var-aggregation
Update Variance aggregation
2 parents 83dd94f + 808b197 commit cdb0cf8

9 files changed: +46 additions, -45 deletions

doubleml/double_ml.py
Lines changed: 2 additions & 2 deletions

@@ -537,7 +537,7 @@ def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None,
     self._fit_sensitivity_elements(nuisance_predictions)

     # aggregated parameter estimates and standard errors from repeated cross-fitting
-    self.coef, self.se = _aggregate_coefs_and_ses(self._all_coef, self._all_se, self._var_scaling_factors)
+    self.coef, self.se = _aggregate_coefs_and_ses(self._all_coef, self._all_se)

     # validate sensitivity elements (e.g., re-estimate nu2 if negative)
     self._validate_sensitivity_elements()

@@ -1392,7 +1392,7 @@ def _est_causal_pars_and_se(self):
     self._all_se[self._i_treat, self._i_rep], self._var_scaling_factors[self._i_treat] = self._se_causal_pars()

     # aggregated parameter estimates and standard errors from repeated cross-fitting
-    self.coef, self.se = _aggregate_coefs_and_ses(self._all_coef, self._all_se, self._var_scaling_factors)
+    self.coef, self.se = _aggregate_coefs_and_ses(self._all_coef, self._all_se)

     # Score estimation and elements
     @abstractmethod

doubleml/double_ml_framework.py
Lines changed: 4 additions & 4 deletions

@@ -310,7 +310,7 @@ def __add__(self, other):
     # compute standard errors (Uses factor 1/n for scaling!)
     sigma2_hat = np.divide(np.mean(np.square(scaled_psi), axis=0), var_scaling_factors.reshape(-1, 1))
     all_ses = np.sqrt(sigma2_hat)
-    thetas, ses = _aggregate_coefs_and_ses(all_thetas, all_ses, var_scaling_factors)
+    thetas, ses = _aggregate_coefs_and_ses(all_thetas, all_ses)

     doubleml_dict = {
         "thetas": thetas,

@@ -358,7 +358,7 @@ def __sub__(self, other):
     # compute standard errors
     sigma2_hat = np.divide(np.mean(np.square(scaled_psi), axis=0), var_scaling_factors.reshape(-1, 1))
     all_ses = np.sqrt(sigma2_hat)
-    thetas, ses = _aggregate_coefs_and_ses(all_thetas, all_ses, var_scaling_factors)
+    thetas, ses = _aggregate_coefs_and_ses(all_thetas, all_ses)

     doubleml_dict = {
         "thetas": thetas,

@@ -507,8 +507,8 @@ def _calc_sensitivity_analysis(self, cf_y, cf_d, rho, level):
     all_sigma_upper[i_theta, i_rep] = np.sqrt(sigma2_upper_hat)

     # aggregate coefs and ses over n_rep
-    theta_lower, sigma_lower = _aggregate_coefs_and_ses(all_theta_lower, all_sigma_lower, self._var_scaling_factors)
-    theta_upper, sigma_upper = _aggregate_coefs_and_ses(all_theta_upper, all_sigma_upper, self._var_scaling_factors)
+    theta_lower, sigma_lower = _aggregate_coefs_and_ses(all_theta_lower, all_sigma_lower)
+    theta_upper, sigma_upper = _aggregate_coefs_and_ses(all_theta_upper, all_sigma_upper)

     # per repetition confidence intervals
     quant = norm.ppf(level)

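For orientation, the per-repetition standard errors that feed into the aggregation are still computed from the scaled score exactly as in the unchanged context lines above. A minimal numpy sketch of that step follows; the array shapes and the simulated scores are illustrative assumptions, not part of this PR:

import numpy as np

# illustrative setup: n_obs observations, n_thetas parameters, n_rep repetitions
n_obs, n_thetas, n_rep = 500, 2, 3
rng = np.random.default_rng(0)
scaled_psi = rng.normal(size=(n_obs, n_thetas, n_rep))  # score rescaled by the mean derivative (toy data)
var_scaling_factors = np.full(n_thetas, n_obs)          # factor 1/n for scaling

# per-repetition variance and standard error, shape (n_thetas, n_rep)
sigma2_hat = np.divide(np.mean(np.square(scaled_psi), axis=0), var_scaling_factors.reshape(-1, 1))
all_ses = np.sqrt(sigma2_hat)

These per-repetition standard errors are what _aggregate_coefs_and_ses now combines without seeing the scaling factors again; see the sketch after the doubleml/utils/_estimation.py hunk below.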
doubleml/plm/tests/_utils_plr_manual.py
Lines changed: 6 additions & 4 deletions

@@ -21,7 +21,6 @@ def fit_plr_multitreat(
     g_params=None,
     use_other_treat_as_covariate=True,
 ):
-    n_obs = len(y)
     n_d = d.shape[1]

     thetas = list()

@@ -62,7 +61,9 @@ def fit_plr_multitreat(
     theta_vec = np.array([xx[i_d] for xx in thetas])
     se_vec = np.array([xx[i_d] for xx in ses])
     theta[i_d] = np.median(theta_vec)
-    se[i_d] = np.sqrt(np.median(np.power(se_vec, 2) * n_obs + np.power(theta_vec - theta[i_d], 2)) / n_obs)
+    upper_bound_vec = theta_vec + 1.96 * se_vec
+    upper_bound = np.median(upper_bound_vec)
+    se[i_d] = (upper_bound - theta[i_d]) / 1.96

     res = {
         "theta": theta,

@@ -78,7 +79,6 @@ def fit_plr_multitreat(


 def fit_plr(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, n_rep=1, l_params=None, m_params=None, g_params=None):
-    n_obs = len(y)

     thetas = np.zeros(n_rep)
     ses = np.zeros(n_rep)

@@ -95,7 +95,9 @@ def fit_plr(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, n_rep=1,
     all_g_hat.append(g_hat)

     theta = np.median(thetas)
-    se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs)
+    upper_bound_vec = thetas + 1.96 * ses
+    upper_bound = np.median(upper_bound_vec)
+    se = (upper_bound - theta) / 1.96

     res = {
         "theta": theta,

doubleml/tests/_utils.py
Lines changed: 0 additions & 1 deletion

@@ -101,7 +101,6 @@ def generate_dml_dict(psi_a, psi_b):
     thetas, ses = _aggregate_coefs_and_ses(
         all_coefs=all_thetas,
         all_ses=all_ses,
-        var_scaling_factors=var_scaling_factors,
     )
     scaled_psi = psi_b / np.mean(psi_a, axis=0)

doubleml/tests/_utils_doubleml_sensitivity_manual.py
Lines changed: 2 additions & 2 deletions

@@ -31,8 +31,8 @@ def doubleml_sensitivity_manual(sensitivity_elements, all_coefs, psi, psi_deriv,
     all_sigma_lower = np.transpose(np.sqrt(np.divide(np.mean(np.square(psi_lower), axis=0), var_scaling_factor)))
     all_sigma_upper = np.transpose(np.sqrt(np.divide(np.mean(np.square(psi_upper), axis=0), var_scaling_factor)))

-    theta_lower, sigma_lower = _aggregate_coefs_and_ses(all_theta_lower, all_sigma_lower, var_scaling_factor)
-    theta_upper, sigma_upper = _aggregate_coefs_and_ses(all_theta_upper, all_sigma_upper, var_scaling_factor)
+    theta_lower, sigma_lower = _aggregate_coefs_and_ses(all_theta_lower, all_sigma_lower)
+    theta_upper, sigma_upper = _aggregate_coefs_and_ses(all_theta_upper, all_sigma_upper)

     quant = norm.ppf(level)

doubleml/tests/test_framework_sensitivity.py
Lines changed: 7 additions & 6 deletions

@@ -1,3 +1,4 @@
+import numpy as np
 import pytest
 from sklearn.linear_model import LinearRegression, LogisticRegression

@@ -121,19 +122,19 @@ def test_dml_framework_sensitivity_operations(dml_framework_sensitivity_fixture)
     mul_zero_obj = dml_framework_sensitivity_fixture["dml_framework_obj_mul_zero_obj"]

     for key in ["theta", "se", "ci"]:
-        assert add_obj.sensitivity_params[key]["upper"] == mul_obj.sensitivity_params[key]["upper"]
-        assert add_obj.sensitivity_params[key]["lower"] == mul_obj.sensitivity_params[key]["lower"]
+        assert np.allclose(add_obj.sensitivity_params[key]["upper"], mul_obj.sensitivity_params[key]["upper"])
+        assert np.allclose(add_obj.sensitivity_params[key]["lower"], mul_obj.sensitivity_params[key]["lower"])

         assert mul_zero_obj.sensitivity_params[key]["upper"] == 0
         assert mul_zero_obj.sensitivity_params[key]["lower"] == 0

-    assert add_obj.sensitivity_params["rv"] == mul_obj.sensitivity_params["rv"]
+    assert np.allclose(add_obj.sensitivity_params["rv"], mul_obj.sensitivity_params["rv"])
     assert mul_zero_obj.sensitivity_params["rv"] > 0.99  # due to degenerated variance
     assert mul_zero_obj.sensitivity_params["rva"] > 0.99  # due to degenerated variance

     sub_obj = dml_framework_sensitivity_fixture["dml_framework_obj_sub_obj2"]
-    assert sub_obj.sensitivity_params["theta"]["upper"] == -1 * sub_obj.sensitivity_params["theta"]["lower"]
-    assert sub_obj.sensitivity_params["se"]["upper"] == sub_obj.sensitivity_params["se"]["lower"]
-    assert sub_obj.sensitivity_params["ci"]["upper"] == -1 * sub_obj.sensitivity_params["ci"]["lower"]
+    assert np.allclose(sub_obj.sensitivity_params["theta"]["upper"], -1 * sub_obj.sensitivity_params["theta"]["lower"])
+    assert np.allclose(sub_obj.sensitivity_params["se"]["upper"], sub_obj.sensitivity_params["se"]["lower"])
+    assert np.allclose(sub_obj.sensitivity_params["ci"]["upper"], -1 * sub_obj.sensitivity_params["ci"]["lower"])
     assert sub_obj.sensitivity_params["rv"] < 0.01
     assert sub_obj.sensitivity_params["rva"] < 0.01

doubleml/tests/test_multiway_cluster.py
Lines changed: 4 additions & 4 deletions

@@ -121,10 +121,10 @@ def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner, score):
     ses[i_rep] = se[0]

     theta = np.median(thetas)
-    n_clusters1 = len(np.unique(obj_dml_cluster_data.cluster_vars[:, 0]))
-    n_clusters2 = len(np.unique(obj_dml_cluster_data.cluster_vars[:, 1]))
-    var_scaling_factor = min(n_clusters1, n_clusters2)
-    se = np.sqrt(np.median(np.power(ses, 2) * var_scaling_factor + np.power(thetas - theta, 2)) / var_scaling_factor)
+
+    all_upper_bounds = thetas + 1.96 * ses
+    upper_bound = np.median(all_upper_bounds, axis=0)
+    se = (upper_bound - theta) / 1.96

     res_dict = {
         "coef": dml_pliv_obj.coef,

doubleml/utils/_estimation.py
Lines changed: 8 additions & 11 deletions

@@ -239,20 +239,17 @@ def abs_ipw_score(theta):
     return ipw_est


-def _aggregate_coefs_and_ses(all_coefs, all_ses, var_scaling_factors):
-    if var_scaling_factors.shape == (all_coefs.shape[0],):
-        scaling_factors = np.repeat(var_scaling_factors[:, np.newaxis], all_coefs.shape[1], axis=1)
-    else:
-        scaling_factors = var_scaling_factors
+def _aggregate_coefs_and_ses(all_coefs, all_ses):
+    # already expects equally scaled variances over all repetitions
     # aggregation is done over dimension 1, such that the coefs and ses have to be of shape (n_coefs, n_rep)
     coefs = np.median(all_coefs, 1)

-    coefs_deviations = np.square(all_coefs - coefs.reshape(-1, 1))
-    scaled_coef_deviations = np.divide(coefs_deviations, scaling_factors)
-    all_variances = np.square(all_ses) + scaled_coef_deviations
-
-    var = np.median(all_variances, 1)
-    ses = np.sqrt(var)
+    # construct the upper bounds & aggregate
+    critical_value = 1.96
+    all_upper_bounds = all_coefs + critical_value * all_ses
+    agg_upper_bounds = np.median(all_upper_bounds, axis=1)
+    # reverse to calculate the standard errors
+    ses = (agg_upper_bounds - coefs) / critical_value
     return coefs, ses

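To see the new aggregation in isolation, here is a minimal, self-contained sketch that mirrors the rewritten helper above and applies it to toy per-repetition estimates. The toy numbers are purely illustrative; in the package the helper is the private function from the hunk above, imported from doubleml.utils._estimation.

import numpy as np

def aggregate_coefs_and_ses(all_coefs, all_ses):
    # both inputs have shape (n_coefs, n_rep); the standard errors are assumed
    # to be on the same scale across repetitions
    coefs = np.median(all_coefs, axis=1)
    # aggregate the per-repetition upper confidence bounds, then back out the standard error
    critical_value = 1.96
    agg_upper_bounds = np.median(all_coefs + critical_value * all_ses, axis=1)
    ses = (agg_upper_bounds - coefs) / critical_value
    return coefs, ses

# toy example: one coefficient estimated in three repetitions
all_coefs = np.array([[0.48, 0.52, 0.50]])
all_ses = np.array([[0.10, 0.12, 0.11]])
coefs, ses = aggregate_coefs_and_ses(all_coefs, all_ses)
print(coefs, ses)  # median estimate 0.5; se of roughly 0.11, backed out from the median upper bound

Because both the point estimate and the upper bound are aggregated with a plain median, the backed-out standard error no longer depends on per-repetition variance scaling factors, which is why the var_scaling_factors argument is dropped at every call site in this diff.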
doubleml/utils/tests/test_var_est_and_aggregation.py
Lines changed: 13 additions & 11 deletions

@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from scipy.stats import norm

 from doubleml.utils._estimation import _aggregate_coefs_and_ses, _var_est

@@ -14,19 +15,24 @@ def n_coefs(request):
     return request.param


+@pytest.fixture(scope="module", params=[0.9, 0.95, 0.975])
+def level(request):
+    return request.param
+
+
 @pytest.fixture(scope="module")
-def test_var_est_and_aggr_fixture(n_rep, n_coefs):
+def test_var_est_and_aggr_fixture(n_rep, n_coefs, level):
     np.random.seed(42)

     all_thetas = np.full((n_coefs, n_rep), np.nan)
     all_ses = np.full((n_coefs, n_rep), np.nan)
-    expected_all_var = np.full((n_coefs, n_rep), np.nan)
+    expected_all_upper_bounds = np.full((n_coefs, n_rep), np.nan)
     all_var_scaling_factors = np.full((n_coefs, n_rep), np.nan)

     for i_coef in range(n_coefs):
         n_obs = np.random.randint(100, 200)
         for i_rep in range(n_rep):
-            psi = np.random.normal(size=(n_obs))
+            psi = np.random.normal(size=(n_obs), loc=i_coef, scale=i_coef + 1)
             psi_deriv = np.ones((n_obs))

             all_thetas[i_coef, i_rep] = np.mean(psi)

@@ -37,27 +43,23 @@ def test_var_est_and_aggr_fixture(n_rep, n_coefs):
             all_var_scaling_factors[i_coef, i_rep] = var_scaling_factor

     expected_theta = np.median(all_thetas, axis=1)
+    critical_value = norm.ppf(level)
     for i_coef in range(n_coefs):
         for i_rep in range(n_rep):
-            theta_deviation = np.square(all_thetas[i_coef, i_rep] - expected_theta[i_coef])
-            expected_all_var[i_coef, i_rep] = np.square(all_ses[i_coef, i_rep]) + np.divide(
-                theta_deviation, all_var_scaling_factors[i_coef, i_rep]
-            )
+            expected_all_upper_bounds[i_coef, i_rep] = all_thetas[i_coef, i_rep] + critical_value * all_ses[i_coef, i_rep]

-    expected_se = np.sqrt(np.median(expected_all_var, axis=1))
+    expected_upper_bounds = np.median(expected_all_upper_bounds, axis=1)
+    expected_se = (expected_upper_bounds - expected_theta) / critical_value

-    # without n_rep
     theta, se = _aggregate_coefs_and_ses(
         all_coefs=all_thetas,
         all_ses=all_ses,
-        var_scaling_factors=all_var_scaling_factors[:, 0],
     )

     # with n_rep
     theta_2, se_2 = _aggregate_coefs_and_ses(
         all_coefs=all_thetas,
         all_ses=all_ses,
-        var_scaling_factors=all_var_scaling_factors,
     )

     result_dict = {
