Skip to content

Commit 99e78bf

Browse files
committed
PR Review
1 parent 5d9e0eb commit 99e78bf

File tree

10 files changed

+212
-138
lines changed

10 files changed

+212
-138
lines changed

doubleml/double_ml.py

Lines changed: 5 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -258,13 +258,6 @@ def learner(self):
258258
"""
259259
return self._learner
260260

261-
@property
262-
def predictions_names(self):
263-
"""
264-
The names of predictions for the nuisance functions.
265-
"""
266-
return list(self.params_names)
267-
268261
@property
269262
def learner_names(self):
270263
"""
@@ -1088,7 +1081,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m
10881081
_check_external_predictions(
10891082
external_predictions=external_predictions,
10901083
valid_treatments=self._dml_data.d_cols,
1091-
valid_learners=self.predictions_names,
1084+
valid_learners=self.params_names,
10921085
n_obs=self.n_obs,
10931086
n_rep=self.n_rep,
10941087
)
@@ -1111,7 +1104,7 @@ def _initalize_fit(self, store_predictions, store_models):
11111104
def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models):
11121105
ext_prediction_dict = _set_external_predictions(
11131106
external_predictions,
1114-
learners=self.predictions_names,
1107+
learners=self.params_names,
11151108
treatment=self._dml_data.d_cols[self._i_treat],
11161109
i_rep=self._i_rep,
11171110
)
@@ -1178,8 +1171,8 @@ def _initialize_arrays(self):
11781171
self._all_se = np.full((n_thetas, n_rep), np.nan)
11791172

11801173
def _initialize_predictions_and_targets(self):
1181-
self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
1182-
self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
1174+
self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
1175+
self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
11831176

11841177
def _initialize_nuisance_loss(self):
11851178
self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names}
@@ -1190,7 +1183,7 @@ def _initialize_models(self):
11901183
}
11911184

11921185
def _store_predictions_and_targets(self, preds, targets):
1193-
for learner in self.predictions_names:
1186+
for learner in self.params_names:
11941187
self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner]
11951188
self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner]
11961189

doubleml/plm/lplr.py

Lines changed: 26 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
from doubleml.utils._estimation import (
1414
_dml_cv_predict,
1515
_dml_tune,
16+
_double_dml_cv_predict,
1617
)
1718

1819

@@ -104,10 +105,6 @@ def __init__(
104105

105106
ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True)
106107
self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M}
107-
# replace aggregated inner names with per-inner-fold names
108-
inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds_inner)]
109-
inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds_inner)]
110-
self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names
111108

112109
if ml_a is not None:
113110
ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True)
@@ -162,56 +159,15 @@ def __init__(
162159
self._sensitivity_implemented = False
163160

164161
def _initialize_ml_nuisance_params(self):
165-
self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner}
162+
inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds)]
163+
inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds)]
164+
params_names = ["ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names
165+
self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in params_names}
166166

167167
def _check_data(self, obj_dml_data):
168168
if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]):
169169
raise TypeError("The outcome variable y must be binary with values 0 and 1.")
170170

171-
def _double_dml_cv_predict(
172-
self,
173-
estimator,
174-
estimator_name,
175-
x,
176-
y,
177-
smpls=None,
178-
smpls_inner=None,
179-
n_jobs=None,
180-
est_params=None,
181-
method="predict",
182-
sample_weights=None,
183-
):
184-
res = {}
185-
res["preds"] = np.zeros(y.shape, dtype=float)
186-
res["preds_inner"] = []
187-
res["targets_inner"] = []
188-
res["models"] = []
189-
for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner):
190-
res_inner = _dml_cv_predict(
191-
estimator,
192-
x,
193-
y,
194-
smpls=smpls_double_split,
195-
n_jobs=n_jobs,
196-
est_params=est_params,
197-
method=method,
198-
return_models=True,
199-
sample_weights=sample_weights,
200-
)
201-
_check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split)
202-
203-
res["preds_inner"].append(res_inner["preds"])
204-
res["targets_inner"].append(res_inner["targets"])
205-
for model in res_inner["models"]:
206-
res["models"].append(model)
207-
if method == "predict_proba":
208-
res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1]
209-
else:
210-
res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]])
211-
res["preds"] /= len(smpls)
212-
res["targets"] = np.copy(y)
213-
return res
214-
215171
def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
216172
x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
217173
x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
@@ -234,9 +190,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
234190
f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
235191
)
236192
M_hat_inner = [external_predictions[f"ml_M_inner_{i}"] for i in range(self.n_folds_inner)]
237-
M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None}
193+
M_hat = {
194+
"preds": external_predictions["ml_M"],
195+
"preds_inner": M_hat_inner,
196+
"targets": self._dml_data.y,
197+
"models": None,
198+
}
238199
else:
239-
M_hat = self._double_dml_cv_predict(
200+
M_hat = _double_dml_cv_predict(
240201
self._learner["ml_M"],
241202
"ml_M",
242203
x_d_concat,
@@ -250,7 +211,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
250211

251212
# nuisance m
252213
if m_external:
253-
m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None}
214+
m_hat = {"preds": external_predictions["ml_m"], "targets": self._dml_data.d, "models": None}
254215
else:
255216
if self.score == "instrument":
256217
weights = M_hat["preds"] * (1 - M_hat["preds"])
@@ -303,9 +264,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
303264
f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
304265
)
305266
a_hat_inner = [external_predictions[f"ml_a_inner_{i}"] for i in range(self.n_folds_inner)]
306-
a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None}
267+
a_hat = {
268+
"preds": external_predictions["ml_a"],
269+
"preds_inner": a_hat_inner,
270+
"targets": self._dml_data.d,
271+
"models": None,
272+
}
307273
else:
308-
a_hat = self._double_dml_cv_predict(
274+
a_hat = _double_dml_cv_predict(
309275
self._learner["ml_a"],
310276
"ml_a",
311277
x,
@@ -404,13 +370,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
404370

405371
return psi_elements, preds
406372

407-
@property
408-
def predictions_names(self):
409-
"""
410-
The names of predictions for the nuisance functions.
411-
"""
412-
return self._predictions_names
413-
414373
def _score_elements(self, y, d, r_hat, m_hat):
415374
# compute residual
416375
d_tilde = d - m_hat
@@ -438,8 +397,6 @@ def _sensitivity_element_est(self, preds):
438397
def _nuisance_tuning(
439398
self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search
440399
):
441-
if self._i_rep is None:
442-
raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.")
443400
x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
444401
x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
445402
x_d_concat = np.hstack((d.reshape(-1, 1), x))
@@ -500,34 +457,16 @@ def _nuisance_tuning(
500457
a_best_params = [xx.best_params_ for xx in a_tune_res]
501458

502459
# Create targets for tuning ml_t
503-
M_hat = self._double_dml_cv_predict(
504-
self._learner["ml_M"],
505-
"ml_M",
506-
x_d_concat,
507-
y,
508-
smpls=smpls,
509-
smpls_inner=self._DoubleML__smpls__inner,
510-
n_jobs=n_jobs_cv,
511-
est_params=M_best_params,
512-
method=self._predict_method["ml_M"],
513-
)
514460

515-
W_inner = []
516-
for i, (train, _) in enumerate(smpls):
517-
M_iteration = M_hat["preds_inner"][i][train]
518-
M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8)
519-
w = scipy.special.logit(M_iteration)
520-
W_inner.append(w)
461+
M_hat = np.full_like(y, np.nan)
462+
for idx, (train_index, _) in enumerate(smpls):
463+
M_hat[train_index] = M_tune_res[idx].predict_proba(x_d_concat[train_index, :])[:, 1]
521464

522-
# Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN
523-
W_targets = []
524-
for i, train in enumerate(train_inds):
525-
wt = np.full(x.shape[0], np.nan, dtype=float)
526-
wt[train] = W_inner[i]
527-
W_targets.append(wt)
465+
M_hat = np.clip(M_hat, 1e-8, 1 - 1e-8)
466+
W_hat = scipy.special.logit(M_hat)
528467

529468
t_tune_res = _dml_tune(
530-
W_inner,
469+
W_hat,
531470
x,
532471
train_inds,
533472
self._learner["ml_t"],
@@ -537,7 +476,6 @@ def _nuisance_tuning(
537476
n_jobs_cv,
538477
search_mode,
539478
n_iter_randomized_search,
540-
fold_specific_target=True,
541479
)
542480
t_best_params = [xx.best_params_ for xx in t_tune_res]
543481

doubleml/plm/tests/test_lplr.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,22 +7,22 @@
77
from doubleml.plm.datasets import make_lplr_LZZ2020
88

99

10-
@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
10+
@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)])
1111
def learner_M(request):
1212
return request.param
1313

1414

15-
@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
15+
@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)])
1616
def learner_t(request):
1717
return request.param
1818

1919

20-
@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
20+
@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)])
2121
def learner_m(request):
2222
return request.param
2323

2424

25-
@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
25+
@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)])
2626
def learner_m_classifier(request):
2727
return request.param
2828

@@ -33,7 +33,6 @@ def score(request):
3333

3434

3535
@pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"])
36-
# TODO: Error for continuous treatment?
3736
def treatment(request):
3837
return request.param
3938

doubleml/plm/tests/test_lplr_exceptions.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,9 @@
1414
# create test data and basic learners
1515
dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20)
1616
dml_data_binary = make_lplr_LZZ2020(alpha=0.5, n_obs=n, treatment="binary", dim_x=20)
17-
ml_M = RandomForestClassifier()
18-
ml_t = RandomForestRegressor()
19-
ml_m = RandomForestRegressor()
17+
ml_M = RandomForestClassifier(max_depth=2, n_estimators=10)
18+
ml_t = RandomForestRegressor(max_depth=2, n_estimators=10)
19+
ml_m = RandomForestRegressor(max_depth=2, n_estimators=10)
2020
dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
2121
dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument")
2222

doubleml/plm/tests/test_lplr_tune.py

Lines changed: 1 addition & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ def dml_lplr_fixture(
4444
learner_m,
4545
learner_a,
4646
score,
47-
tune_on_folds=True,
47+
tune_on_folds=False,
4848
):
4949
par_grid = {
5050
"ml_M": get_par_grid(),
@@ -94,28 +94,3 @@ def test_dml_selection_coef(dml_lplr_fixture):
9494
se = dml_lplr_fixture["se"]
9595
true_coef = dml_lplr_fixture["true_coef"]
9696
assert abs(coef - true_coef) <= 3.0 * np.sqrt(se)
97-
98-
99-
@pytest.mark.ci
100-
def test_lplr_exception_tuning(
101-
learner_M,
102-
learner_t,
103-
learner_m,
104-
learner_a,
105-
):
106-
# LPLR valid scores are 'nuisance_space' and 'instrument'
107-
obj_dml_data = make_lplr_LZZ2020(alpha=0.5)
108-
ml_M = clone(learner_M)
109-
ml_t = clone(learner_t)
110-
ml_m = clone(learner_m)
111-
112-
dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
113-
par_grid = {
114-
"ml_M": get_par_grid(),
115-
"ml_t": get_par_grid(),
116-
"ml_m": get_par_grid(),
117-
"ml_a": get_par_grid(),
118-
}
119-
msg = "tune_on_folds must be True as targets have to be created for ml_t on folds."
120-
with pytest.raises(ValueError, match=msg):
121-
dml_lplr_obj.tune(par_grid, tune_on_folds=False)
Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,51 @@
1+
import pytest
2+
from sklearn.linear_model import LinearRegression, LogisticRegression
3+
4+
from doubleml import DoubleMLLPLR
5+
from doubleml.plm.datasets import make_lplr_LZZ2020
6+
from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap
7+
8+
dml_data_lplr = make_lplr_LZZ2020(n_obs=100)
9+
10+
dml_lplr_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression())
11+
12+
13+
@pytest.mark.ci
14+
def test_lplr_defaults():
15+
_check_basic_defaults_before_fit(dml_lplr_obj)
16+
17+
_fit_bootstrap(dml_lplr_obj)
18+
19+
_check_basic_defaults_after_fit(dml_lplr_obj)
20+
21+
22+
@pytest.mark.ci
23+
def test_did_multi_str():
24+
# Test the string representation before fitting
25+
dml_str = str(dml_lplr_obj)
26+
27+
# Check that all important sections are present
28+
assert "================== DoubleMLLPLR Object ==================" in dml_str
29+
assert "------------------ Data Summary ------------------" in dml_str
30+
assert "------------------ Score & Algorithm ------------------" in dml_str
31+
assert "------------------ Machine Learner ------------------" in dml_str
32+
assert "------------------ Resampling ------------------" in dml_str
33+
assert "------------------ Fit Summary ------------------" in dml_str
34+
35+
# Check specific content before fitting
36+
assert "No. folds: 5" in dml_str
37+
assert "No. repeated sample splits: 1" in dml_str
38+
assert "Learner ml_M:" in dml_str
39+
assert "Learner ml_m:" in dml_str
40+
assert "Learner ml_t:" in dml_str
41+
42+
# Fit the model
43+
dml_lplr_obj_fit = dml_lplr_obj.fit()
44+
dml_str_after_fit = str(dml_lplr_obj_fit)
45+
46+
# Check that additional information is present after fitting
47+
assert "coef" in dml_str_after_fit
48+
assert "std err" in dml_str_after_fit
49+
assert "t" in dml_str_after_fit
50+
assert "P>|t|" in dml_str_after_fit
51+
assert "Out-of-sample Performance:" in dml_str_after_fit

0 commit comments

Comments
 (0)