Skip to content

Commit 60f4a43

Browse files
Merge pull request #276 from DoubleML/o-rdd
Add Flexible Covariate Adjustments for Regression Discontinuity Designs
2 parents 0ee11dd + 6fff8c5 commit 60f4a43

26 files changed

+2277
-30
lines changed

doubleml/double_ml_data.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ class DoubleMLData(DoubleMLBaseData):
110110
Default is ``None``.
111111
112112
s_col : None or str
113-
The selection variable (only relevant/used for SSM Estimatiors).
113+
The score or selection variable (only relevant/used for RDD or SSM Estimators).
114114
Default is ``None``.
115115
116116
use_other_treat_as_covariate : bool
@@ -182,7 +182,7 @@ def _data_summary_str(self):
182182
if self.t_col is not None:
183183
data_summary += f'Time variable: {self.t_col}\n'
184184
if self.s_col is not None:
185-
data_summary += f'Selection variable: {self.s_col}\n'
185+
data_summary += f'Score/Selection variable: {self.s_col}\n'
186186
data_summary += f'No. Observations: {self.n_obs}\n'
187187
return data_summary
188188

@@ -212,7 +212,7 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
212212
Default is ``None``.
213213
214214
s : :class:`numpy.ndarray`
215-
Array of the selection variable (only relevant/used for SSM models).
215+
Array of the score or selection variable (only relevant/used for RDD and SSM models).
216216
Default is ``None``.
217217
218218
use_other_treat_as_covariate : bool
@@ -351,7 +351,7 @@ def t(self):
351351
@property
352352
def s(self):
353353
"""
354-
Array of selection variable.
354+
Array of score or selection variable.
355355
"""
356356
if self.s_col is not None:
357357
return self._s.values
@@ -538,7 +538,7 @@ def t_col(self, value):
538538
@property
539539
def s_col(self):
540540
"""
541-
The selection variable.
541+
The score or selection variable.
542542
"""
543543
return self._s_col
544544

@@ -547,10 +547,10 @@ def s_col(self, value):
547547
reset_value = hasattr(self, '_s_col')
548548
if value is not None:
549549
if not isinstance(value, str):
550-
raise TypeError('The selection variable s_col must be of str type (or None). '
550+
raise TypeError('The score or selection variable s_col must be of str type (or None). '
551551
f'{str(value)} of type {str(type(value))} was passed.')
552552
if value not in self.all_variables:
553-
raise ValueError('Invalid selection variable s_col. '
553+
raise ValueError('Invalid score or selection variable s_col. '
554554
f'{value} is no data column.')
555555
self._s_col = value
556556
else:
@@ -725,24 +725,24 @@ def _check_disjoint_sets_t_s(self):
725725
if self.s_col is not None:
726726
s_col_set = {self.s_col}
727727
if not s_col_set.isdisjoint(x_cols_set):
728-
raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and covariate in '
728+
raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and covariate in '
729729
'``x_cols``.')
730730
if not s_col_set.isdisjoint(d_cols_set):
731-
raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and treatment variable in '
732-
'``d_cols``.')
731+
raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment '
732+
'variable in ``d_cols``.')
733733
if not s_col_set.isdisjoint(y_col_set):
734-
raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and outcome variable '
735-
'``y_col``.')
734+
raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome '
735+
'variable ``y_col``.')
736736
if self.z_cols is not None:
737737
z_cols_set = set(self.z_cols)
738738
if not s_col_set.isdisjoint(z_cols_set):
739-
raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and instrumental '
740-
'variable in ``z_cols``.')
739+
raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and '
740+
'instrumental variable in ``z_cols``.')
741741
if self.t_col is not None:
742742
t_col_set = {self.t_col}
743743
if not s_col_set.isdisjoint(t_col_set):
744-
raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and time variable '
745-
'``t_col``.')
744+
raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time '
745+
'variable ``t_col``.')
746746

747747

748748
class DoubleMLClusterData(DoubleMLData):
@@ -780,7 +780,7 @@ class DoubleMLClusterData(DoubleMLData):
780780
Default is ``None``.
781781
782782
s_col : None or str
783-
The selection variable (only relevant/used for SSM Estimatiors).
783+
The score or selection variable (only relevant/used for RDD and SSM Estimators).
784784
Default is ``None``.
785785
786786
use_other_treat_as_covariate : bool
@@ -854,7 +854,7 @@ def _data_summary_str(self):
854854
if self.t_col is not None:
855855
data_summary += f'Time variable: {self.t_col}\n'
856856
if self.s_col is not None:
857-
data_summary += f'Selection variable: {self.s_col}\n'
857+
data_summary += f'Score/Selection variable: {self.s_col}\n'
858858

859859
data_summary += f'No. Observations: {self.n_obs}\n'
860860
return data_summary
@@ -888,7 +888,7 @@ def from_arrays(cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_tr
888888
Default is ``None``.
889889
890890
s : :class:`numpy.ndarray`
891-
Array of the selection variable (only relevant/used for SSM models).
891+
Array of the score or selection variable (only relevant/used for RDD or SSM models).
892892
Default is ``None``.
893893
894894
use_other_treat_as_covariate : bool
@@ -1039,7 +1039,7 @@ def _check_disjoint_sets_cluster_cols(self):
10391039
'cluster variable in ``cluster_cols``.')
10401040
if self.s_col is not None:
10411041
if not s_col_set.isdisjoint(cluster_cols_set):
1042-
raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and '
1042+
raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and '
10431043
'cluster variable in ``cluster_cols``.')
10441044

10451045
def _set_cluster_vars(self):

doubleml/rdd/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""
2+
The :mod:`doubleml.rdd` module implements double machine learning estimates for regression discontinuity designs.
3+
"""
4+
5+
from .rdd import RDFlex
6+
7+
__all__ = [
8+
"RDFlex",
9+
]

doubleml/rdd/datasets/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""
2+
The :mod:`doubleml.rdd.datasets` module implements data generating processes for regression discontinuity designs.
3+
"""
4+
5+
from .simple_dgp import make_simple_rdd_data
6+
7+
__all__ = [
8+
"make_simple_rdd_data",
9+
]
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import numpy as np
2+
from numpy.polynomial.polynomial import Polynomial
3+
4+
5+
def make_simple_rdd_data(n_obs=5000, p=4, fuzzy=True, binary_outcome=False, **kwargs):
    r"""
    Generates synthetic data for a regression discontinuity design (RDD) analysis.

    .. math::
        Y_0 &= g_0 + g_{cov} + \epsilon_0 \\
        Y_1 &= g_1 + g_{cov} + \epsilon_1 \\
        g_0 &= 0.1 \cdot \text{score}^2 \\
        g_1 &= \tau + 0.1 \cdot \text{score}^2 - 0.5 \cdot \text{score}^2
               + a \cdot \text{score} \cdot \sum_{i=1}^{\text{dim\_x}} X_i \\
        g_{cov} &= \sum_{i=1}^{\text{dim\_x}} \text{Polynomial}(X_i) \\
        \epsilon_0, \epsilon_1 &\sim \mathcal{N}(0, 0.2^2)

    where :math:`\text{Polynomial}(x) = \sum_{k=0}^{p} k \cdot x^k`. If ``binary_outcome`` is ``True``,
    the potential outcomes are instead drawn as Bernoulli variables with success probabilities
    :math:`\text{sigmoid}(g_0 + g_{cov})` and :math:`\text{sigmoid}(g_1 + g_{cov})`.

    All random numbers are drawn from the global ``numpy.random`` state.

    Parameters
    ----------
    n_obs : int
        Number of observations to generate. Default is 5000.

    p : int
        Degree of the polynomial for covariates. Default is 4.

    fuzzy : bool
        If True, generates data for a fuzzy RDD, i.e. the received treatment is drawn from a
        Bernoulli distribution whose probability jumps at the cutoff. Otherwise the received
        treatment equals the intended treatment ``score >= cutoff``. Default is True.

    binary_outcome : bool
        If True, generates binary outcomes. Default is False.

    **kwargs : Additional keyword arguments.
        cutoff : float
            The cutoff value for the score. Default is 0.0.
        dim_x : int
            The number of independent covariates. Default is 3.
        a : float
            Factor to control interaction of score and covariates to the outcome equation. Default is 0.0.
        tau : float
            Parameter to control the true effect in the generated data at the given cutoff. Default is 1.0.

        Any other keyword is ignored.

    Returns
    -------
    dict: A dictionary containing the generated data with keys:
        'score' (np.ndarray): The running variable.
        'Y' (np.ndarray): The observed outcome, ``Y0 * (1 - D) + Y1 * D``.
        'D' (np.ndarray): The received (binary) treatment.
        'X' (np.ndarray): The independent covariates of shape ``(n_obs, dim_x)``.
        'oracle_values' (dict): The potential outcomes with keys
            'Y0' (np.ndarray): The potential outcomes without treatment.
            'Y1' (np.ndarray): The potential outcomes with treatment.
    """

    cutoff = kwargs.get('cutoff', 0.0)
    dim_x = kwargs.get('dim_x', 3)
    a = kwargs.get('a', 0.0)
    tau = kwargs.get('tau', 1.0)

    # NOTE: the order of the random draws below is kept fixed so that seeded results stay reproducible.
    score = np.random.normal(size=n_obs)
    # independent covariates
    X = np.random.uniform(size=(n_obs, dim_x), low=-1, high=1)

    # Polynomial with coefficients 0, 1, ..., p evaluated element-wise on all covariates at once
    # (for p == 0 this is identically zero; an empty covariate matrix yields a zero contribution).
    g_cov = np.sum(Polynomial(np.arange(p + 1))(X), axis=1)

    g0 = 0.1 * score**2
    g1 = tau + 0.1 * score**2 - 0.5 * score**2 + a * np.sum(X, axis=1) * score

    eps_scale = 0.2
    # potential outcomes with independent errors
    if not binary_outcome:
        Y0 = g0 + g_cov + np.random.normal(size=n_obs, scale=eps_scale)
        Y1 = g1 + g_cov + np.random.normal(size=n_obs, scale=eps_scale)
    else:
        p_Y0 = 1 / (1 + np.exp(-1.0 * (g0 + g_cov)))
        p_Y1 = 1 / (1 + np.exp(-1.0 * (g1 + g_cov)))
        Y0 = np.random.binomial(n=1, p=p_Y0, size=n_obs)
        Y1 = np.random.binomial(n=1, p=p_Y1, size=n_obs)

    intended_treatment = (score >= cutoff).astype(int)
    if fuzzy:
        prob = 0.3 + 0.4 * intended_treatment + 0.01 * score**2 - 0.02 * score**2 * intended_treatment + 0.2 * g_cov
        # the linear probability model can leave [0, 1]; clip to obtain valid probabilities
        prob = np.clip(prob, 0.0, 1.0)
        D = np.random.binomial(n=1, p=prob, size=n_obs)
    else:
        D = intended_treatment

    D = D.astype(int)
    # observed outcome: switch between the potential outcomes according to the received treatment
    Y = Y0 * (1 - D) + Y1 * D

    oracle_values = {
        'Y0': Y0,
        'Y1': Y1,
    }
    res_dict = {
        'score': score,
        'Y': Y,
        'D': D,
        'X': X,
        'oracle_values': oracle_values,
    }
    return res_dict

0 commit comments

Comments
 (0)