@@ -450,14 +450,18 @@ def generate_event_study_data(
450450 unit_fe_sigma : float = 1.0 ,
451451 time_fe_sigma : float = 0.5 ,
452452 noise_sigma : float = 0.2 ,
453+ predictor_effects : dict [str , float ] | None = None ,
454+ ar_phi : float = 0.9 ,
455+ ar_scale : float = 1.0 ,
453456 seed : int | None = None ,
454457) -> pd .DataFrame :
455458 """
456459 Generate synthetic panel data for event study / dynamic DiD analysis.
457460
458461 Creates panel data with unit and time fixed effects, where a fraction of units
459462 receive treatment at a common treatment time. Treatment effects can vary by
460- event time (time relative to treatment).
463+ event time (time relative to treatment). Optionally includes time-varying
464+ predictor variables generated via AR(1) processes.
461465
462466 Parameters
463467 ----------
@@ -482,6 +486,19 @@ def generate_event_study_data(
482486 Standard deviation for time fixed effects. Default 0.5.
483487 noise_sigma : float
484488 Standard deviation for observation noise. Default 0.2.
489+ predictor_effects : dict[str, float], optional
490+ Dictionary mapping predictor names to their true coefficients.
491+ Each predictor is generated as an AR(1) time series that varies over time
492+ but is the same for all units at a given time. For example,
493+ ``{'temperature': 0.3, 'humidity': -0.1}`` creates two predictors.
494+ Default None (no predictors).
495+ ar_phi : float
496+ AR(1) autoregressive coefficient controlling persistence of predictors.
497+ Values closer to 1 produce smoother, more persistent series.
498+ Default 0.9.
499+ ar_scale : float
500+ Standard deviation of the AR(1) innovation noise for predictors.
501+ Default 1.0.
485502 seed : int, optional
486503 Random seed for reproducibility.
487504
@@ -494,6 +511,7 @@ def generate_event_study_data(
494511 - y: Outcome variable
495512 - treat_time: Treatment time for unit (NaN if never treated)
496513 - treated: Whether unit is in treated group (0 or 1)
514+ - <predictor_name>: One column per predictor (if predictor_effects provided)
497515
498516 Example
499517 --------
@@ -505,6 +523,20 @@ def generate_event_study_data(
505523 (400, 5)
506524 >>> df.columns.tolist()
507525 ['unit', 'time', 'y', 'treat_time', 'treated']
526+
527+ With predictors:
528+
529+ >>> df = generate_event_study_data(
530+ ... n_units=10,
531+ ... n_time=10,
532+ ... treatment_time=5,
533+ ... seed=42,
534+ ... predictor_effects={"temperature": 0.3, "humidity": -0.1},
535+ ... )
536+ >>> df.shape
537+ (100, 7)
538+ >>> "temperature" in df.columns and "humidity" in df.columns
539+ True
508540 """
509541 if seed is not None :
510542 np .random .seed (seed )
@@ -529,6 +561,16 @@ def generate_event_study_data(
529561 # Generate time fixed effects
530562 time_fe = np .random .normal (0 , time_fe_sigma , n_time )
531563
564+ # Generate predictor time series (if any)
565+ # Each predictor is an AR(1) series that varies over time but is the same
566+ # for all units at a given time
567+ predictors : dict [str , np .ndarray ] = {}
568+ if predictor_effects is not None :
569+ for predictor_name in predictor_effects :
570+ predictors [predictor_name ] = generate_ar1_series (
571+ n = n_time , phi = ar_phi , scale = ar_scale
572+ )
573+
532574 # Build panel data
533575 data = []
534576 for unit in range (n_units ):
@@ -539,6 +581,11 @@ def generate_event_study_data(
539581 # Base outcome: unit FE + time FE + noise
540582 y = unit_fe [unit ] + time_fe [t ] + np .random .normal (0 , noise_sigma )
541583
584+ # Add predictor contributions to outcome
585+ if predictor_effects is not None :
586+ for predictor_name , coef in predictor_effects .items ():
587+ y += coef * predictors [predictor_name ][t ]
588+
542589 # Add treatment effect for treated units in event window
543590 if is_treated :
544591 event_time = t - treatment_time
@@ -548,15 +595,18 @@ def generate_event_study_data(
548595 ):
549596 y += treatment_effects [event_time ]
550597
551- data .append (
552- {
553- "unit" : unit ,
554- "time" : t ,
555- "y" : y ,
556- "treat_time" : unit_treat_time ,
557- "treated" : 1 if is_treated else 0 ,
558- }
559- )
598+ row = {
599+ "unit" : unit ,
600+ "time" : t ,
601+ "y" : y ,
602+ "treat_time" : unit_treat_time ,
603+ "treated" : 1 if is_treated else 0 ,
604+ }
605+ # Add predictor values to the row
606+ for predictor_name , series in predictors .items ():
607+ row [predictor_name ] = series [t ]
608+
609+ data .append (row )
560610
561611 return pd .DataFrame (data )
562612
@@ -566,6 +616,52 @@ def generate_event_study_data(
566616# -----------------
567617
568618
619+ def generate_ar1_series (
620+ n : int ,
621+ phi : float = 0.9 ,
622+ scale : float = 1.0 ,
623+ initial : float = 0.0 ,
624+ ) -> np .ndarray :
625+ """
626+ Generate an AR(1) autoregressive time series.
627+
628+ The AR(1) process is defined as:
629+ x_{t+1} = phi * x_t + eta_t, where eta_t ~ N(0, scale^2)
630+
631+ Parameters
632+ ----------
633+ n : int
634+ Length of the time series to generate.
635+ phi : float
636+ Autoregressive coefficient controlling persistence. Values closer to 1
637+ produce smoother, more persistent series. Must be in (-1, 1) for
638+ stationarity. Default 0.9.
639+ scale : float
640+ Standard deviation of the innovation noise. Default 1.0.
641+ initial : float
642+ Initial value of the series. Default 0.0.
643+
644+ Returns
645+ -------
646+ np.ndarray
647+ Array of length n containing the AR(1) time series.
648+
649+ Example
650+ -------
651+ >>> from causalpy.data.simulate_data import generate_ar1_series
652+ >>> np.random.seed(42)
653+ >>> series = generate_ar1_series(n=10, phi=0.9, scale=0.5)
654+ >>> len(series)
655+ 10
656+ """
657+ series = np .zeros (n )
658+ series [0 ] = initial
659+ innovations = np .random .normal (0 , scale , n - 1 )
660+ for t in range (1 , n ):
661+ series [t ] = phi * series [t - 1 ] + innovations [t - 1 ]
662+ return series
663+
664+
569665def generate_seasonality (
570666 n : int = 12 , amplitude : int = 1 , length_scale : float = 0.5
571667) -> np .ndarray :
0 commit comments