# NOTE: the workspace is intentionally not wiped with rm(list = ls()) here.
# That call only removes objects from the global environment (attached
# packages, options and the working directory all survive), so it does not
# give a clean session, and it silently destroys the caller's objects when
# this script is source()d. Run the script in a fresh R session instead.
library(PKLMtest)
3+
# Fix the RNG state so that everything drawn below is reproducible.
set.seed(42)

# Reference sample: a 500 x 100 matrix of standard-uniform draws, filled
# column by column (each column is one sample of 500 observations).
unif_features <- matrix(runif(500 * 100), nrow = 500, ncol = 100)
8+
#' Empirical cumulative distribution function of a sample
#'
#' Sorts the observations and pairs the k-th smallest one with its cumulative
#' proportion `k / n`, where `n` is the number of non-missing observations.
#'
#' @param data A vector of observations. Missing values are dropped by
#'   `sort()` and do not count towards `n`.
#' @return A list with two vectors of equal length: `x`, the sorted
#'   non-missing observations, and `y`, the matching cumulative proportions.
#'   Both are empty when `data` contains no non-missing value.
empirical_cdf <- function(data) {
  sorted_data <- sort(data)
  # Derive the proportions from the sorted vector rather than from `data`:
  # sort() drops NA/NaN, so using length(data) would leave `x` and `y` with
  # different lengths. seq_along() also yields an empty vector for empty
  # input, where seq(1, 0) would count down and produce c(Inf, NaN).
  y_values <- seq_along(sorted_data) / length(sorted_data)
  list(x = sorted_data, y = y_values)
}
14+
# Generate MCAR data and store the PKLMtest() result of every simulated set.

num_sim <- 500
num_variables <- 5
num_samples <- 100
# With independent per-cell missingness at rate prob_nan, a row is fully
# observed with probability (1 - prob_nan)^num_variables, which equals r.
r <- 0.65
prob_nan <- 1 - r^(1 / num_variables)

results_p_values <- numeric(num_sim)

# seq_len() keeps the loop empty when num_sim is 0, whereas 1:0 would run
# the body for i = 1 and i = 0.
for (i in seq_len(num_sim)) {
  data <- matrix(
    runif(num_variables * num_samples, min = 0, max = 1),
    ncol = num_variables
  )
  # Each cell is masked independently of the data values, with probability
  # prob_nan.
  mask <- matrix(
    runif(num_variables * num_samples) < prob_nan,
    ncol = num_variables
  )
  data[mask] <- NaN
  results_p_values[[i]] <- PKLMtest(
    data,
    num.proj = 100,
    nrep = 30,
    num.trees.per.proj = 200,
    size.resp.set = 2
  )
}
37+
# Empirical CDF of the simulated values stored in results_p_values.
cdf_empirique_p_values <- ecdf(results_p_values)

# Plot
plot(
  cdf_empirique_p_values,
  main = " Cumulative distribution function value of the p-values under H0",
  xlab = " x: p_values under H0",
  ylab = " F(x)",
  col = " black",
  lwd = 2,
  xlim = c(0, 1),
  ylim = c(0, 1)
)

# Overlay the empirical CDF of every column of unif_features as a
# semi-transparent blue curve. Iterating over the actual column count keeps
# the loop in sync with the matrix instead of relying on a hard-coded 100.
for (i in seq_len(ncol(unif_features))) {
  cdf <- empirical_cdf(unif_features[, i])
  lines(cdf$x, cdf$y, col = rgb(0, 0, 1, alpha = 0.2))
}

# Identity line y = x, i.e. the CDF of the Uniform(0, 1) distribution on
# the plotted range.
abline(a = 0, b = 1, col = " red", lwd = 2)
0 commit comments