Skip to content

Commit 290939e

Browse files
committed
✏️ Fix typo
1 parent f2307da commit 290939e

File tree

9 files changed

+496
-285
lines changed

9 files changed

+496
-285
lines changed

examples/tutorials/plot_tuto_benchmark_TS.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@
7878
ratio_masked = 0.1
7979

8080
imputer_median = imputers.ImputerSimple(groups=("station",), strategy="median")
81-
imputer_interpol = imputers.ImputerInterpolation(groups=("station",), method="linear")
81+
imputer_interpol = imputers.ImputerInterpolation(
82+
groups=("station",), method="linear"
83+
)
8284
imputer_residuals = imputers.ImputerResiduals(
8385
groups=("station",),
8486
period=365,
@@ -103,7 +105,10 @@
103105
)
104106

105107
generator_holes = missing_patterns.EmpiricalHoleGenerator(
106-
n_splits=4, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked
108+
n_splits=4,
109+
groups=("station",),
110+
subset=cols_to_impute,
111+
ratio_masked=ratio_masked,
107112
)
108113

109114
dict_imputers = {
@@ -142,11 +147,17 @@
142147
# Aotizhongxin
143148

144149
df_plot = df[cols_to_impute]
145-
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
150+
dfs_imputed = {
151+
name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()
152+
}
146153
station = "Aotizhongxin"
147154
df_station = df_plot.loc[station]
148-
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
149-
fig, axs = plt.subplots(3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute)))
155+
dfs_imputed_station = {
156+
name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()
157+
}
158+
fig, axs = plt.subplots(
159+
3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute))
160+
)
150161
for col, ax in zip(cols_to_impute, axs.flatten()):
151162
values_orig = df_station[col]
152163
ax.plot(values_orig, ".", color="black", label="original")
@@ -174,7 +185,9 @@
174185
fig = plt.figure(figsize=(10, 10))
175186
i_plot = 1
176187
for i, col in enumerate(cols_to_impute[:-1]):
177-
for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed_station.items()):
188+
for i_imputer, (name_imputer, df_imp) in enumerate(
189+
dfs_imputed_station.items()
190+
):
178191
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
179192
plot.compare_covariances(
180193
df_station,

examples/tutorials/plot_tuto_diffusion_models.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,11 @@
6666
df_data_valid = df_data.iloc[:500]
6767

6868
tabddpm = ImputerDiffusion(
69-
model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True
69+
model=TabDDPM(),
70+
epochs=10,
71+
batch_size=100,
72+
x_valid=df_data_valid,
73+
print_valid=True,
7074
)
7175
tabddpm = tabddpm.fit(df_data)
7276

@@ -150,8 +154,12 @@
150154
# reconstruction errors (mae) but increases distribution distance (KL_columnwise).
151155

152156
dict_imputers = {
153-
"num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
154-
"num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100),
157+
"num_sampling=5": ImputerDiffusion(
158+
model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
159+
),
160+
"num_sampling=10": ImputerDiffusion(
161+
model=TabDDPM(num_sampling=10), epochs=10, batch_size=100
162+
),
155163
}
156164

157165
comparison = comparator.Comparator(
@@ -196,7 +204,9 @@
196204
# but requires a longer training/inference time.
197205

198206
dict_imputers = {
199-
"tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
207+
"tabddpm": ImputerDiffusion(
208+
model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
209+
),
200210
"tsddpm": ImputerDiffusion(
201211
model=TsDDPM(num_sampling=5, is_rolling=False),
202212
epochs=10,

examples/tutorials/plot_tuto_hole_generator.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
It consists of hourly air pollutant data from 12 Chinese nationally-controlled
1515
air-quality monitoring sites.
1616
"""
17+
1718
from typing import List
1819

1920
import matplotlib
@@ -49,7 +50,9 @@
4950
# Missing values are in white, while observed ones are in black.
5051

5152
plt.figure(figsize=(15, 4))
52-
plt.imshow(df.notna().values.T, aspect="auto", cmap="binary", interpolation="none")
53+
plt.imshow(
54+
df.notna().values.T, aspect="auto", cmap="binary", interpolation="none"
55+
)
5356
plt.yticks(range(len(df.columns)), df.columns)
5457
plt.xlabel("Samples", fontsize=12)
5558
plt.grid(False)
@@ -96,7 +99,9 @@ def visualise_missing_values(df_init: pd.DataFrame, df_mask: pd.DataFrame):
9699
colorsList = [(0.9, 0, 0), (0, 0, 0), (0.8, 0.8, 0.8)]
97100
custom_cmap = matplotlib.colors.ListedColormap(colorsList)
98101
plt.figure(figsize=(15, 4))
99-
plt.imshow(df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none")
102+
plt.imshow(
103+
df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none"
104+
)
100105
plt.yticks(range(len(df_tot.columns)), df_tot.columns)
101106
plt.xlabel("Samples", fontsize=12)
102107
plt.grid(False)
@@ -156,7 +161,9 @@ def plot_cdf(
156161
_, axs = plt.subplots(1, df.shape[1], sharey=True, figsize=(15, 3))
157162

158163
hole_sizes_original = get_holes_sizes_column_wise(df.to_numpy())
159-
for ind, (hole_original, col) in enumerate(zip(hole_sizes_original, df.columns)):
164+
for ind, (hole_original, col) in enumerate(
165+
zip(hole_sizes_original, df.columns)
166+
):
160167
sorted_data = np.sort(hole_original)
161168
cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
162169
axs[ind].plot(sorted_data, cdf, c="gray", lw=2, label="original")
@@ -166,7 +173,9 @@ def plot_cdf(
166173
array_mask[array_mask == True] = np.nan
167174
hole_sizes_created = get_holes_sizes_column_wise(array_mask.to_numpy())
168175

169-
for ind, (hole_created, col) in enumerate(zip(hole_sizes_created, df.columns)):
176+
for ind, (hole_created, col) in enumerate(
177+
zip(hole_sizes_created, df.columns)
178+
):
170179
sorted_data = np.sort(hole_created)
171180
cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
172181
axs[ind].plot(sorted_data, cdf, c=color, lw=2, label=label)
@@ -309,7 +318,13 @@ def plot_cdf(
309318

310319
plot_cdf(
311320
df,
312-
[uniform_mask, geometric_mask, empirical_mask, multi_markov_mask, grouped_mask],
321+
[
322+
uniform_mask,
323+
geometric_mask,
324+
empirical_mask,
325+
multi_markov_mask,
326+
grouped_mask,
327+
],
313328
["uniform", "geometric", "empirical", "mutli markov", "grouped"],
314329
["tab:orange", "tab:blue", "tab:green", "tab:pink", "tab:olive"],
315330
)

examples/tutorials/plot_tuto_mcar.py

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,16 @@
7373
df_observed = df.loc[~has_nan]
7474
df_hidden = df.loc[has_nan]
7575

76-
plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
77-
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
76+
plt.scatter(
77+
df_observed["Column 1"],
78+
df_observed[["Column 2"]],
79+
label="Fully observed values",
80+
)
81+
plt.scatter(
82+
df_hidden[["Column 1"]],
83+
df_hidden[["Column 2"]],
84+
label="Values with missing C2",
85+
)
7886

7987
plt.legend(
8088
loc="lower left",
@@ -99,16 +107,26 @@
99107
# Case 2: MAR holes with mean bias (True positive)
100108
# ================================================
101109

102-
df_mask = pd.DataFrame({"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index)
110+
df_mask = pd.DataFrame(
111+
{"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index
112+
)
103113

104114
df_nan = df.where(~df_mask, np.nan)
105115

106116
has_nan = df_mask.any(axis=1)
107117
df_observed = df.loc[~has_nan]
108118
df_hidden = df.loc[has_nan]
109119

110-
plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
111-
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
120+
plt.scatter(
121+
df_observed["Column 1"],
122+
df_observed[["Column 2"]],
123+
label="Fully observed values",
124+
)
125+
plt.scatter(
126+
df_hidden[["Column 1"]],
127+
df_hidden[["Column 2"]],
128+
label="Values with missing C2",
129+
)
112130

113131
plt.legend(
114132
loc="lower left",
@@ -139,7 +157,8 @@
139157
# MAR but the means between missing patterns are not statistically different.
140158

141159
df_mask = pd.DataFrame(
142-
{"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index
160+
{"Column 1": False, "Column 2": df["Column 1"].abs() > q975},
161+
index=df.index,
143162
)
144163

145164
df_nan = df.where(~df_mask, np.nan)
@@ -148,8 +167,16 @@
148167
df_observed = df.loc[~has_nan]
149168
df_hidden = df.loc[has_nan]
150169

151-
plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
152-
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
170+
plt.scatter(
171+
df_observed["Column 1"],
172+
df_observed[["Column 2"]],
173+
label="Fully observed values",
174+
)
175+
plt.scatter(
176+
df_hidden[["Column 1"]],
177+
df_hidden[["Column 2"]],
178+
label="Values with missing C2",
179+
)
153180

154181
plt.legend(
155182
loc="lower left",
@@ -182,7 +209,7 @@
182209
#
183210
# We also note that Little's test does not handle categorical data or temporally
184211
# correlated data.
185-
#
212+
#
186213
# This is why we have implemented the PKLM test, which makes up for the shortcomings of Little's
187214
# test. We present this test in more detail in the next section.
188215

@@ -277,21 +304,18 @@
277304
col1 = rng.rand(n_rows) * 100
278305
col2 = rng.randint(1, 100, n_rows)
279306
col3 = rng.choice([True, False], n_rows)
280-
modalities = ['A', 'B', 'C', 'D']
307+
modalities = ["A", "B", "C", "D"]
281308
col4 = rng.choice(modalities, n_rows)
282309

283-
df = pd.DataFrame({
284-
'Numeric1': col1,
285-
'Numeric2': col2,
286-
'Boolean': col3,
287-
'Object': col4
288-
})
310+
df = pd.DataFrame(
311+
{"Numeric1": col1, "Numeric2": col2, "Boolean": col3, "Object": col4}
312+
)
289313

290314
hole_gen = UniformHoleGenerator(
291315
n_splits=1,
292316
ratio_masked=0.2,
293-
subset=['Numeric1', 'Numeric2', 'Boolean', 'Object'],
294-
random_state=rng
317+
subset=["Numeric1", "Numeric2", "Boolean", "Object"],
318+
random_state=rng,
295319
)
296320
df_mask = hole_gen.generate_mask(df)
297321
df_nan = df.where(~df_mask, np.nan)
@@ -328,9 +352,11 @@
328352
data = rng.multivariate_normal(
329353
mean=[0, 0, 0, 0],
330354
cov=[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
331-
size=400
355+
size=400,
356+
)
357+
df = pd.DataFrame(
358+
data=data, columns=["Column 1", "Column 2", "Column 3", "Column 4"]
332359
)
333-
df = pd.DataFrame(data=data, columns=["Column 1", "Column 2", "Column 3", "Column 4"])
334360

335361
df_mask = pd.DataFrame(
336362
{
@@ -339,7 +365,7 @@
339365
"Column 3": False,
340366
"Column 4": False,
341367
},
342-
index=df.index
368+
index=df.index,
343369
)
344370
df_nan = df.where(~df_mask, np.nan)
345371

@@ -360,7 +386,9 @@
360386

361387
# %%
362388
for col_index, partial_p_v in enumerate(partial_p_values):
363-
print(f"The partial p-value for the column index {col_index + 1} is: {partial_p_v:.2%}")
389+
print(
390+
f"The partial p-value for the column index {col_index + 1} is: {partial_p_v:.2%}"
391+
)
364392

365393
# %%
366394
# As a result, by removing the missing patterns induced by variable 2, the p-value rises
@@ -380,4 +408,4 @@
380408
# | 10000 | 6 | 20"54 |
381409
# | 10000 | 10 | 14"48 |
382410
# | 100000 | 10 | 4'51" |
383-
# | 100000 | 15 | 3'06" |
411+
# | 100000 | 15 | 3'06" |

0 commit comments

Comments
 (0)