|
8 | 8 | """ |
9 | 9 |
|
10 | 10 | # %% |
| 11 | +import pandas as pd |
11 | 12 | import numpy as np |
12 | 13 | import matplotlib.pyplot as plt |
13 | 14 |
|
|
18 | 19 | from qolmat.imputations.diffusions.ddpms import TabDDPM, TsDDPM |
19 | 20 |
|
20 | 21 | # %% |
21 | | -# 1. Data |
| 22 | +# 1. Time-series data |
22 | 23 | # --------------------------------------------------------------- |
23 | 24 | # We use the public Beijing Multi-Site Air-Quality Data Set. |
24 | 25 | # It consists of hourly air pollutant data from 12 Chinese nationally-controlled air-quality
25 | 26 | # monitoring sites. The original data from which the features were extracted comes from |
26 | | -# https://archive.ics.uci.edu/static/public/501/beijing+multi+site+air+quality+data.zip |
| 27 | +# https://archive.ics.uci.edu/static/public/501/beijing+multi+site+air+quality+data.zip. |
 | 28 | +# For this tutorial, we only use a small subset of this data:
| 29 | +# 1000 rows and 2 features (TEMP, PRES). |
27 | 30 |
|
28 | 31 | df_data = data.get_data_corrupted("Beijing") |
| 32 | +df_data = df_data[["TEMP", "PRES"]].iloc[:1000] |
| 33 | +df_data.index = df_data.index.set_levels( |
| 34 | + [df_data.index.levels[0], pd.to_datetime(df_data.index.levels[1])] |
| 35 | +) |
29 | 36 |
|
30 | 37 | print("Number of nan at each column:") |
31 | 38 | print(df_data.isna().sum()) |
|
59 | 66 | # * ``print_valid``: a boolean to display/hide a training progress (including epoch_loss, |
60 | 67 | # remaining training duration and performance scores computed by the metrics above). |
61 | 68 |
|
62 | | -df_data_valid = df_data.iloc[:5000] |
| 69 | +df_data_valid = df_data.iloc[:500] |
63 | 70 |
|
64 | 71 | tabddpm = ImputerDiffusion( |
65 | 72 | model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True |
|
71 | 78 |
|
72 | 79 | print(tabddpm.get_summary_architecture()) |
73 | 80 |
|
74 | | - |
75 | 81 | # %% |
76 | 82 | # We also get the summary of the training progress with ``get_summary_training()`` |
77 | 83 |
|
|
144 | 150 | # * ``dim_embedding``: dimension of hidden layers in residual blocks (``int = 128``) |
145 | 151 | # |
146 | 152 | # Let's see an example below. We can observe that a large ``num_sampling`` generally improves
147 | | -# reconstruction errors (mae, wmape) but increases distribution distance (KL_columnwise, |
148 | | -# wasserstein_columnwise). |
| 153 | +# reconstruction errors (mae) but increases distribution distance (KL_columnwise). |
149 | 154 |
|
150 | 155 | dict_imputers = { |
151 | 156 | "num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100), |
152 | | - "num_sampling=20": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100), |
| 157 | + "num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100), |
153 | 158 | } |
154 | 159 |
|
155 | 160 | comparison = comparator.Comparator( |
156 | 161 | dict_imputers, |
157 | 162 | selected_columns=df_data.columns, |
158 | | - generator_holes=missing_patterns.UniformHoleGenerator(n_splits=4), |
159 | | - metrics=["mae", "wmape", "KL_columnwise", "wasserstein_columnwise"], |
160 | | - max_evals=10, |
| 163 | + generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2), |
| 164 | + metrics=["mae", "KL_columnwise"], |
161 | 165 | ) |
162 | | -results = comparison.compare(df_data.iloc[:5000]) |
| 166 | +results = comparison.compare(df_data) |
163 | 167 |
|
164 | 168 | results.groupby(axis=0, level=0).mean().groupby(axis=0, level=0).mean() |
165 | 169 |
|
|
174 | 178 | # and ``freq_str``. |
175 | 179 | # E.g., ``ImputerDiffusion(model=TabDDPM(), index_datetime='datetime', freq_str='1D')``, |
176 | 180 | # |
177 | | -# * ``index_datetime``: the column name of datetime in index. |
 | 181 | +# * ``index_datetime``: the name of the datetime column in the index. It must be a pandas datetime object.
178 | 182 | # |
179 | 183 | # * ``freq_str``: the time-series frequency for splitting data into a list of chunks (each chunk |
180 | 184 | # has the same number of rows). These chunks are fetched in batches.
|
196 | 200 |
|
197 | 201 | dict_imputers = { |
198 | 202 | "tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100), |
199 | | - "TsDDPM": ImputerDiffusion( |
200 | | - model=TsDDPM(num_sampling=5, is_rolling=True), |
| 203 | + "tsddpm": ImputerDiffusion( |
| 204 | + model=TsDDPM(num_sampling=5, is_rolling=False), |
201 | 205 | epochs=10, |
202 | | - batch_size=100, |
203 | | - index_datetime="datetime", |
204 | | - freq_str="10D", |
| 206 | + batch_size=5, |
| 207 | + index_datetime="date", |
| 208 | + freq_str="5D", |
205 | 209 | ), |
206 | 210 | } |
207 | 211 |
|
208 | 212 | comparison = comparator.Comparator( |
209 | 213 | dict_imputers, |
210 | 214 | selected_columns=df_data.columns, |
211 | | - generator_holes=missing_patterns.UniformHoleGenerator(n_splits=4), |
212 | | - metrics=["mae", "wmape", "KL_columnwise", "wasserstein_columnwise"], |
213 | | - max_evals=10, |
| 215 | + generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2), |
| 216 | + metrics=["mae", "KL_columnwise"], |
214 | 217 | ) |
215 | | -results = comparison.compare(df_data.iloc[:5000]) |
| 218 | +results = comparison.compare(df_data) |
216 | 219 |
|
217 | 220 | results.groupby(axis=0, level=0).mean().groupby(axis=0, level=0).mean() |
218 | 221 |
|
|
0 commit comments