"""
===============================================
Tutorial for imputers based on diffusion models
===============================================

In this tutorial, we show how to use the :class:`~qolmat.imputations.diffusions.ddpms.TabDDPM`
and :class:`~qolmat.imputations.diffusions.ddpms.TsDDPM` classes.
"""

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from qolmat.utils import data
from qolmat.benchmark import comparator, missing_patterns

from qolmat.imputations.imputers_pytorch import ImputerDiffusion
from qolmat.imputations.diffusions.ddpms import TabDDPM, TsDDPM

# %%
# 1. Time-series data
# ---------------------------------------------------------------
# We use the public Beijing Multi-Site Air-Quality Data Set.
# It consists of hourly air pollutant data from 12 Chinese nationally controlled air-quality
# monitoring sites. The original data from which the features were extracted comes from
# https://archive.ics.uci.edu/static/public/501/beijing+multi+site+air+quality+data.zip.
# For this tutorial, we only use a small subset of this data:
# 1000 rows and 2 features (TEMP, PRES).

df_data = data.get_data_corrupted("Beijing")
df_data = df_data[["TEMP", "PRES"]].iloc[:1000]
df_data.index = df_data.index.set_levels(
    [df_data.index.levels[0], pd.to_datetime(df_data.index.levels[1])]
)

print("Number of NaN values in each column:")
print(df_data.isna().sum())

# %%
# 2. Hyperparameters for the wrapper ImputerDiffusion
# ---------------------------------------------------------------
# We use the wrapper :class:`~qolmat.imputations.imputers_pytorch.ImputerDiffusion` for our
# diffusion models (e.g., :class:`~qolmat.imputations.diffusions.ddpms.TabDDPM`,
# :class:`~qolmat.imputations.diffusions.ddpms.TsDDPM`). The most important hyperparameter
# is ``model``, where we select a diffusion base model for the imputation task
# (e.g., ``model=TabDDPM()``).
# Other hyperparameters control the training of the selected diffusion model:
#
# * ``cols_imputed``: the list of columns to be imputed. Recall that we train the model on
#   incomplete data using a self-supervised learning method, so we can choose which columns
#   are masked during training. Its default value is ``None``; see the sketch after this list.
#
# * ``epochs``: the number of training iterations; its default value is ``epochs=10``. In
#   practice, we should set a larger number of epochs, e.g., ``epochs=100``.
#
# * ``batch_size``: the batch size; its default value is ``batch_size=100``.
#
# The following hyperparameters are for validation:
#
# * ``x_valid``: a validation set.
#
# * ``metrics_valid``: a list of validation metrics (see all metrics in :doc:`imputers`). Its
#   default value is ``metrics_valid=(metrics.mean_absolute_error, metrics.dist_wasserstein)``.
#
# * ``print_valid``: a boolean to display/hide the training progress (including the epoch loss,
#   the remaining training time, and the performance scores computed with the metrics above).

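# %%
# For example, to restrict the self-supervised masking to the TEMP column, we could write the
# following (a sketch based on the ``cols_imputed`` argument described above; we only
# instantiate the imputer here, without fitting it):

imputer_temp_only = ImputerDiffusion(model=TabDDPM(), cols_imputed=["TEMP"])

# %%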
df_data_valid = df_data.iloc[:500]

tabddpm = ImputerDiffusion(
    model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True
)
tabddpm = tabddpm.fit(df_data)

# %%
# We can inspect the architecture of the TabDDPM model with ``get_summary_architecture()``.

print(tabddpm.get_summary_architecture())

# %%
# We can also get a summary of the training progress with ``get_summary_training()``.

summary = tabddpm.get_summary_training()

print(f"Performance metrics: {list(summary.keys())}")

metric = "mean_absolute_error"
metric_scores = summary[metric]

fig, ax = plt.subplots()
ax.plot(range(len(metric_scores)), metric_scores)
ax.set_xlabel("Epoch")
ax.set_ylabel(metric)

plt.show()

# %%
# We display the imputed values for the variable TEMP at the first station.

df_imputed = tabddpm.transform(df_data)

station = df_data.index.get_level_values("station")[0]
col = "TEMP"

values_orig = df_data.loc[station, col]
values_imp = df_imputed.loc[station, col].copy()

# Only show the imputed values, i.e. the entries that are missing in the original data
values_imp[values_orig.notna()] = np.nan

fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(values_orig, ".", color="black", label="original")
ax.plot(values_imp, ".", color="blue", label="TabDDPM")
ax.set_ylabel(col, fontsize=10)
ax.legend(loc=[1.01, 0], fontsize=10)
ax.tick_params(axis="both", which="major", labelsize=10)
plt.show()

# %%
# 3. Hyperparameters for TabDDPM
# ---------------------------------------------------------------
# :class:`~qolmat.imputations.diffusions.ddpms.TabDDPM` is a diffusion model based on
# Denoising Diffusion Probabilistic Models [1] for imputing tabular data. Several important
# hyperparameters are
#
# * ``num_noise_steps``: the number of steps in the forward/reverse process.
#   It corresponds to T in Equation 1 of [1]. Its default value is ``num_noise_steps=50``.
#   Note that a larger value can improve imputation quality but also increases inference time.
#
# * ``beta_start`` and ``beta_end``: the minimum and maximum values
#   of the linear variance schedule (Equation 2 of [1]).
#   Their default values are ``beta_start=1e-4`` and ``beta_end=0.02``.
#
# * ``num_sampling``: for each missing value, the model generates n imputation variants
#   and returns the mean value of these variants.
#   Based on our experiments, a large n (n > 5) often improves reconstruction scores (e.g., MAE).
#   Its default value is ``num_sampling=1``.
#
# * ``ratio_nan``: the ratio of observed values that are randomly masked during
#   self-supervised training. Its default value is ``ratio_nan=0.1``.
#
# Other hyperparameters for building this deep learning model are shown in the sketch
# after this list:
#
# * ``lr``: learning rate (``float = 0.001``)
#
# * ``num_blocks``: number of residual blocks (``int = 1``)
#
# * ``dim_embedding``: dimension of hidden layers in residual blocks (``int = 128``)

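# %%
# As a quick sketch, the hyperparameters above can be combined when constructing the model.
# The values below are illustrative, drawn from the defaults listed in this section rather
# than tuned settings:

tabddpm_custom = TabDDPM(
    num_noise_steps=50,  # T in Equation 1 of [1]
    beta_start=1e-4,  # start of the linear variance schedule
    beta_end=0.02,  # end of the linear variance schedule
    num_sampling=5,  # average 5 imputation variants per missing value
    ratio_nan=0.1,  # ratio of observed values masked for self-supervision
    lr=0.001,  # learning rate
    num_blocks=1,  # number of residual blocks
    dim_embedding=128,  # dimension of hidden layers in residual blocks
)
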
# %%
# Let's see an example below. We can observe that a large ``num_sampling`` generally improves
# reconstruction errors (MAE) but increases the distribution distance (KL_columnwise).

dict_imputers = {
    "num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
    "num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100),
}

comparison = comparator.Comparator(
    dict_imputers,
    selected_columns=df_data.columns,
    generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2),
    metrics=["mae", "KL_columnwise"],
)
results = comparison.compare(df_data)

results.groupby(level=0).mean()

# %%
# 4. Hyperparameters for TsDDPM
# ---------------------------------------------------------------
# :class:`~qolmat.imputations.diffusions.ddpms.TsDDPM` is built on top of
# :class:`~qolmat.imputations.diffusions.ddpms.TabDDPM` to capture time-based relationships
# between data points in a dataset.
#
# Two important hyperparameters for processing time-series data are ``index_datetime``
# and ``freq_str``,
# e.g., ``ImputerDiffusion(model=TsDDPM(), index_datetime='datetime', freq_str='1D')``.
#
# * ``index_datetime``: the name of the datetime column in the index. It must be a pandas
#   datetime object.
#
# * ``freq_str``: the time-series frequency used to split the data into a list of chunks
#   (each chunk has the same number of rows). These chunks are then fetched in batches.
#   A large frequency, e.g., ``6M`` or ``1Y``, can cause out-of-memory errors.
#   Its default value is ``freq_str="1D"``. The available time-series frequencies are listed
#   in this `link <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
#
# For TsDDPM, we have two options for splitting the data; a small pandas sketch contrasting
# them follows this list.
#
# * ``is_rolling=False`` (default value): the data is split by using
#   ``pandas.DataFrame.resample(rule=freq_str)``. There is no duplication of rows between
#   chunks, leading to fewer chunks than rows in the original data.
#
# * ``is_rolling=True``: the data is split by using ``pandas.DataFrame.rolling(window=freq_str)``.
#   The number of chunks then equals the number of rows in the original data.
#   Note that setting ``is_rolling=True`` generally produces better imputation quality,
#   but requires a longer training/inference time.

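# %%
# To build intuition, here is a small pandas-only sketch contrasting the two splitting
# strategies on a toy daily index. TsDDPM performs an analogous split internally; the toy
# data frame below is only for illustration.

df_toy = pd.DataFrame(
    {"x": np.arange(10.0)},
    index=pd.date_range("2020-01-01", periods=10, freq="1D"),
)
# resample("5D"): disjoint 5-day chunks -> 2 chunks for 10 rows
print(df_toy.resample("5D").mean())
# rolling("5D"): one trailing 5-day window per row -> 10 overlapping chunks
print(df_toy.rolling("5D").mean())

# %%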

dict_imputers = {
    "tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
    "tsddpm": ImputerDiffusion(
        model=TsDDPM(num_sampling=5, is_rolling=False),
        epochs=10,
        batch_size=5,
        index_datetime="date",
        freq_str="5D",
    ),
}

comparison = comparator.Comparator(
    dict_imputers,
    selected_columns=df_data.columns,
    generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2),
    metrics=["mae", "KL_columnwise"],
)
results = comparison.compare(df_data)

results.groupby(level=0).mean()

# %%
# [1] Ho, Jonathan, Ajay Jain, and Pieter Abbeel. `Denoising diffusion probabilistic models.
# <https://arxiv.org/abs/2006.11239>`_
# Advances in Neural Information Processing Systems 33 (2020): 6840-6851.