Commit 6b58c02

Merge pull request #89 from Quantmetry/dev
Dev
2 parents 5ee0f34 + f0c26ab commit 6b58c02

25 files changed: 471 additions, 103 deletions

.github/workflows/test.yml

Lines changed: 5 additions & 7 deletions
@@ -1,20 +1,22 @@
-name: Unit test Qolmat
+name: Unit tests

 on:
   push:
     branches:
       - dev
       - main
   pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
   workflow_dispatch:

 jobs:
   build-linux:
+    if: github.event.pull_request.draft == false
     runs-on: ${{matrix.os}}
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest]
-        python-version: [3.8, 3.9]
+        python-version: ['3.8', '3.9', '3.10', '3.11']
     defaults:
       run:
         shell: bash -l {0}
@@ -27,16 +29,12 @@ jobs:
       with:
         python-version: ${{matrix.python-version}}
         environment-file: environment.ci.yml
-        channels: default, conda-forge
     - name: Lint with flake8
       run: |
-        conda install flake8
         flake8
     - name: Test with pytest
       run: |
-        conda install pytest
-        pytest
-        echo you should uncomment pytest and delete this line
+        make coverage
     - name: typing with mypy
       run: |
         mypy qolmat

.github/workflows/test_quick.yml

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+name: Unit tests fast
+
+on:
+  push:
+    branches-ignore:
+      - dev
+      - main
+  workflow_dispatch:
+
+jobs:
+  basic-testing:
+    runs-on: ${{matrix.os}}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: [3.8]
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+    - name: Git clone
+      uses: actions/checkout@v3
+
+    # See caching environments
+    # https://github.com/conda-incubator/setup-miniconda#caching-environments
+    - name: Setup Mambaforge
+      uses: conda-incubator/setup-miniconda@v2
+      with:
+        miniforge-variant: Mambaforge
+        miniforge-version: latest
+        activate-environment: env_qolmat_ci
+        use-mamba: true
+
+    - name: Get Date
+      id: get-date
+      run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
+
+    - name: Cache Conda env
+      uses: actions/cache@v2
+      with:
+        path: ${{ env.CONDA }}/envs
+        key:
+          conda-${{ runner.os }}--${{ runner.arch }}--${{
+          steps.get-date.outputs.today }}-${{
+          hashFiles('environment.ci.yml') }}-${{ env.CACHE_NUMBER
+          }}
+      env:
+        # Increase this value to reset cache if environment.ci.yml has not changed
+        CACHE_NUMBER: 0
+      id: cache
+
+    - name: Update environment
+      run: mamba env update -n env_qolmat_ci -f environment.ci.yml
+      if: steps.cache.outputs.cache-hit != 'true'
+
+    - name: Lint with flake8
+      run: |
+        flake8
+    - name: Test with pytest
+      run: |
+        make coverage
+    - name: Test docstrings
+      run: make doctest
+    - name: typing with mypy
+      run: |
+        mypy qolmat
+        echo you should uncomment mypy qolmat and delete this line

.readthedocs.yml

Lines changed: 5 additions & 2 deletions
@@ -1,13 +1,16 @@
 version: 2

 build:
-  image: latest
+  os: "ubuntu-22.04"
+  tools:
+    python: "mambaforge-22.9"

 python:
-  version: 3.8
   install:
     - method: pip
       path: .
+      extra_requirements:
+        - pytorch

 conda:
   environment: environment.doc.yml

CONTRIBUTING.rst

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ You can create a virtual environment via `conda`:
     $ conda env create -f environment.dev.yml
     $ conda activate env_qolmat_dev

-If you need to use tensorflow, enter the command:
+If you need to use pytorch, enter the command:

 .. code:: sh


HISTORY.rst

Lines changed: 5 additions & 0 deletions
@@ -2,6 +2,11 @@
 History
 =======

+0.1.1 (2023-??-??)
+-------------------
+
+* Hotfix reference to tensorflow in the documentation, when it should be pytorch
+
 0.1.0 (2023-10-11)
 -------------------


Makefile

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+coverage:
+	pytest --cov-branch --cov=qolmat --cov-report=xml
+
+doctest:
+	pytest --doctest-modules --pyargs qolmat
+
+doc:
+	make html -C docs
+
+clean:
+	rm -rf .mypy_cache .pytest_cache .coverage*
+	rm -rf **__pycache__
+	make clean -C docs

README.rst

Lines changed: 4 additions & 4 deletions
@@ -47,7 +47,7 @@ Qolmat can be installed in different ways:
 .. code:: sh

     $ pip install qolmat  # installation via `pip`
-    $ pip install qolmat[tensorflow]  # if you need tensorflow
+    $ pip install qolmat[pytorch]  # if you need ImputerDiffusion relying on pytorch
     $ pip install git+https://github.com/Quantmetry/qolmat  # or directly from the github repository

 ⚡️ Quickstart
@@ -105,8 +105,8 @@ The full documentation can be found `on this link <https://qolmat.readthedocs.io

 **How does Qolmat work ?**

-Qolmat allows model selection for scikit-learn compatible imputation algorithms, by performing three steps pictured below:
-1) For each of the K folds, Qolmat artificially masks a set of observed values using a default or user specified `hole generator <explanation.html#hole-generator>`_,
+| Qolmat allows model selection for scikit-learn compatible imputation algorithms, by performing three steps pictured below:
+1) For each of the K folds, Qolmat artificially masks a set of observed values using a default or user specified `hole generator <explanation.html#hole-generator>`_.
 2) For each fold and each compared `imputation method <imputers.html>`_, Qolmat fills both the missing and the masked values, then computes each of the default or user specified `performance metrics <explanation.html#metrics>`_.
 3) For each compared imputer, Qolmat pools the computed metrics from the K folds into a single value.
@@ -117,7 +117,7 @@ This is very similar in spirit to the `cross_val_score <https://scikit-learn.org

 **Imputation methods**

-The following table contains the available imputation methods. We distinguish single imputation methods (aiming for pointwise accuracy, mostly deterministic) from multiple imputation methods (aiming for distribution similarity, mostly stochastic).
+The following table contains the available imputation methods. We distinguish single imputation methods (aiming for pointwise accuracy, mostly deterministic) from multiple imputation methods (aiming for distribution similarity, mostly stochastic). For further details regarding the distinction between single and multiple imputation, you can refer to the `Imputation article <https://en.wikipedia.org/wiki/Imputation_(statistics)>`_ on Wikipedia.

 .. list-table::
    :widths: 25 70 15 15
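
The three-step procedure above maps onto the benchmark API this PR exercises in CI. Below is a minimal sketch following the repository's Quickstart; the imputer classes, `EmpiricalHoleGenerator`, the `Comparator` signature, and the metric names are taken from the README at the time of this release and should be treated as assumptions on other versions.

.. code:: python

    import numpy as np
    import pandas as pd

    from qolmat.benchmark import comparator, missing_patterns
    from qolmat.imputations import imputers

    # Toy dataframe with roughly 10% of values missing
    rng = np.random.default_rng(42)
    df_data = pd.DataFrame(rng.normal(size=(500, 3)), columns=["TEMP", "PRES", "WSPM"])
    df_data = df_data.mask(rng.random(df_data.shape) < 0.1)

    # Imputers to compare (scikit-learn compatible fit/transform interface)
    dict_imputers = {
        "mean": imputers.ImputerMean(),
        "interpolation": imputers.ImputerInterpolation(method="linear"),
    }

    # Step 1: the hole generator masks observed values on each of the K folds
    generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)

    # Steps 2 and 3: impute each fold with each method, score it, pool over folds
    comparison = comparator.Comparator(
        dict_imputers,
        list(df_data.columns),
        generator_holes=generator_holes,
        metrics=["mae", "wmape"],
    )
    results = comparison.compare(df_data)  # one pooled score per (metric, column) and imputer
    print(results)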

docs/api.rst

Lines changed: 21 additions & 0 deletions
@@ -93,3 +93,24 @@ EM engine

     imputations.em_sampler.MultiNormalEM
     imputations.em_sampler.VARpEM
+
+Diffusion engine
+================
+
+.. autosummary::
+    :toctree: generated/
+    :template: class.rst
+
+    imputations.imputers_pytorch.ImputerDiffusion
+    imputations.diffusions.ddpms.TabDDPM
+    imputations.diffusions.ddpms.TsDDPM
+
+
+Utils
+================
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    utils.data.add_holes

docs/explanation.rst

Lines changed: 7 additions & 4 deletions
@@ -99,7 +99,7 @@ We compute the associated complete dataset :math:`\hat{X}^{(k)}` for the partial
 -----------------

 Evaluating the imputers requires generating holes that are representative of the holes at hand.
-The missingness mechanisms have been classified by Rubin [1] into MCAR, MAR and MNAR.
+The missingness mechanisms have been classified by :ref:`Rubin [1]<rubin-article>` into MCAR, MAR and MNAR.

 Suppose we have :math:`X_{obs}`, a subset of a complete data model :math:`X = (X_{obs}, X_{mis})`, which is not fully observable (:math:`X_{mis}` is the missing part).
 We define the matrix :math:`M` such that :math:`M_{ij}=1` if :math:`X_{ij}` is missing, and 0 otherwise, and we assume the distribution of :math:`M` is parametrised by :math:`\psi`.
@@ -108,14 +108,14 @@ The observations are said to be Missing Completely at Random (MCAR) if the proba
 Formally,

 .. math::
-    P(M | X_{obs}, X_{mis}, \psi) = P(M, \psi), \quad \forall \psi.
+    P(M | X_{obs}, X_{mis}, \psi) = P(M | \psi), \quad \forall \psi.

 The observations are said to be Missing at Random (MAR) if the probability of an observation to be missing only depends on the observed values. Formally,

 .. math::
     P(M | X_{obs}, X_{mis}, \psi) = P(M | X_{obs}, \psi), \quad \forall \psi, X_{mis}.

-Finally, the observations are said to be Missing Not at Random (MNAR) in all other cases, i.e. if P(M | X_{obs}, X_{mis}, \psi) does not simplify.
+Finally, the observations are said to be Missing Not at Random (MNAR) in all other cases, i.e. if :math:`P(M | X_{obs}, X_{mis}, \psi)` does not simplify.

 Qolmat allows generating new missing values on an existing dataset, but only in the MCAR case.

@@ -140,4 +140,7 @@ Qolmat can be used to search for hyperparameters in imputation functions. Let sa

 References
 ----------
-[1] Rubin, Donald B. `Inference and missing data. <https://www.math.wsu.edu/faculty/xchen/stat115/lectureNotes3/Rubin%20Inference%20and%20Missing%20Data.pdf>`_ Biometrika 63.3 (1976): 581-592.
+
+.. _rubin-article:
+
+[1] Rubin, Donald B. `Inference and missing data. <https://www.math.wsu.edu/faculty/xchen/stat115/lectureNotes3/Rubin%20Inference%20and%20Missing%20Data.pdf>`_ Biometrika 63.3 (1976): 581-592.
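
To make the MCAR setting above concrete, here is a minimal sketch of hole generation with Qolmat's benchmark module; `UniformHoleGenerator` and its `split` method returning a list of boolean masks follow `qolmat.benchmark.missing_patterns` at this release, and are assumptions otherwise.

.. code:: python

    import numpy as np
    import pandas as pd

    from qolmat.benchmark import missing_patterns

    # Complete toy dataset
    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.normal(size=(200, 2)), columns=["x", "y"])

    # MCAR masking: every entry is hidden with the same probability,
    # independently of X_obs and X_mis, i.e. P(M | X_obs, X_mis, psi) = P(M | psi)
    generator = missing_patterns.UniformHoleGenerator(n_splits=2, ratio_masked=0.1)
    for df_mask in generator.split(df):
        df_incomplete = df.where(~df_mask)   # NaN where the mask is True
        print(df_incomplete.isna().mean())   # close to 10% holes per column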

docs/imputers.rst

Lines changed: 2 additions & 2 deletions
@@ -98,14 +98,14 @@ Two parametric distributions are implemented:
 9. TabDDPM
 -----------

-:class:`qolmat.diffusions.TabDDPM` is a deep learning imputer based on Denoising Diffusion Probabilistic Models (DDPMs) [7] for handling multivariate tabular data. Our implementation mainly follows the works of [8, 9]. Diffusion models focus on modeling the process of data transitions from noisy and incomplete observations to the underlying true data. They include two main processes:
+:class:`~qolmat.imputations.diffusions.ddpms.TabDDPM` is a deep learning imputer based on Denoising Diffusion Probabilistic Models (DDPMs) [7] for handling multivariate tabular data. Our implementation mainly follows the works of [8, 9]. Diffusion models focus on modeling the process of data transitions from noisy and incomplete observations to the underlying true data. They include two main processes:

 * The forward process perturbs the observed data with noise until all the original data structure is lost. The perturbation is done over a series of steps. Let :math:`X_{obs}` be the observed data and :math:`T` be the number of steps over which noise :math:`\epsilon \sim \mathcal{N}(0,I)` is added to the observed data, so that :math:`X_{obs}^t = \sqrt{\bar{\alpha}_t} \times X_{obs} + \sqrt{1-\bar{\alpha}_t} \times \epsilon`, where :math:`\bar{\alpha}_t` controls the amount of noise.
 * The reverse process removes noise and reconstructs the observed data. At each step :math:`t`, we train an autoencoder :math:`\epsilon_\theta` based on ResNet [9] to predict the added noise :math:`\epsilon_t` based on the rest of the observed data. The objective function is the error between the noise added in the forward process and the noise predicted by :math:`\epsilon_\theta`.

 In the training phase, we use the self-supervised learning method of [8] to train on incomplete data. In detail, our model randomly masks a part of the observed data and computes the loss on these masked data. Moving on to the inference phase, (1) missing data are replaced by Gaussian noise :math:`\epsilon \sim \mathcal{N}(0,I)`, and (2) at each noise step from :math:`T` to 0, our model denoises these missing data based on :math:`\epsilon_\theta`.

-In the case of time-series data, we also propose :class:`qolmat.diffusions.TabDDPMTS` (built on top of :class:`qolmat.diffusions.TabDDPM`) to capture time-based relationships between data points in a dataset. In fact, the dataset is pre-processed using a sliding-window method to obtain a set of data partitions. The noise prediction of the model :math:`\epsilon_\theta` takes into account not only the observed data at the current time step but also data from previous time steps. These time-based relationships are encoded using a transformer-based architecture [8].
+In the case of time-series data, we also propose :class:`~qolmat.imputations.diffusions.ddpms.TsDDPM` (built on top of :class:`~qolmat.imputations.diffusions.ddpms.TabDDPM`) to capture time-based relationships between data points in a dataset. In fact, the dataset is pre-processed using a sliding-window method to obtain a set of data partitions. The noise prediction of the model :math:`\epsilon_\theta` takes into account not only the observed data at the current time step but also data from previous time steps. These time-based relationships are encoded using a transformer-based architecture [8].

 References
 ----------
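
The diffusion classes referenced here are exposed behind a scikit-learn style imputer. A minimal sketch, assuming the constructor parameters shown in the `ImputerDiffusion` and `TabDDPM` docstrings added in this PR (`num_sampling`, `epochs`, `batch_size` are assumptions on other versions); the `pytorch` extra must be installed.

.. code:: python

    import numpy as np
    import pandas as pd

    from qolmat.imputations.imputers_pytorch import ImputerDiffusion
    from qolmat.imputations.diffusions.ddpms import TabDDPM

    # Toy tabular data with roughly 15% missing entries
    rng = np.random.default_rng(1)
    df = pd.DataFrame(rng.normal(size=(300, 4)), columns=["a", "b", "c", "d"])
    df = df.mask(rng.random(df.shape) < 0.15)

    # TabDDPM is trained with the self-supervised masking described above;
    # at inference, the missing cells start from Gaussian noise and are
    # denoised over the T steps with the learned network
    imputer = ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
    df_imputed = imputer.fit_transform(df)
    print(df_imputed.isna().sum())  # all holes filled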
