Commit 1c0e73e

buy the conclusions of the new MIT paper with their solution for the internal covariate shift
1 parent e513881 commit 1c0e73e

File tree

3 files changed: +84 -7 lines changed

- README.md
- setup.py
- vector_quantize_pytorch/vector_quantize_pytorch.py


README.md

Lines changed: 11 additions & 1 deletion
@@ -307,7 +307,7 @@ if __name__ == '__main__':
 
 - [x] allow for multi-headed codebooks
 - [x] support masking
-
+- [ ] make sure affine param works in a distributed setting (batch mean and variance must be synced with dist reduce or whatever)
 
 ## Citations
 
@@ -416,3 +416,13 @@ if __name__ == '__main__':
     volume = {abs/2304.08612}
 }
 ```
+
+```bibtex
+@inproceedings{huh2023improvedvqste,
+    title = {Straightening Out the Straight-Through Estimator: Overcoming Optimization Challenges in Vector Quantized Networks},
+    author = {Huh, Minyoung and Cheung, Brian and Agrawal, Pulkit and Isola, Phillip},
+    booktitle = {International Conference on Machine Learning},
+    year = {2023},
+    organization = {PMLR}
+}
+```
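The new todo item above concerns the distributed case: the batch mean and variance feeding the affine buffers are per-rank quantities, so under data parallelism they would need to be all-reduced before the EMA update. A minimal sketch of that idea, assuming plain `torch.distributed`; the helper name `synced_batch_stats` and the naive averaging of per-rank variances are illustrative assumptions, not the library's implementation:

```python
import torch
import torch.distributed as distributed

def synced_batch_stats(data):
    # data: (num_vectors, dim) of flattened encoder outputs on this rank
    mean = data.mean(dim = 0)
    variance = data.var(dim = 0, unbiased = False)

    if distributed.is_initialized() and distributed.get_world_size() > 1:
        world_size = distributed.get_world_size()

        # sum across ranks, then average, so every rank updates its
        # EMA buffers with the same statistics
        distributed.all_reduce(mean)
        distributed.all_reduce(variance)
        mean = mean / world_size
        # note: averaging per-rank variances ignores the spread of the
        # per-rank means, so this is only an approximation
        variance = variance / world_size

    return mean, variance
```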

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '1.5.19',
+  version = '1.6.0',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   long_description_content_type = 'text/markdown',

vector_quantize_pytorch/vector_quantize_pytorch.py

Lines changed: 72 additions & 5 deletions
@@ -6,7 +6,7 @@
 import torch.distributed as distributed
 from torch.cuda.amp import autocast
 
-from einops import rearrange, repeat, pack, unpack
+from einops import rearrange, repeat, reduce, pack, unpack
 from contextlib import contextmanager
 
 def exists(val):
@@ -239,7 +239,10 @@ def __init__(
         learnable_codebook = False,
         gumbel_sample = gumbel_sample,
         sample_codebook_temp = 1.,
-        ema_update = True
+        ema_update = True,
+        affine_param = False,
+        affine_param_batch_decay = 0.99,
+        affine_param_codebook_decay = 0.9
     ):
         super().__init__()
         self.transform_input = identity
@@ -278,6 +281,22 @@ def __init__(
         else:
             self.register_buffer('embed', embed)
 
+        # affine related params
+
+        self.affine_param = affine_param
+
+        if not affine_param:
+            return
+
+        self.affine_param_batch_decay = affine_param_batch_decay
+        self.affine_param_codebook_decay = affine_param_codebook_decay
+
+        self.register_buffer('batch_mean', None)
+        self.register_buffer('batch_variance', None)
+
+        self.register_buffer('codebook_mean', None)
+        self.register_buffer('codebook_variance', None)
+
     @torch.jit.ignore
     def init_embed_(self, data):
         if self.initted:
@@ -296,6 +315,29 @@ def init_embed_(self, data):
         self.cluster_size.data.copy_(cluster_size)
         self.initted.data.copy_(torch.Tensor([True]))
 
+    @torch.jit.ignore
+    def update_with_decay(self, buffer_name, new_value, decay):
+        old_value = getattr(self, buffer_name)
+
+        if not exists(old_value):
+            self.register_buffer(buffer_name, new_value)
+            return
+
+        value = old_value * decay + new_value * (1 - decay)
+        self.register_buffer(buffer_name, value)
+
+    @torch.jit.ignore
+    def update_affine(self, data, embed):
+        assert self.affine_param
+
+        var_fn = partial(torch.var, unbiased = False)
+
+        self.update_with_decay('batch_mean', reduce(data, '... d -> d', 'mean'), self.affine_param_batch_decay)
+        self.update_with_decay('batch_variance', reduce(data, '... d -> d', var_fn), self.affine_param_batch_decay)
+
+        self.update_with_decay('codebook_mean', reduce(embed, '... d -> d', 'mean'), self.affine_param_codebook_decay)
+        self.update_with_decay('codebook_variance', reduce(embed, '... d -> d', var_fn), self.affine_param_codebook_decay)
+
     def replace(self, batch_samples, batch_mask):
         for ind, (samples, mask) in enumerate(zip(batch_samples.unbind(dim = 0), batch_mask.unbind(dim = 0))):
             if not torch.any(mask):
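`update_with_decay` is a plain exponential moving average, re-registered as a buffer on every call so the running statistics live on the module; on the first call the buffer is still `None` and the new value is taken as-is. A toy check of the arithmetic (the observed values are made up) showing why the defaults added here, 0.99 for batch statistics and 0.9 for codebook statistics, make the batch statistics the slower-moving of the two:

```python
import torch

def ema(old, new, decay):
    # mirrors update_with_decay: first observation is taken directly,
    # afterwards blend old and new with the given decay
    if old is None:
        return new
    return old * decay + new * (1 - decay)

running_batch, running_codebook = None, None
for observed in (torch.tensor(1.), torch.tensor(0.), torch.tensor(0.)):
    running_batch = ema(running_batch, observed, decay = 0.99)
    running_codebook = ema(running_codebook, observed, decay = 0.9)

print(running_batch)     # tensor(0.9801) - barely moved from the first value
print(running_codebook)  # tensor(0.8100) - moves noticeably faster
```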
@@ -340,8 +382,16 @@ def forward(
 
         self.init_embed_(flatten)
 
+        if self.affine_param:
+            self.update_affine(flatten, self.embed)
+
         embed = self.embed if not self.learnable_codebook else self.embed.detach()
 
+        if self.affine_param:
+            codebook_std = self.codebook_variance.clamp(min = 1e-5).sqrt()
+            batch_std = self.batch_variance.clamp(min = 1e-5).sqrt()
+            embed = (embed - self.codebook_mean) * (batch_std / codebook_std) + self.batch_mean
+
         dist = -torch.cdist(flatten, embed, p = 2)
 
         embed_ind, embed_onehot, straight_through_mult = self.gumbel_sample(dist, dim = -1, temperature = sample_codebook_temp, training = self.training)
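The rescale of `embed` above is moment matching: codebook vectors are standardized with the codebook's running mean and std, then re-expressed with the batch's running mean and std, so the distance computation that follows happens in the statistics of the incoming data. A self-contained check of that property (tensor names and shapes here are illustrative, not the module's internals):

```python
import torch

dim = 4
embed = torch.randn(16, dim) * 3. + 5.                        # stand-in codebook
batch_mean, batch_variance = torch.zeros(dim), torch.ones(dim)
codebook_mean = embed.mean(dim = 0)
codebook_variance = embed.var(dim = 0, unbiased = False)

codebook_std = codebook_variance.clamp(min = 1e-5).sqrt()
batch_std = batch_variance.clamp(min = 1e-5).sqrt()

rescaled = (embed - codebook_mean) * (batch_std / codebook_std) + batch_mean

# the rescaled codebook now carries the batch statistics
print(rescaled.mean(dim = 0))                   # ~ batch_mean (zeros)
print(rescaled.var(dim = 0, unbiased = False))  # ~ batch_variance (ones)
```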
@@ -355,6 +405,10 @@ def forward(
             quantize = quantize * mult
 
         if self.training and self.ema_update:
+
+            if self.affine_param:
+                flatten = (flatten - self.batch_mean) * (codebook_std / batch_std) + self.codebook_mean
+
             cluster_size = embed_onehot.sum(dim = 1)
 
             self.all_reduce_fn(cluster_size)
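The mirrored expression applied to `flatten` here is the inverse of the rescale above: before the EMA codebook update, inputs are mapped back into the codebook's own statistics, so the running codebook is maintained in its original, un-rescaled space. A small self-contained check that the two transforms are inverses of each other (all names are illustrative):

```python
import torch

dim = 4
batch_mean, batch_std = torch.randn(dim), torch.rand(dim) + 0.5
codebook_mean, codebook_std = torch.randn(dim), torch.rand(dim) + 0.5

to_batch_space    = lambda t: (t - codebook_mean) * (batch_std / codebook_std) + batch_mean
to_codebook_space = lambda t: (t - batch_mean) * (codebook_std / batch_std) + codebook_mean

x = torch.randn(8, dim)
assert torch.allclose(to_batch_space(to_codebook_space(x)), x, atol = 1e-5)
```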
@@ -565,8 +619,10 @@ def __init__(
         reinmax = False, # using reinmax for improved straight-through, assuming straight through helps at all
         sync_codebook = False,
         ema_update = True,
-        learnable_codebook = False
-
+        learnable_codebook = False,
+        affine_param = False,
+        affine_param_batch_decay = 0.99,
+        affine_param_codebook_decay = 0.9
     ):
         super().__init__()
         self.dim = dim
@@ -598,7 +654,7 @@ def __init__(
             straight_through = straight_through
         )
 
-        self._codebook = codebook_class(
+        codebook_kwargs = dict(
             dim = codebook_dim,
             num_codebooks = heads if separate_codebook_per_head else 1,
             codebook_size = codebook_size,
@@ -615,6 +671,17 @@ def __init__(
             ema_update = ema_update
         )
 
+        if affine_param:
+            assert not use_cosine_sim
+            codebook_kwargs = dict(
+                **codebook_kwargs,
+                affine_param = True,
+                affine_param_batch_decay = affine_param_batch_decay,
+                affine_param_codebook_decay = affine_param_codebook_decay
+            )
+
+        self._codebook = codebook_class(**codebook_kwargs)
+
         self.codebook_size = codebook_size
 
         self.accept_image_fmap = accept_image_fmap
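Taken together, the feature is exposed as three new keyword arguments on the quantizer and forwarded to the codebook class, with an assert against `use_cosine_sim`, presumably because the rescaling is defined for the euclidean codebook. A hedged usage sketch; the `VectorQuantize` class name, the input shape, and the return signature follow the package's README rather than this diff:

```python
import torch
from vector_quantize_pytorch import VectorQuantize

vq = VectorQuantize(
    dim = 256,
    codebook_size = 512,
    affine_param = True,                  # enable the affine re-parameterization added in this commit
    affine_param_batch_decay = 0.99,      # EMA decay for batch mean / variance
    affine_param_codebook_decay = 0.9     # EMA decay for codebook mean / variance
)

x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x)   # (1, 1024, 256), (1, 1024), (1)
```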
