
Commit bf53137

add grouped residual vq technique from hifi-codec paper

1 parent 09a778f

4 files changed (+111, −2)

README.md

Lines changed: 31 additions & 0 deletions

````diff
@@ -81,6 +81,28 @@ quantized, indices, commit_loss = residual_vq(x)
 # (batch, seq, dim), (quantizer, batch, seq), (quantizer, batch)
 ```
 
+<a href="https://arxiv.org/abs/2305.02765">A recent paper</a> further proposes to do residual VQ on groups of the feature dimension, showing equivalent results to Encodec while using far fewer codebooks. You can use it by importing `GroupedResidualVQ`
+
+```python
+import torch
+from vector_quantize_pytorch import GroupedResidualVQ
+
+residual_vq = GroupedResidualVQ(
+    dim = 256,
+    num_quantizers = 8,      # specify number of quantizers
+    groups = 2,
+    codebook_size = 1024,    # codebook size
+)
+
+x = torch.randn(1, 1024, 256)
+
+quantized, indices, commit_loss = residual_vq(x)
+
+# (1, 1024, 256), (2, 1, 1024, 8), (2, 1, 8)
+# (batch, seq, dim), (groups, batch, seq, quantizer), (groups, batch, quantizer)
+
+```
+
 ## Initialization
 
 The SoundStream paper proposes that the codebook should be initialized by the kmeans centroids of the first batch. You can easily turn on this feature with one flag `kmeans_init = True`, for either `VectorQuantize` or `ResidualVQ` class
@@ -375,3 +397,12 @@ if __name__ == '__main__':
     year = {2023}
 }
 ```
+
+```bibtex
+@inproceedings{Yang2023HiFiCodecGV,
+    title = {HiFi-Codec: Group-residual Vector quantization for High Fidelity Audio Codec},
+    author = {Dongchao Yang and Songxiang Liu and Rongjie Huang and Jinchuan Tian and Chao Weng and Yuexian Zou},
+    year = {2023}
+}
+```
+
````
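For intuition, the new grouped variant (see `vector_quantize_pytorch/residual_vq.py` below) simply chunks the feature dimension into `groups` pieces, runs an independent `ResidualVQ` over each chunk, and recombines the results. A minimal standalone sketch of that equivalence, hypothetical code rather than the library's API, with the index and loss layouts left to whatever `ResidualVQ` itself returns:

```python
import torch
from vector_quantize_pytorch import ResidualVQ

groups, dim = 2, 256

# one independent residual VQ per group, each over dim // groups features
rvqs = [
    ResidualVQ(dim = dim // groups, num_quantizers = 8, codebook_size = 1024)
    for _ in range(groups)
]

x = torch.randn(1, 1024, dim)

# chunk the feature dimension and quantize each chunk on its own
chunks = x.chunk(groups, dim = -1)
outs = [rvq(chunk) for rvq, chunk in zip(rvqs, chunks)]

quantized, indices, commit_loss = zip(*outs)

quantized   = torch.cat(quantized, dim = -1)  # back to (1, 1024, 256)
indices     = torch.stack(indices)            # per-group indices, stacked along a new leading groups dimension
commit_loss = torch.stack(commit_loss)        # per-group commit losses, stacked the same way
```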

setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@
 setup(
   name = 'vector_quantize_pytorch',
   packages = find_packages(),
-  version = '1.2.3',
+  version = '1.4.0',
   license='MIT',
   description = 'Vector Quantization - Pytorch',
   long_description_content_type = 'text/markdown',
```
vector_quantize_pytorch/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,3 +1,3 @@
 from vector_quantize_pytorch.vector_quantize_pytorch import VectorQuantize
-from vector_quantize_pytorch.residual_vq import ResidualVQ
+from vector_quantize_pytorch.residual_vq import ResidualVQ, GroupedResidualVQ
 from vector_quantize_pytorch.random_projection_quantizer import RandomProjectionQuantizer
```

vector_quantize_pytorch/residual_vq.py

Lines changed: 78 additions & 0 deletions

```diff
@@ -1,5 +1,6 @@
 from math import ceil
 from functools import partial
+from itertools import zip_longest
 from random import randrange
 
 import torch
@@ -14,6 +15,9 @@
 def exists(val):
     return val is not None
 
+def default(val, d):
+    return val if exists(val) else d
+
 def round_up_multiple(num, mult):
     return ceil(num / mult) * mult
 
@@ -183,3 +187,77 @@ def forward(
         ret = (*ret, all_codes)
 
         return ret
+
+# grouped residual vq
+
+class GroupedResidualVQ(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        groups = 1,
+        accept_image_fmap = False,
+        **kwargs
+    ):
+        super().__init__()
+        self.dim = dim
+        self.groups = groups
+        assert (dim % groups) == 0
+        dim_per_group = dim // groups
+
+        self.accept_image_fmap = accept_image_fmap
+
+        self.rvqs = nn.ModuleList([])
+
+        for _ in range(groups):
+            self.rvqs.append(ResidualVQ(
+                dim = dim_per_group,
+                accept_image_fmap = accept_image_fmap,
+                **kwargs
+            ))
+
+    @property
+    def codebooks(self):
+        return torch.stack(tuple(rvq.codebooks for rvq in self.rvqs))
+
+    def forward(
+        self,
+        x,
+        indices = None,
+        return_all_codes = False
+    ):
+        shape = x.shape
+        split_dim = 1 if self.accept_image_fmap else -1
+        assert shape[split_dim] == self.dim
+
+        # split the feature dimension into groups
+
+        x = x.chunk(self.groups, dim = split_dim)
+
+        indices = default(indices, tuple())
+        return_ce_loss = len(indices) > 0
+        assert len(indices) == 0 or len(indices) == self.groups
+
+        forward_kwargs = dict(return_all_codes = return_all_codes)
+
+        # invoke residual vq on each group
+
+        out = tuple(rvq(chunk, indices = chunk_indices, **forward_kwargs) for rvq, chunk, chunk_indices in zip_longest(self.rvqs, x, indices))
+        out = tuple(zip(*out))
+
+        # if returning cross entropy loss to rvq codebooks
+
+        if return_ce_loss:
+            quantized, ce_losses = out
+            return torch.cat(quantized, dim = split_dim), sum(ce_losses)
+
+        # otherwise, get all the zipped outputs and combine them
+
+        quantized, all_indices, commit_losses, *maybe_all_codes = out
+
+        quantized = torch.cat(quantized, dim = split_dim)
+        all_indices = torch.stack(all_indices)
+        commit_losses = torch.stack(commit_losses)
+
+        ret = (quantized, all_indices, commit_losses, *maybe_all_codes)
+        return ret
```
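Beyond the default path shown in the README, the new `forward` also accepts one indices tensor per group, in which case (judging from the two-element unpacking above) each underlying `ResidualVQ` is assumed to return `(quantized, ce_loss)` and the wrapper returns the concatenated quantization together with the summed cross entropy loss. A short usage sketch under that assumption; constructor arguments mirror the README example:

```python
import torch
from vector_quantize_pytorch import GroupedResidualVQ

group_rvq = GroupedResidualVQ(
    dim = 256,
    groups = 2,
    num_quantizers = 8,
    codebook_size = 1024
)

x = torch.randn(1, 1024, 256)

# default path: quantized output plus per-group indices / commit losses stacked along a leading groups dimension
quantized, indices, commit_loss = group_rvq(x)

# the new `codebooks` property stacks the per-group ResidualVQ codebooks the same way
codebooks = group_rvq.codebooks

# cross entropy path (assumed): one index tensor per group, here reusing the indices from the first call
quantized, ce_loss = group_rvq(x, indices = tuple(indices))
```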
