add SimVQ with or without rotation trick https://arxiv.org/abs/2411.02038

lucidrains · lucidrains · commit 97b9a87afb3d · 2024-11-11T09:09:07.000-08:00
diff --git a/README.md b/README.md
@@ -714,3 +714,12 @@ assert loss.item() >= 0
     url     = {https://api.semanticscholar.org/CorpusID:273229218}
 }
 ```
+
+```bibtex
+@inproceedings{Zhu2024AddressingRC,
+    title   = {Addressing Representation Collapse in Vector Quantized Models with One Linear Layer},
+    author  = {Yongxin Zhu and Bocheng Li and Yifei Xin and Linli Xu},
+    year    = {2024},
+    url     = {https://api.semanticscholar.org/CorpusID:273812459}
+}
+```
diff --git a/examples/autoencoder.py b/examples/autoencoder.py
@@ -71,7 +71,6 @@ def iterate_dataset(data_loader):
     shuffle=True,
 )
 
-print("baseline")
 torch.random.manual_seed(seed)
 
 model = SimpleVQAutoEncoder(
diff --git a/examples/autoencoder_fsq.py b/examples/autoencoder_fsq.py
@@ -76,7 +76,6 @@ def iterate_dataset(data_loader):
     shuffle=True,
 )
 
-print("baseline")
 torch.random.manual_seed(seed)
 model = SimpleFSQAutoEncoder(levels).to(device)
 opt = torch.optim.AdamW(model.parameters(), lr=lr)
diff --git a/examples/autoencoder_lfq.py b/examples/autoencoder_lfq.py
@@ -87,8 +87,6 @@ def iterate_dataset(data_loader):
     shuffle=True,
 )
 
-print("baseline")
-
 torch.random.manual_seed(seed)
 
 model = LFQAutoEncoder(
diff --git a/examples/autoencoder_sim_vq.py b/examples/autoencoder_sim_vq.py
@@ -22,11 +22,11 @@ def SimVQAutoEncoder(**vq_kwargs):
         nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
         nn.MaxPool2d(kernel_size=2, stride=2),
         nn.GELU(),
-        nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),
+        nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
         nn.MaxPool2d(kernel_size=2, stride=2),
-        SimVQ(dim=64, accept_image_fmap = True, **vq_kwargs),
+        SimVQ(dim=32, accept_image_fmap = True, **vq_kwargs),
         nn.Upsample(scale_factor=2, mode="nearest"),
-        nn.Conv2d(64, 16, kernel_size=3, stride=1, padding=1),
+        nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
         nn.GELU(),
         nn.Upsample(scale_factor=2, mode="nearest"),
         nn.Conv2d(16, 1, kernel_size=3, stride=1, padding=1),
@@ -73,11 +73,11 @@ def iterate_dataset(data_loader):
     shuffle=True,
 )
 
-print("baseline")
 torch.random.manual_seed(seed)
 
 model = SimVQAutoEncoder(
-    codebook_size=num_codes,
+    codebook_size = num_codes,
+    rotation_trick = rotation_trick
 ).to(device)
 
 opt = torch.optim.AdamW(model.parameters(), lr=lr)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "vector-quantize-pytorch"
-version = "1.19.5"
+version = "1.20.0"
 description = "Vector Quantization - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/vector_quantize_pytorch/sim_vq.py b/vector_quantize_pytorch/sim_vq.py
@@ -6,7 +6,9 @@
 import torch.nn.functional as F
 
 from einx import get_at
-from einops import einsum, rearrange, repeat, reduce, pack, unpack
+from einops import rearrange, pack, unpack
+
+from vector_quantize_pytorch.vector_quantize_pytorch import rotate_from_to
 
 # helper functions
 
@@ -37,7 +39,9 @@ def __init__(
         dim,
         codebook_size,
         init_fn: Callable = identity,
-        accept_image_fmap = False
+        accept_image_fmap = False,
+        rotation_trick = True,  # works even better with rotation trick turned on, with no asymmetric commit loss or straight through
+        commit_loss_input_to_quantize_weight = 0.25,
     ):
         super().__init__()
         self.accept_image_fmap = accept_image_fmap
@@ -50,6 +54,17 @@ def __init__(
         self.codebook_to_codes = nn.Linear(dim, dim, bias = False)
         self.register_buffer('codebook', codebook)
 
+
+        # whether to use the rotation trick from Fifty et al.
+        # https://arxiv.org/abs/2410.06424
+
+        self.rotation_trick = rotation_trick
+        self.register_buffer('zero', torch.tensor(0.), persistent = False)
+
+        # commit loss weighting - weighting the input-to-quantize term a bit less is crucial for it to work
+
+        self.commit_loss_input_to_quantize_weight = commit_loss_input_to_quantize_weight
+
     def forward(
         self,
         x
@@ -68,14 +83,21 @@ def forward(
 
         quantized = get_at('[c] d, b n -> b n d', implicit_codebook, indices)
 
-        # commit loss
+        if self.rotation_trick:
+            # rotation trick from @cfifty
+
+            quantized = rotate_from_to(quantized, x)
+
+            commit_loss = self.zero
+        else:
+            # commit loss and straight through, as was done in the paper
 
-        commit_loss = (
-            0.25 * F.mse_loss(x, quantized.detach()) +
-            F.mse_loss(x.detach(), quantized)
-        )
+            commit_loss = (
+                F.mse_loss(x, quantized.detach()) * self.commit_loss_input_to_quantize_weight +
+                F.mse_loss(x.detach(), quantized)
+            )
 
-        quantized = (quantized - x).detach() + x
+            quantized = (quantized - x).detach() + x
 
         if self.accept_image_fmap:
             quantized = inverse_pack(quantized)

Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,6 @@ def iterate_dataset(data_loader):`
`71`	`71`	`shuffle=True,`
`72`	`72`	`)`
`73`	`73`
`74`		`-print("baseline")`
`75`	`74`	`torch.random.manual_seed(seed)`
`76`	`75`
`77`	`76`	`model = SimpleVQAutoEncoder(`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,6 @@ def iterate_dataset(data_loader):`
`76`	`76`	`shuffle=True,`
`77`	`77`	`)`
`78`	`78`
`79`		`-print("baseline")`
`80`	`79`	`torch.random.manual_seed(seed)`
`81`	`80`	`model = SimpleFSQAutoEncoder(levels).to(device)`
`82`	`81`	`opt = torch.optim.AdamW(model.parameters(), lr=lr)`
Original file line number	Diff line number	Diff line change
`@@ -87,8 +87,6 @@ def iterate_dataset(data_loader):`
`87`	`87`	`shuffle=True,`
`88`	`88`	`)`
`89`	`89`
`90`		`-print("baseline")`
`91`		`-`
`92`	`90`	`torch.random.manual_seed(seed)`
`93`	`91`
`94`	`92`	`model = LFQAutoEncoder(`