
Commit 5e3fe02

Fix grad strides warning when using ddp (#375)
* Check for xformers install when using flash attention in diffusion unet
* Ensure tensors are contiguous
* Formatting fix
1 parent 4aee21a commit 5e3fe02

File tree

1 file changed: +5 -2 lines changed


generative/networks/nets/diffusion_model_unet.py

Lines changed: 5 additions & 2 deletions
@@ -334,9 +334,9 @@ def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor:
             x = block(x, context=context)
 
         if self.spatial_dims == 2:
-            x = x.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
+            x = x.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
         if self.spatial_dims == 3:
-            x = x.reshape(batch, height, width, depth, inner_dim).permute(0, 4, 1, 2, 3)
+            x = x.reshape(batch, height, width, depth, inner_dim).permute(0, 4, 1, 2, 3).contiguous()
 
         x = self.proj_out(x)
         return x + residual
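
The `.contiguous()` calls are the substance of the fix: `permute` only rearranges strides over the same storage, so the result (and the gradients flowing back through it) can have a memory layout that does not match the flat, contiguous buckets DistributedDataParallel allocates for gradient reduction, which is what produces the grad strides warning named in the commit title. A minimal standalone sketch (toy shapes, not code from this repository) of what the added calls do:

```python
import torch

# Toy shapes chosen only for illustration; the real module uses the
# transformer block's batch/height/width/inner_dim.
batch, height, width, inner_dim = 2, 8, 8, 64
x = torch.randn(batch * height * width, inner_dim)

# permute() returns a view with rearranged strides, not a copy.
y = x.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
print(y.is_contiguous())  # False: strides no longer follow row-major order for this shape
print(y.stride())         # (4096, 1, 512, 64)

# .contiguous() materialises a copy with standard row-major strides,
# giving downstream gradients the layout DDP's gradient buckets expect.
z = y.contiguous()
print(z.is_contiguous())  # True
print(z.stride())         # (4096, 64, 8, 1)
```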
@@ -1702,6 +1702,9 @@ def __init__(
                 "`num_channels`."
             )
 
+        if use_flash_attention and not has_xformers:
+            raise ValueError("use_flash_attention is True but xformers is not installed.")
+
         if use_flash_attention is True and not torch.cuda.is_available():
             raise ValueError(
                 "torch.cuda.is_available() should be True but is False. Flash attention is only available for GPU."
