
Commit bd11371

autograd, layernorm bug fix
1 parent bb09024 commit bd11371

File tree:
  neunet/autograd.py
  neunet/nn/layers/batchnorm1d.py
  neunet/nn/layers/batchnorm2d.py
  neunet/nn/layers/layernorm.py

4 files changed: +41 -42 lines

neunet/autograd.py

Lines changed: 21 additions & 29 deletions
@@ -1000,7 +1000,7 @@ def _reverse_broadcast(self, grad):

     def backward(
         self, grad=None
-    ): # grad=self.xp.array(1) # TODO: ASSERT GRAD SHAPE == DATA SHAPE, assert grad.device == self.device
+    ):
         if not self.requires_grad:
             return

@@ -1012,36 +1012,28 @@ def backward(

         self._apply_grad(grad)
         # Perform a topological sort to ensure gradients are calculated in the correct order
-        def toposort(v):
-            tape = []
-            visited_ids = set()
-            stack = [v]
-
-            while stack:
-                node: Tensor = stack.pop()
-                node_id = id(node)
-
-                if node_id in visited_ids:
-                    continue
-
-                visited_ids.add(node_id)
-
-                if node.args is not None:
-                    for child in node.args:
-                        if not isinstance(child, Tensor):
-                            continue
-                        if child.requires_grad is False:
-                            continue
-                        stack.append(child)
-
-                tape.append(node)
+        tape = []
+        visited_ids = set()
+
+        def toposort(v, tape: list, visited_ids: set):
+            # Topological Sort Using DFS
+            if id(v) not in visited_ids:
+                visited_ids.add(id(v))
+                if v.args is None:
+                    return
+                for child in v.args:
+                    if not isinstance(child, Tensor):
+                        continue
+                    if child.requires_grad is False:
+                        continue
+
+                    toposort(child, tape, visited_ids)
+                tape.append(v)

             return tape
-
+
+        tape = toposort(self, tape, visited_ids)
         # Apply the backward function in reverse order
-        for v in toposort(self):
+        for v in reversed(tape):
             v.grad_fn(*v.args, grad=v.grad)

-        # BUGS:
-        # grad X - mean not correct with pytorch; maybe NOT BUG becase small numbers manipulation (Numerical stability issues)
-        # softmax not equals grads with pytorch; place: div; maybe NOT BUG becase small numbers manipulation (Numerical stability issues)????
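
Read as a whole, the new toposort appends a tensor to the tape only after recursing into every tensor in its args, so reversed(tape) visits each node strictly after all of the nodes that consume it, which is exactly the order in which grad_fn must be applied. A minimal sketch of that ordering argument, using a stand-in Node class rather than the repository's Tensor:

    class Node:
        def __init__(self, args=()):
            self.args = args   # tensors this node was computed from
            self.grad = 0.0    # gradient accumulated from its consumers

    def toposort(v, tape, visited_ids):
        # Post-order DFS: children are appended before the nodes that use them.
        if id(v) in visited_ids:
            return tape
        visited_ids.add(id(v))
        for child in v.args:
            toposort(child, tape, visited_ids)
        tape.append(v)
        return tape

    # Diamond graph: d uses b and c, both of which use a.
    a = Node()
    b, c = Node(args=(a,)), Node(args=(a,))
    d = Node(args=(b, c))

    tape = toposort(d, [], set())
    # a comes first in the tape, so in reversed(tape) it is processed last,
    # i.e. only after both b and c have contributed their gradients to it.
    assert tape.index(a) < min(tape.index(b), tape.index(c), tape.index(d))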

neunet/nn/layers/batchnorm1d.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ def __init__(self, data, args, op, device):
         super().__init__(data, args, op, device=device)

     def grad_fn(X: Tensor, weight: Tensor, bias: Tensor, X_centered, stddev_inv, affine, grad):
+        # The method of calculating the derivative is similar to BatchNorm.
+        # https://chrisyeh96.github.io/2017/08/28/deriving-batchnorm-backprop.html
         X_hat = X_centered * stddev_inv
         batch_size = X.data.shape[0]

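For reference, the compact result of the derivation linked in the new comment, written with the names used in grad_fn (m is batch_size, X_hat = X_centered * stddev_inv, dX_hat = weight.data * grad, and both sums run over the batch axis):

    dX = (stddev_inv / m) * (m * dX_hat - sum(dX_hat) - X_hat * sum(dX_hat * X_hat))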
neunet/nn/layers/batchnorm2d.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ def __init__(self, data, args, op, device):
         super().__init__(data, args, op, device=device)

     def grad_fn(X: Tensor, weight: Tensor, bias: Tensor, X_centered, stddev_inv, affine, grad):
+        # https://math.stackexchange.com/questions/2359981/batch-normalization-equation-derivation
+        # https://arxiv.org/pdf/1502.03167
         batch_size = X.data.shape[0] * X.data.shape[2] * X.data.shape[3]

         axis = (0, 2, 3)
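
Those references reduce the backward pass to the same compact expression; below is a hedged NumPy sketch of it for the 2d case, with the reduction axes and batch_size defined as above (the function name and signature are illustrative only, not this repository's API):

    import numpy as np

    def batchnorm2d_input_grad(grad, gamma, X_centered, stddev_inv, axis=(0, 2, 3)):
        # Compact BatchNorm backward from the linked derivations:
        # dX = (stddev_inv / m) * (m * dX_hat - sum(dX_hat) - X_hat * sum(dX_hat * X_hat))
        # gamma is assumed broadcastable against grad, e.g. shaped (1, C, 1, 1).
        m = np.prod([grad.shape[i] for i in axis])
        X_hat = X_centered * stddev_inv
        dX_hat = gamma * grad
        return (stddev_inv / m) * (
            m * dX_hat
            - np.sum(dX_hat, axis=axis, keepdims=True)
            - X_hat * np.sum(dX_hat * X_hat, axis=axis, keepdims=True)
        )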

neunet/nn/layers/layernorm.py

Lines changed: 16 additions & 13 deletions
@@ -7,20 +7,20 @@
 from neunet.nn.modules import Module
 from neunet.nn.parameter import Parameter

-# class LayerNorm(): #layer with dynamic backpropagation
+# class LayerNorm(Module): #layer with dynamic backpropagation
 #     def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True):
 #         self.normalized_shape = (normalized_shape, ) if isinstance(normalized_shape, int) else normalized_shape
 #         self.eps = eps
 #         self.elementwise_affine = elementwise_affine

 #         if elementwise_affine:
-#             self.weight = Tensor(self.xp.ones((normalized_shape)))
-#             self.bias = Tensor(self.xp.zeros((normalized_shape)))
+#             self.weight: Union[Tensor, None] = Parameter(neunet.tensor(np.ones((normalized_shape)), dtype=np.float32))
+#             self.bias: Union[Tensor, None] = Parameter(neunet.tensor(np.zeros((normalized_shape)), dtype=np.float32))
 #         else:
 #             self.weight = None
 #             self.bias = None

-#     def forward(self, X):
+#     def forward(self, X: Tensor):
 #         axis = tuple(range(-len(self.normalized_shape), 0))

 #         mean = X.mean(axis = axis, keepdims=True)

@@ -46,11 +46,14 @@ def __init__(self, data, args, op, device):
         super().__init__(data, args, op, device=device)

     def grad_fn(X: Tensor, weight: Tensor, bias: Tensor, X_centered, stddev_inv, axis, elementwise_affine, grad):
-        # _axis = list(axis) if isinstance(axis, tuple) else axis
+        # The method of calculating the derivative is similar to BatchNorm.
+        _axis = list(axis) if isinstance(axis, tuple) else axis
         X_hat = X_centered * stddev_inv

         weight_data = weight.data if elementwise_affine else 1
-        weight_size = weight.size if elementwise_affine else 1
+        # N = X.xp.prod(X.xp.array(X.shape)[_axis]) # Takes up a lot of GPU memory
+        N = np.prod(np.array(X.shape)[_axis])
+

         dX_hat = weight_data * grad
         dstddev_inv = (

@@ -59,26 +62,26 @@ def grad_fn(X: Tensor, weight: Tensor, bias: Tensor, X_centered, stddev_inv, axi
             * X.xp.sum(dX_hat * X_centered, axis=axis, keepdims=True)
         )
         dvar = (
-            X.xp.ones_like(X.data) * dstddev_inv * 2 * X_centered / weight_size
+            X.xp.ones_like(X.data) * dstddev_inv * 2 * X_centered / N
         ) # X.xp.prod(X.xp.array(X.shape)[_axis])
         dmean = (
             X.xp.ones_like(X.data)
             * X.xp.sum(dX_hat * stddev_inv, axis=axis, keepdims=True)
             * (-1)
-            / weight_size
+            / N
         ) # X.xp.prod(X.xp.array(X.shape)[_axis])
         grad_X = dX_hat * stddev_inv + dvar + dmean

-        # grad_X = (1 / weight_size) * weight_data * stddev_inv * (
-        #     weight_size * grad
+        # grad_X = (1 / N) * weight_data * stddev_inv * (
+        #     N * grad
         #     - X.xp.sum(grad, axis = axis, keepdims = True)
         #     - X_centered * X.xp.power(stddev_inv, 2) * X.xp.sum(grad * X_centered, axis = axis, keepdims = True)
         # )

         # dX_hat = weight_data * grad
-        # dvar = X.xp.sum(dX_hat * X_centered, axis = axis, keepdims = True) * (-0.5) * X.xp.power(stddev_inv, 3) * 2 * X_centered / weight_size
-        # dmean = (X.xp.sum(dX_hat * (-stddev_inv), axis = axis, keepdims = True) + dvar * X.xp.mean(-2.0 * X_centered, axis = axis, keepdims = True)) * X.xp.ones_like(X.data) / weight_size
-        # grad_X = dX_hat * stddev_inv + dvar + dmean
+        # dvar = X.xp.sum(dX_hat * X_centered, axis = axis, keepdims = True) * (-0.5) * X.xp.power(stddev_inv, 3)
+        # dmean = (X.xp.sum(dX_hat * (-stddev_inv), axis = axis, keepdims = True) + dvar * X.xp.mean(-2.0 * X_centered, axis = axis, keepdims = True)) * X.xp.ones_like(X.data) / N
+        # grad_X = dX_hat * stddev_inv + dvar * 2 * X_centered / N + dmean / N

         if elementwise_affine:
             grad_weight = X.xp.sum(grad * X_hat, axis=0)
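
The substantive fix here is the normalization constant: N is now the element count over the normalized axes, computed with plain NumPy, rather than weight.size, which only equaled that count when elementwise_affine was enabled (the non-affine branch fell back to 1). A hedged NumPy check of the dvar/dmean path, assuming the hidden factor of dstddev_inv is -0.5 * stddev_inv**3 as the surrounding code suggests; names mirror grad_fn but this is not the repository's test code:

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((4, 5))
    grad = rng.standard_normal(X.shape)
    weight = rng.standard_normal(5)        # per-feature affine weight
    axis, eps = (-1,), 1e-5
    _axis = list(axis)
    N = np.prod(np.array(X.shape)[_axis])  # element count over the normalized axes

    X_centered = X - X.mean(axis=axis, keepdims=True)
    stddev_inv = 1.0 / np.sqrt(X.var(axis=axis, keepdims=True) + eps)
    X_hat = X_centered * stddev_inv

    # Three-term path, dividing by N as in the fixed grad_fn
    dX_hat = weight * grad
    dstddev_inv = -0.5 * stddev_inv**3 * np.sum(dX_hat * X_centered, axis=axis, keepdims=True)
    dvar = np.ones_like(X) * dstddev_inv * 2 * X_centered / N
    dmean = np.ones_like(X) * np.sum(dX_hat * stddev_inv, axis=axis, keepdims=True) * (-1) / N
    grad_X = dX_hat * stddev_inv + dvar + dmean

    # Standard compact LayerNorm input gradient, for comparison
    expected = stddev_inv * (
        dX_hat
        - np.mean(dX_hat, axis=axis, keepdims=True)
        - X_hat * np.mean(dX_hat * X_hat, axis=axis, keepdims=True)
    )
    assert np.allclose(grad_X, expected)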
