|
16 | 16 |
|
17 | 17 | The primary aim here is simplicity and minimal dependencies. |
18 | 18 | """ |
19 | | - |
20 | | - |
21 | 19 | import time |
| 20 | +from functools import partial |
22 | 21 |
|
23 | 22 | import datasets |
24 | 23 | import jax |
|
31 | 30 | import jax_scaled_arithmetics as jsa |
32 | 31 |
|
33 | 32 |
|
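| | +# Debug helper: print mean/std statistics and the scale of a scaled tensor.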
| 33 | +def print_mean_std(name, v):
| 34 | +    data, scale = jsa.lax.get_data_scale(v)
| 35 | +    # Always use np.float32, to avoid floating-point errors in descaling + stats.
| 36 | +    v = jsa.asarray(data, dtype=np.float32)
| 37 | +    m, s = np.mean(v), np.std(v)
| 38 | +    print(f"{name}: MEAN({m:.4f}) / STD({s:.4f}) / SCALE({scale:.4f})")
| 39 | +
| 40 | +
34 | 41 | def init_random_params(scale, layer_sizes, rng=npr.RandomState(0)): |
35 | 42 |     return [(scale * rng.randn(m, n), scale * rng.randn(n)) for m, n, in zip(layer_sizes[:-1], layer_sizes[1:])]
36 | 43 |
|
37 | 44 |
|
38 | 45 | def predict(params, inputs): |
39 | 46 |     activations = inputs
40 | 47 |     for w, b in params[:-1]:
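| | +        # Optional debug callbacks (commented out): print statistics of the layer weights, biases and weight gradient.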
| 48 | +        # jsa.ops.debug_callback(partial(print_mean_std, "W"), w)
| 49 | +        # jsa.ops.debug_callback(partial(print_mean_std, "B"), b)
| 50 | +        # (w,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "WGrad"), w)
| 51 | +
41 | 52 |         # Matmul + relu
42 | 53 |         outputs = jnp.dot(activations, w) + b
43 | 54 |         activations = jnp.maximum(outputs, 0)
| 55 | +        # activations = jsa.ops.dynamic_rescale_l2_grad(activations)
44 | 56 |
|
45 | 57 |     final_w, final_b = params[-1]
46 | 58 |     logits = jnp.dot(activations, final_w) + final_b
47 | | -    # Dynamic rescaling of the gradient, as logits gradient not properly scaled.
| 59 | +
| 60 | +    jsa.ops.debug_callback(partial(print_mean_std, "Logits"), logits)
| 61 | +    # (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
| 62 | +
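| | +    # Dynamic rescaling of the gradient, as the logits gradient is otherwise not properly scaled.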
48 | 63 |     logits = jsa.ops.dynamic_rescale_l2_grad(logits)
49 | | -    return logits - logsumexp(logits, axis=1, keepdims=True)
| 64 | +    # logits = logits.astype(np.float32)
| 65 | +    # (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
| 66 | +
| 67 | +    logits = logits - logsumexp(logits, axis=1, keepdims=True)
| 68 | +    jsa.ops.debug_callback(partial(print_mean_std, "Logits2"), logits)
| 69 | +    (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
| 70 | +    return logits
50 | 71 |
|
51 | 72 |
|
52 | 73 | def loss(params, batch): |
53 | 74 |     inputs, targets = batch
54 | 75 |     preds = predict(params, inputs)
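| | +    # Debug callbacks: print statistics of the predictions and of the loss, before and after the mean reduction.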
| 76 | +    jsa.ops.debug_callback(partial(print_mean_std, "Preds"), preds)
| 77 | +    loss = jnp.sum(preds * targets, axis=1)
| 78 | +    # loss = jsa.ops.dynamic_rescale_l2(loss)
| 79 | +    jsa.ops.debug_callback(partial(print_mean_std, "LOSS1"), loss)
| 80 | +    loss = -jnp.mean(loss)
| 81 | +    jsa.ops.debug_callback(partial(print_mean_std, "LOSS2"), loss)
| 82 | +    return loss
55 | | -    return -jnp.mean(jnp.sum(preds * targets, axis=1))
56 | 84 |
|
57 | 85 |
|
@@ -94,10 +122,18 @@ def data_stream(): |
94 | 122 |     def update(params, batch):
95 | 123 |         grads = grad(loss)(params, batch)
96 | 124 |         return [(w - step_size * dw, b - step_size * db) for (w, b), (dw, db) in zip(params, grads)]
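| | +        # Disabled alternative: additionally rescale the updated parameters with jsa.ops.dynamic_rescale_l1.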
| 125 | +        # return [
| 126 | +        #     (jsa.ops.dynamic_rescale_l1(w - step_size * dw), jsa.ops.dynamic_rescale_l1(b - step_size * db))
| 127 | +        #     for (w, b), (dw, db) in zip(params, grads)
| 128 | +        # ]
97 | 129 |
|
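| | +    # Shorter schedule, useful for quick debugging runs.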
| 130 | +    # num_batches = 4
| 131 | +    # num_epochs = 2
98 | 132 |     for epoch in range(num_epochs):
| 133 | +        # print("EPOCH:", epoch)
99 | 134 |         start_time = time.time()
100 | 135 |         for _ in range(num_batches):
| 136 | +            # print("BATCH...")
101 | 137 |             batch = next(batches)
102 | 138 |             # Scaled micro-batch + training dtype cast.
103 | 139 |             batch = jsa.as_scaled_array(batch)
|