@@ -31,9 +31,9 @@
 
 
 def print_mean_std(name, v):
-    _, scale = jsa.lax.get_data_scale(v)
+    data, scale = jsa.lax.get_data_scale(v)
     # Always use np.float32, to avoid floating errors in descaling + stats.
-    v = jsa.asarray(v, dtype=np.float32)
+    v = jsa.asarray(data, dtype=np.float32)
     m, s = np.mean(v), np.std(v)
     print(f"{name}: MEAN({m:.4f}) / STD({s:.4f}) / SCALE({scale:.4f})")
 
@@ -45,19 +45,23 @@ def init_random_params(scale, layer_sizes, rng=npr.RandomState(0)):
 def predict(params, inputs):
     activations = inputs
     for w, b in params[:-1]:
-        jsa.ops.debug_callback(partial(print_mean_std, "W"), w)
-        jsa.ops.debug_callback(partial(print_mean_std, "B"), b)
-        (w,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "WGrad"), w)
+        # jsa.ops.debug_callback(partial(print_mean_std, "W"), w)
+        # jsa.ops.debug_callback(partial(print_mean_std, "B"), b)
+        # (w,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "WGrad"), w)
 
         # Matmul + relu
         outputs = jnp.dot(activations, w) + b
         activations = jnp.maximum(outputs, 0)
+        # activations = jsa.ops.dynamic_rescale_l2_grad(activations)
 
     final_w, final_b = params[-1]
     logits = jnp.dot(activations, final_w) + final_b
 
-    jsa.ops.debug_callback(partial(print_mean_std, "Logits"), logits)
-    (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
+    # jsa.ops.debug_callback(partial(print_mean_std, "Logits"), logits)
+    # (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
+
+    logits = jsa.ops.dynamic_rescale_l1_grad(logits)
+    # (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
 
     return logits - logsumexp(logits, axis=1, keepdims=True)
 
@@ -81,7 +85,7 @@ def accuracy(params, batch):
     step_size = 0.001
     num_epochs = 10
     batch_size = 128
-    training_dtype = np.float32
+    training_dtype = np.float16
 
     train_images, train_labels, test_images, test_labels = datasets.mnist()
     num_train = train_images.shape[0]
@@ -106,15 +110,19 @@ def data_stream():
     @jsa.autoscale
     def update(params, batch):
         grads = grad(loss)(params, batch)
-        return [(w - step_size * dw, b - step_size * db) for (w, b), (dw, db) in zip(params, grads)]
-
-    num_batches = 4
-    num_epochs = 2
+        # return [(w - step_size * dw, b - step_size * db) for (w, b), (dw, db) in zip(params, grads)]
+        return [
+            (jsa.ops.dynamic_rescale_l1(w - step_size * dw), jsa.ops.dynamic_rescale_l1(b - step_size * db))
+            for (w, b), (dw, db) in zip(params, grads)
+        ]
+
+    # num_batches = 4
+    # num_epochs = 2
     for epoch in range(num_epochs):
-        print("EPOCH:", epoch)
+        # print("EPOCH:", epoch)
         start_time = time.time()
         for _ in range(num_batches):
-            print("BATCH...")
+            # print("BATCH...")
             batch = next(batches)
             # Scaled micro-batch + training dtype cast.
             batch = jsa.as_scaled_array(batch)
@@ -127,8 +135,8 @@ def update(params, batch):
 
         # Evaluation in float32, for consistency.
         raw_params = jsa.asarray(params, dtype=np.float32)
-        # train_acc = accuracy(raw_params, (train_images, train_labels))
-        # test_acc = accuracy(raw_params, (test_images, test_labels))
-        # print(f"Epoch {epoch} in {epoch_time:0.2f} sec")
-        # print(f"Training set accuracy {train_acc:0.5f}")
-        # print(f"Test set accuracy {test_acc:0.5f}")
+        train_acc = accuracy(raw_params, (train_images, train_labels))
+        test_acc = accuracy(raw_params, (test_images, test_labels))
+        print(f"Epoch {epoch} in {epoch_time:0.2f} sec")
+        print(f"Training set accuracy {train_acc:0.5f}")
+        print(f"Test set accuracy {test_acc:0.5f}")
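
For reviewers unfamiliar with the scaled-array ops touched in this diff, here is a minimal sketch of how they combine, mirroring the `predict`/`update` changes above. Assumptions: the `jsa` / `np` / `jnp` aliases come from the example's existing imports (not shown in this diff), and the shapes, dtypes and the placement of `dynamic_rescale_l1` are purely illustrative, not part of the change itself.

```python
# Sketch only, not part of the diff.

@jsa.autoscale
def scaled_relu_layer(x, w):
    # Inside an autoscale-traced function, each tensor carries a separate
    # scale factor next to its (here float16) data payload.
    y = jnp.maximum(jnp.dot(x, w), 0)
    # Re-centre the scale of the result so the low-precision data stays well
    # distributed; the L1 variant bases the new scale on the mean |value|.
    return jsa.ops.dynamic_rescale_l1(y)

x = jsa.as_scaled_array(np.random.rand(8, 16).astype(np.float16))
w = jsa.as_scaled_array(np.random.rand(16, 4).astype(np.float16))
out = scaled_relu_layer(x, w)
# Descale to float32 before computing statistics, as the evaluation code above does.
print(np.std(jsa.asarray(out, dtype=np.float32)))
```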