@@ -971,6 +971,45 @@ def nested_loop_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
971971 _launcher(_helion_nested_loop_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
972972 return out
973973
974+ --- assertExpectedJournal(TestLoops.test_register_block_size_codegen_size_hint)
975+ from __future__ import annotations
976+
977+ import torch
978+ import triton
979+ import triton.language as tl
980+ from helion.runtime import default_launcher as _default_launcher
981+
@triton.jit
def _helion_kernel_fixed_block_size(loss_sum, y_true, kl_loss, loss, loss_sum_stride_0, _BLOCK_SIZE_1: tl.constexpr, _RDIM_SIZE_2: tl.constexpr, _BLOCK_SIZE_3: tl.constexpr):
    # Generated Helion/Triton kernel (expected-journal output for
    # TestLoops.test_register_block_size_codegen_size_hint). One program per
    # row-block of size _BLOCK_SIZE_1 along dimension 0.
    pid_0 = tl.program_id(0)
    offset_1 = pid_0 * _BLOCK_SIZE_1
    # Row indices handled by this program.
    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
    # Reduction-dimension indices (used both to zero and to re-read loss_sum).
    indices_4 = tl.arange(0, _RDIM_SIZE_2).to(tl.int32)
    # Zero-initialize a fixed 64x64 tile of the loss_sum accumulator before
    # the atomic adds below. NOTE(review): the [64, 64] literal matches the
    # host-side _BLOCK_SIZE_1/_RDIM_SIZE_2 values of 64 — presumably a
    # size-hint baked in at codegen time; confirm it tracks the constexprs.
    full = tl.full([64, 64], 0.0, tl.float32)
    tl.store(loss_sum + (indices_4[:, None] * loss_sum_stride_0 + indices_4[None, :] * 1), full, None)
    # Sweep the 128-wide inner dimension in tiles of _BLOCK_SIZE_3.
    for offset_2 in tl.range(0, 128, _BLOCK_SIZE_3):
        # NOTE(review): unlike indices_1, the .to(tl.int32) here applies only
        # to tl.arange(...), not to the sum — likely benign since arange is
        # already int32, but inconsistent with the line above; confirm.
        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
        # Copy the y_true tile into kl_loss (stride 128 along rows), then
        # read it back and accumulate into loss_sum with relaxed atomics.
        y_true_val = tl.load(y_true + (indices_1[:, None] * 128 + indices_2[None, :] * 1), None)
        tl.store(kl_loss + (indices_1[:, None] * 128 + indices_2[None, :] * 1), y_true_val, None)
        load_1 = tl.load(kl_loss + (indices_1[:, None] * 128 + indices_2[None, :] * 1), None)
        tl.atomic_add(loss_sum + (indices_1[:, None] * loss_sum_stride_0 + indices_2[None, :] * 1), load_1, mask=None, sem='relaxed')
    # Row-wise sum of the accumulated tile -> per-row loss.
    load = tl.load(loss_sum + (indices_4[:, None] * loss_sum_stride_0 + indices_4[None, :] * 1), None)
    sum_1 = tl.cast(tl.sum(load, 1), tl.float32)
    tl.store(loss + indices_1 * 1, sum_1, None)
999+
def kernel_fixed_block_size(y_pred: torch.Tensor, y_true: torch.Tensor, *, _launcher=_default_launcher):
    """Host-side wrapper: allocate output buffers and launch the Triton kernel.

    Returns the mean per-row loss as a 0-dim tensor.
    """
    num_rows, _num_cols = y_pred.shape
    # Tile sizes fixed at codegen time by Helion.
    _BLOCK_SIZE_1 = 64
    _RDIM_SIZE_2 = 64
    _BLOCK_SIZE_3 = 64
    bt_size, n_block = 64, 128
    # Output / scratch buffers on the input's device.
    loss = torch.zeros((num_rows,), dtype=torch.float32, device=y_pred.device)
    kl_loss = torch.zeros_like(y_pred)
    loss_sum = torch.zeros([bt_size, n_block], dtype=torch.float32, device=y_pred.device)
    _launcher(_helion_kernel_fixed_block_size, (triton.cdiv(64, _BLOCK_SIZE_1),), loss_sum, y_true, kl_loss, loss, loss_sum.stride(0), _BLOCK_SIZE_1, _RDIM_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
    return torch.sum(loss) / num_rows
1012+
9741013--- assertExpectedJournal(TestLoops.test_reorder_with_register_block_size)
9751014from __future__ import annotations
9761015
0 commit comments