Commit 62763b5

[Torchax] Add attention sink support in torchax (#1038)
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
1 parent 330cb1b commit 62763b5

File tree

3 files changed: +128 -47 lines changed
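For readers unfamiliar with the term: an attention sink here is a learned per-head logit that joins the softmax normalization without contributing a value vector, letting a head dump probability mass instead of spreading it over real tokens. A minimal JAX sketch of that formulation (an illustration only, not the Pallas kernel added in this change; the function name is hypothetical):

import jax
import jax.numpy as jnp

def reference_attention_with_sink(q, k, v, sink, sm_scale):
    # q: (num_heads, head_dim); k, v: (seq_len, num_heads, head_dim)
    # sink: (num_heads,) learned logit, one per query head.
    logits = jnp.einsum("nd,snd->ns", q, k) * sm_scale           # (heads, seq)
    logits = jnp.concatenate([logits, sink[:, None]], axis=-1)   # append sink logit
    probs = jax.nn.softmax(logits, axis=-1)
    # The sink column absorbs probability mass but contributes no value.
    return jnp.einsum("ns,snd->nd", probs[:, :-1], v)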

tests/layers/vllm/test_attention.py

Lines changed: 49 additions & 11 deletions
@@ -39,31 +39,42 @@
 MAX_BLOCKS_PER_SEQ = 8
 
 
-def create_inputs(mesh: Mesh,
-                  q_dtype: jnp.dtype = jnp.bfloat16,
-                  kv_dtype: jnp.dtype = jnp.bfloat16):
+def create_inputs(
+    mesh: Mesh,
+    q_dtype: jnp.dtype = jnp.bfloat16,
+    kv_dtype: jnp.dtype = jnp.bfloat16,
+    total_tokens: int = TOTAL_TOKENS,
+    num_seqs: int = NUM_SEQS,
+    max_num_seqs: int = MAX_NUM_SEQS,
+    num_heads: int = NUM_HEADS,
+    num_kv_heads: int = NUM_KV_HEADS,
+    head_dim: int = HEAD_DIM,
+    num_blocks: int = NUM_BLOCKS,
+    block_size: int = BLOCK_SIZE,
+    max_blocks_per_seq: int = MAX_BLOCKS_PER_SEQ,
+):
     key = jax.random.key(0)
-    q = jax.random.uniform(key, (TOTAL_TOKENS, NUM_HEADS * HEAD_DIM),
+    q = jax.random.uniform(key, (total_tokens, num_heads * head_dim),
                            dtype=q_dtype)
-    k = jax.random.uniform(key, (TOTAL_TOKENS, NUM_KV_HEADS * HEAD_DIM),
+    k = jax.random.uniform(key, (total_tokens, num_kv_heads * head_dim),
                            dtype=q_dtype)
-    v = jax.random.uniform(key, (TOTAL_TOKENS, NUM_KV_HEADS * HEAD_DIM),
+    v = jax.random.uniform(key, (total_tokens, num_kv_heads * head_dim),
                            dtype=q_dtype)
     q = torch_view(q)
     k = torch_view(k)
     v = torch_view(v)
 
-    kv_cache_shape = get_kv_cache_shape_with_mesh(mesh, NUM_BLOCKS, BLOCK_SIZE,
-                                                  NUM_KV_HEADS, HEAD_DIM,
+    kv_cache_shape = get_kv_cache_shape_with_mesh(mesh, num_blocks, block_size,
+                                                  num_kv_heads, head_dim,
                                                   kv_dtype)
     kv_cache = jax.random.normal(key, kv_cache_shape, dtype=kv_dtype)
 
-    positions = jnp.ones((TOTAL_TOKENS, ), dtype=jnp.int32)
-    block_tables = jnp.zeros((MAX_NUM_SEQS * MAX_BLOCKS_PER_SEQ),
+    positions = jnp.ones((total_tokens, ), dtype=jnp.int32)
+    block_tables = jnp.zeros((max_num_seqs * max_blocks_per_seq),
                              dtype=jnp.int32).reshape(-1)
     seq_lens = jnp.array([5, 5, 0, 0], dtype=jnp.int32)
     query_start_loc = jnp.array([0, 5, 10, 10, 10], dtype=jnp.int32)
-    request_distribution = jnp.array([0, 0, NUM_SEQS], dtype=jnp.int32)
+    request_distribution = jnp.array([0, 0, num_seqs], dtype=jnp.int32)
 
     metadata = AttentionMetadata(
         input_positions=positions,
@@ -276,3 +287,30 @@ def test_forward_with_output_scale_raises_error(self, mesh):
                          torch.tensor([]),
                          metadata,
                          output_scale=output_scale)
+
+    def test_forward_with_attention_sink(self, mesh):
+        head_dim = 64
+        sinks = torch.rand([NUM_HEADS], dtype=torch.float32)
+
+        impl = PallasAttentionBackendImpl(num_heads=NUM_HEADS,
+                                          head_size=head_dim,
+                                          scale=0.088,
+                                          num_kv_heads=NUM_KV_HEADS,
+                                          alibi_slopes=None,
+                                          sliding_window=None,
+                                          kv_cache_dtype="auto",
+                                          attn_type=AttentionType.DECODER,
+                                          sinks=sinks)
+
+        layer = MagicMock()
+        layer.layer_name = "0"
+
+        query, key, value, kv_cache, metadata = create_inputs(
+            mesh, head_dim=head_dim)
+
+        with torchax.default_env(), set_vllm_model_wrapper_context(
+                kv_caches=[kv_cache],
+                mesh=mesh,
+                layer_name_to_kvcache_index={'0': 0}):
+            assert impl.sinks is not None
+            impl.forward(layer, query, key, value, torch.tensor([]), metadata)
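Because create_inputs now exposes every dimension as a keyword argument with the old module-level constants as defaults, a test can override only the dimension it cares about, as the new head_dim=64 case does. A hedged usage sketch (constants as defined earlier in this test module):

# Build inputs for the head_dim == 64 kernel path; all other dimensions keep their defaults.
query, key, value, kv_cache, metadata = create_inputs(mesh, head_dim=64)
assert query.shape == (TOTAL_TOKENS, NUM_HEADS * 64)    # q is (tokens, num_heads * head_dim)
assert key.shape == (TOTAL_TOKENS, NUM_KV_HEADS * 64)   # k/v use the KV-head count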

tpu_inference/layers/jax/attention_interface.py

Lines changed: 55 additions & 23 deletions
@@ -14,6 +14,7 @@
 from jax.sharding import PartitionSpec as P
 
 import tpu_inference.kernels.ragged_paged_attention.v3.kernel as rpa
+import tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 as rpa_hd64
 from tpu_inference.kernels.flash_attention.kernel import flash_attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
 from tpu_inference.layers.jax.sharding import ShardingAxisName
@@ -26,6 +27,9 @@
 ragged_paged_attention = rpa.ragged_paged_attention
 get_kv_cache_shape = rpa.get_kv_cache_shape
 
+ragged_paged_attention_hd64 = rpa_hd64.ragged_paged_attention_hd64
+get_kv_cache_shape_hd64 = rpa_hd64.get_kv_cache_shape
+
 
 def sharded_flash_attention(
     mesh: Mesh,
@@ -268,17 +272,27 @@ def sharded_splash_attention(
 
 
 def sharded_ragged_paged_attention(
-    sm_scale: float,
     mesh: Mesh,
+    q: jax.Array,
+    k: jax.Array,
+    v: jax.Array,
+    kv_cache: jax.Array,
+    kv_lens: jax.Array,
+    page_indices: jax.Array,
+    cu_q_lens: jax.Array,
+    distribution: jax.Array,
+    attention_sink: jax.Array | None,
+    sm_scale: float,
     attention_chunk_size: int | None = None,
     q_scale: float | None = None,
    k_scale: float | None = None,
     v_scale: float | None = None,
 ):
     """Shards along KV heads."""
 
-    qkv_spec = P(ShardingAxisName.ATTN_DATA, "model", None)
-    kv_cache_spec = P(ShardingAxisName.ATTN_DATA, None, "model")
+    qkv_spec = P(ShardingAxisName.ATTN_DATA, ShardingAxisName.ATTN_HEAD, None)
+    kv_cache_spec = P(ShardingAxisName.ATTN_DATA, None,
+                      ShardingAxisName.ATTN_HEAD, None, None)
     in_specs = (
         qkv_spec,  # q
         qkv_spec,  # k
@@ -291,8 +305,21 @@ def sharded_ragged_paged_attention(
     )
     out_specs = (qkv_spec, kv_cache_spec)
 
+    args = (q, k, v, kv_cache, kv_lens, page_indices, cu_q_lens, distribution)
+
+    use_hd64 = q.shape[-1] == 64
+    func = ragged_paged_attention_hd64 if use_hd64 else ragged_paged_attention
+
+    if attention_sink is not None:
+        if not use_hd64:
+            raise NotImplementedError(
+                "Attention sink support is only available when head_dim==64")
+
+        in_specs += (P(ShardingAxisName.ATTN_HEAD), )
+        args += (attention_sink, )
+
     def _ragged_paged_attention(*args):
-        return ragged_paged_attention(
+        return func(
             *args,
             sm_scale=sm_scale,
             sliding_window=attention_chunk_size,
@@ -301,21 +328,21 @@ def _ragged_paged_attention(*args):
             v_scale=v_scale,
         )
 
-    return jax.jit(
-        shard_map.shard_map(
-            _ragged_paged_attention,
-            mesh=mesh,
-            in_specs=in_specs,
-            out_specs=out_specs,
-            check_rep=False,
-        ))
+    return shard_map.shard_map(
+        _ragged_paged_attention,
+        mesh=mesh,
+        in_specs=in_specs,
+        out_specs=out_specs,
+        check_rep=False,
+    )(*args)
 
 
 def attention(
     kv_cache: jax.Array,
     q: jax.Array,
     k: jax.Array,
     v: jax.Array,
+    sinks: jax.Array | None,
     attention_metadata: AttentionMetadata,
     mesh: Mesh,
     head_dim_original: int | None = None,  # before padding,
@@ -343,16 +370,21 @@ def attention(
 
     # (T, N, H)
     output, kv_cache = sharded_ragged_paged_attention(
-        head_dim_original**-0.5, mesh, attention_chunk_size, q_scale, k_scale,
-        v_scale)(
-            q,
-            k,
-            v,
-            kv_cache,
-            md.seq_lens,
-            md.block_tables,
-            md.query_start_loc,
-            md.request_distribution,
-        )
+        mesh,
+        q,
+        k,
+        v,
+        kv_cache,
+        md.seq_lens,
+        md.block_tables,
+        md.query_start_loc,
+        md.request_distribution,
+        sinks,
+        sm_scale=head_dim_original**-0.5,
+        attention_chunk_size=attention_chunk_size,
+        q_scale=q_scale,
+        k_scale=k_scale,
+        v_scale=v_scale,
+    )
 
     return kv_cache, output
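The new in_spec P(ShardingAxisName.ATTN_HEAD) partitions the (num_heads,) sink vector across the same mesh axis as the query/KV heads, so each shard's kernel invocation only sees the sinks for its local heads. A toy shard_map sketch of that behaviour (a standalone illustration with a hypothetical "model" axis, not code from this change):

import functools
import numpy as np
import jax
import jax.numpy as jnp
from jax.experimental import shard_map
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()), axis_names=("model", ))
num_heads = 8 * len(jax.devices())  # assume the head count divides the mesh size
sinks = jnp.arange(num_heads, dtype=jnp.float32)

@functools.partial(shard_map.shard_map,
                   mesh=mesh,
                   in_specs=(P("model"), ),
                   out_specs=P("model"))
def per_shard(local_sinks):
    # Each shard receives only its slice, of shape (num_heads // num_devices,).
    return local_sinks * 2.0

print(per_shard(sinks).shape)  # (num_heads,) after the outputs are reassembled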

tpu_inference/layers/vllm/attention.py

Lines changed: 24 additions & 13 deletions
@@ -4,9 +4,11 @@
 from typing import Optional, Tuple
 
 import jax
+import jax.numpy as jnp
 import torch
 from jax.sharding import Mesh
 from torchax.interop import jax_view, torch_view
+from torchax.ops.mappings import t2j
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 
@@ -39,18 +41,14 @@ def __init__(
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        use_irope: bool = False,
+        logits_soft_cap: float | None = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: str | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
-        if use_irope:
-            logger.warning_once(
-                "Using irope in Pallas is not supported yet, it will fall back "
-                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -73,6 +71,14 @@ def __init__(
                 "are not implemented for "
                 "PallasAttentionBackendImpl")
 
+        #TODO (kyuyeunk): Shard the sinks along head axis.
+        self.sinks = sinks
+        if self.sinks is not None:
+            self.sinks = t2j(self.sinks, use_dlpack=False).astype(jnp.float32)
+            assert self.sinks.shape[0] == num_heads, (
+                "Sinks must have the same number of heads as the number of "
+                "heads in the layer")
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -115,9 +121,12 @@ def forward(
         k_scale = layer._k_scale_float
         v_scale = layer._v_scale_float
 
+        sinks = None if self.sinks is None else jax_view(self.sinks)
+
         new_kv_cache, outputs = _jax_attn_func(kv_cache, query, key, value,
-                                               attn_metadata, mesh, self.scale,
-                                               self.head_size, self.num_heads,
+                                               sinks, attn_metadata, mesh,
+                                               self.scale, self.head_size,
+                                               self.num_heads,
                                                self.num_kv_heads, q_scale,
                                                k_scale, v_scale)
         vllm_model_wrapper_context.kv_caches[kv_cache_index] = new_kv_cache
@@ -128,7 +137,7 @@ def forward(
 @functools.partial(
     jax.jit,
     static_argnums=(
-        5, 6, 7, 8, 9, 10, 11, 12
+        6, 7, 8, 9, 10, 11, 12, 13
     ),  # mesh, scale, head_size, num_heads, num_kv_heads, q_scale, k_scale, v_scale
     donate_argnums=(0, ),  # donate kv_cache
 )
@@ -137,6 +146,7 @@ def _jax_attn_func(
     q: jax.Array,
     k: jax.Array,
     v: jax.Array,
+    sinks: jax.Array | None,
     attention_metadata: AttentionMetadata,
     mesh: Mesh,
     scale: float,
@@ -168,6 +178,7 @@ def _jax_attn_func(
     q,
     k,
     v,
+    sinks,
     attention_metadata,
     mesh,
     q_scale=q_scale,
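The static_argnums change follows mechanically from the new signature: sinks is inserted as a traced (non-static) positional argument ahead of mesh, so every static index shifts up by one (5..12 becomes 6..13). A small standalone illustration of the indexing rule, with hypothetical argument names:

import functools
import jax
import jax.numpy as jnp

# After inserting `sinks` before `scale`, the static argument sits at index 3, not 2.
@functools.partial(jax.jit, static_argnums=(3, ))
def scaled_dot(q, k, sinks, scale):
    # `sinks` is traced like q and k; only `scale` (index 3) is treated as static.
    logits = (q @ k.T) * scale
    return logits if sinks is None else logits + sinks[:, None]

print(scaled_dot(jnp.ones((2, 4)), jnp.ones((3, 4)), None, 2.0).shape)           # (2, 3)
print(scaled_dot(jnp.ones((2, 4)), jnp.ones((3, 4)), jnp.zeros(2), 2.0).shape)   # (2, 3)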
