vllm-project
diff --git a/vllm_gaudi/extension/features.py b/vllm_gaudi/extension/features.py
Lines changed: 4 additions & 0 deletions
diff --git a/vllm_gaudi/extension/unified.py b/vllm_gaudi/extension/unified.py
Lines changed: 0 additions & 247 deletions
@@ -93,5 +93,9 @@ def get_features():
         Value('unified_attn', False),
         Value('scale_adjustment', True, env_var='VLLM_SCALE_ADJUSTMENT', env_var_type=boolean),
         Value('flatten_input', Any(ModelType('qwen3_moe'), ModelType('granitemoe'), ModelType('glm4_moe'))),
+        Value('unified_attn_shared_cache_ratio',
+              1.,
+              env_var='VLLM_UNIFIED_ATTENTION_SHARED_CACHE_RATIO',
+              env_var_type=float),
     ]
     return split_values_and_flags(features)
@@ -269,250 +269,3 @@ def unified_attn(query: torch.tensor, key: torch.tensor, value: torch.tensor, ke
     if attn is None:
         return query
     return attn
-
-
-def to_hpu(data: Optional[Union[torch.tensor, list]], dtype: Optional[torch.dtype] = None) -> torch.tensor:
-    """Copy either data or a cpu tensor to hpu"""
-    if data is None:
-        return None
-    if torch.is_tensor(data):
-        return data.to('hpu', non_blocking=True)
-    else:
-        return to_hpu(torch.tensor(data, dtype=dtype, device='cpu'))
-
-
-def mask_to_bias(mask: torch.tensor, dtype: torch.dtype) -> torch.tensor:
-    """Convert attn mask to attn bias"""
-    return torch.zeros_like(mask, dtype=dtype).masked_fill_(mask, -math.inf)
-
-
-def create_causal_bias(groups: torch.tensor, positions: torch.tensor, dtype: torch.dtype) -> torch.tensor:
-    """Create causal bias from groups and positions"""
-    group_mask = groups.unsqueeze(-1) != groups.unsqueeze(0)
-    position_mask = positions.unsqueeze(-1) < positions.unsqueeze(0)
-    causal_mask = (group_mask | position_mask)
-    return mask_to_bias(causal_mask, dtype)
-
-
-def indices_and_offsets(counts: torch.tensor) -> tuple[torch.tensor, torch.tensor]:
-    """Split groups of sizes 'counts' into individual indices and offsets. Example:
-       counts([1, 2, 3]) -> group_indices=[0, 1, 1, 2, 2, 2] group_offsets=[0, 0, 1, 0, 1, 2]"""
-    cum_end = torch.cumsum(counts, dim=0, dtype=counts.dtype)
-    cum_start = cum_end - counts
-    total = cum_end[-1] + 1
-    indices = torch.zeros(total, dtype=counts.dtype, device=counts.device)
-    indices.scatter_add_(0, cum_end[:-1].to(torch.int64), torch.ones_like(cum_end[:-1]))
-    indices = torch.cumsum(indices, dim=0)
-    offsets = torch.arange(total, dtype=counts.dtype, device=counts.device) - cum_start.index_select(0, indices)
-    return indices[:-1], offsets[:-1]
-
-
-def fetch_2d(table: torch.tensor, indices: torch.tensor, offsets: torch.tensor) -> torch.tensor:
-    """Fetch data from a 2d table using indices and offsets"""
-    assert table.dim() == 2, 'Only 2D tables are supported!'
-    flat_indices = indices * table.size(-1) + offsets
-    return table.flatten().index_select(0, flat_indices)
-
-
-def group_sum(groups: torch.tensor, values: torch.tensor):
-    """ Sum values coresponding to the same groups """
-    max_value = groups.amax().item()
-    tmp = torch.zeros((max_value + 1, ), dtype=values.dtype, device=values.device)
-    tmp.scatter_add_(0, groups.to(torch.int64), values)
-    return tmp.index_select(0, groups)
-
-
-def generate_bias(block_usages: torch.tensor, block_size: torch.tensor, dtype: torch.dtype) -> torch.tensor:
-    """ Generate block bias based on block_usage """
-    block_len_range = torch.arange(1, block_size + 1, dtype=block_usages.dtype, device=block_usages.device)
-    block_mask = block_len_range.unsqueeze(0) > block_usages.unsqueeze(-1)
-    return mask_to_bias(block_mask, dtype=dtype)
-
-
-@dataclass
-class UnifiedBatch:
-    req_ids_cpu: list[str]
-    token_ids: torch.tensor
-    token_positions: torch.tensor
-    new_token_positions_cpu: torch.tensor
-    logits_indices: torch.tensor
-    logits_groups_cpu: torch.tensor
-    attn_metadata: HPUUnifiedAttentionMetadata
-
-
-@dataclass
-class Context:
-    """ Contains relevant information for computing past context either from shared or unique blocks"""
-    group_ids: torch.tensor
-    group_offsets: torch.tensor
-    block_ids: torch.tensor
-    block_usages: torch.tensor
-
-    @staticmethod
-    def create(total_tokens: torch.tensor, block_table: torch.tensor, block_size: int) -> 'Context':
-        """ Create a new Context obj """
-        num_ctx_blocks = (total_tokens + block_size - 1) // block_size
-        if num_ctx_blocks.sum() <= 0:
-            return None
-
-        group_ids, group_offsets = indices_and_offsets(num_ctx_blocks)
-        block_ids = fetch_2d(block_table, group_ids, group_offsets)
-        #NOTE(kzawora): Originally, we were clamping
-        # total_tokens.index_select(0, group_ids) - group_offsets * block_size + 1
-        # I'm not sure why +1 was there originally, but in non-block-aligned prefix-prefill scenarios
-        # it made causal mask not cover the first unused token.
-        # (e.g. with context 28, the 28th slot was unmasked, causing the effective context length to be 29)
-        block_usages = torch.clamp(total_tokens.index_select(0, group_ids) - group_offsets * block_size, 1, block_size)
-
-        ctx = Context(group_ids, group_offsets, block_ids, block_usages)
-        all_shapes = [v.shape for v in ctx._values() if torch.is_tensor(v)]
-        for t in all_shapes[1:]:
-            assert all_shapes[0] == t
-        return ctx
-
-    def _values(self) -> tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
-        """ Split Context into individual values """
-        return (self.group_ids, self.group_offsets, self.block_ids, self.block_usages)
-
-    def index_select(self, indices: torch.tensor) -> 'Context':
-        """ Create a new Context from only specified indices """
-        if indices.size(0) <= 0:
-            return None
-        values = [v.index_select(0, indices) for v in self._values()]
-        return Context(*values)
-
-    def split(self, num_scheduled_tokens: torch.tensor) -> tuple['Context', 'Context']:
-        """ Split a Context into a shared block Context and unique block Context"""
-        num_tokens = num_scheduled_tokens.index_select(0, self.group_ids)
-        block_tokens = group_sum(self.block_ids, num_tokens)
-        shared_idx = torch.argwhere(block_tokens > 1).flatten()
-        unique_idx = torch.argwhere(block_tokens == 1).flatten()
-        assert shared_idx.size(0) + unique_idx.size(0) == self.group_ids.size(0)
-        return self.index_select(shared_idx), self.index_select(unique_idx)
-
-
-def hpu_tensor(tensor: torch.tensor, shape: tuple, pad_value: Union[int, float]) -> torch.tensor:
-    """ Pad if necessary and move tensor to HPU"""
-    if tensor is None:
-        return None
-    assert len(tensor.shape) == len(shape)
-    orig_shape = tensor.shape
-    padding = tuple(itertools.chain(*[(0, target - cur) for cur, target in reversed(list(zip(tensor.shape, shape)))]))
-    assert all(p >= 0 for p in padding)
-    if sum(padding) > 0:
-        tensor = torch.nn.functional.pad(tensor, padding, value=pad_value)
-    return to_hpu(tensor)
-
-
-def create_unified_batch(req_ids: list[str], all_token_ids: torch.tensor, num_computed_tokens: torch.tensor,
-                         num_scheduled_tokens: torch.tensor, num_prompt_tokens: torch.tensor, block_table: torch.tensor,
-                         block_size: int, dtype: torch.dtype, bucketing_fn: Callable[[bool, int, int, int, int],
-                                                                                     tuple[int, int, int, int]],
-                         get_dp_padding_fn: Callable[[int], int]) -> UnifiedBatch:
-    """ Calculate all necessary tensors needed for batch scheduling """
-    total_tokens = num_computed_tokens + num_scheduled_tokens
-    query_len = num_scheduled_tokens.sum().item()
-    is_prompt = total_tokens <= num_prompt_tokens
-    cached_tokens = num_computed_tokens + torch.where(is_prompt, 0, num_scheduled_tokens)
-    contains_prompts = torch.any(is_prompt).item()
-    num_output_tokens = total_tokens - num_prompt_tokens + 1
-    num_output_tokens = torch.clamp(num_output_tokens, torch.zeros_like(num_scheduled_tokens), num_scheduled_tokens)
-    group_starts = torch.cumsum(num_scheduled_tokens, dim=0) - num_scheduled_tokens
-
-    token_groups, token_offsets = indices_and_offsets(num_scheduled_tokens)
-    token_positions = token_offsets + num_computed_tokens.index_select(0, token_groups)
-    token_ids = fetch_2d(all_token_ids, token_groups, token_positions)
-
-    token_blocks = fetch_2d(block_table, token_groups, token_positions.floor_divide(block_size))
-    token_slots = token_blocks * block_size + token_positions.fmod(block_size)
-
-    logits_groups, logits_offsets = indices_and_offsets(num_output_tokens)
-    start_logits_indices = torch.cumsum(num_scheduled_tokens, dim=0,
-                                        dtype=num_scheduled_tokens.dtype) - num_output_tokens
-    logits_indices = logits_offsets + start_logits_indices.index_select(0, logits_groups)
-    new_token_positions = total_tokens.index_select(0, logits_groups)
-
-    def first_dim(t: Optional[torch.tensor]) -> int:
-        """ Takes first dim size or 0 if tensor is None"""
-        return t.size(0) if t is not None else 0
-
-    causal_bias = None
-    shared_blocks = None
-    shared_bias = None
-    unique_blocks = 0
-    unique_block_mapping = None
-    unique_bias = None
-
-    if contains_prompts:
-        causal_bias = create_causal_bias(token_groups, token_positions, dtype)
-
-    ctx = Context.create(cached_tokens, block_table, block_size)
-    if ctx:
-        shared_ctx, unique_ctx = ctx.split(num_scheduled_tokens)
-        if shared_ctx:
-            shared_blocks, orig_shared_blocks = torch.unique(shared_ctx.block_ids, return_inverse=True)
-
-            shared_group_starts = group_starts.index_select(0, shared_ctx.group_ids)
-
-            shared_tokens = num_scheduled_tokens.index_select(0, shared_ctx.group_ids)
-            shared_token_indices, shared_token_offsets = indices_and_offsets(shared_tokens)
-
-            shared_token_idx = shared_group_starts.index_select(0, shared_token_indices) + shared_token_offsets
-            shared_block_idx = orig_shared_blocks.index_select(0, shared_token_indices)
-            shared_block_usage = shared_ctx.block_usages.index_select(0, shared_token_indices)
-            shared_block_bias = generate_bias(shared_block_usage, block_size, dtype)
-
-            shared_bias = torch.full((query_len, shared_blocks.size(0), block_size),
-                                     -math.inf,
-                                     dtype=dtype,
-                                     device=shared_blocks.device)
-            shared_bias.index_put_((shared_token_idx, shared_block_idx), shared_block_bias)
-
-        if unique_ctx:
-            unique_blocks = torch.amax(unique_ctx.block_ids).item() + 1
-            unique_bias = torch.full((unique_blocks, block_size),
-                                     -math.inf,
-                                     dtype=dtype,
-                                     device=unique_ctx.block_ids.device)
-            unique_block_bias = generate_bias(unique_ctx.block_usages, block_size, dtype)
-            unique_bias.index_copy_(0, unique_ctx.block_ids.to(torch.int64), unique_block_bias)
-            unique_group_starts = group_starts.index_select(0, unique_ctx.group_ids)
-            unique_block_mapping = torch.full((unique_blocks, ),
-                                              -1,
-                                              dtype=torch.int64,
-                                              device=unique_ctx.block_ids.device)
-            unique_block_mapping.index_copy_(0, unique_ctx.block_ids.to(torch.int64), unique_group_starts)
-
-    bucket = bucketing_fn(contains_prompts, first_dim(token_ids), first_dim(shared_blocks), unique_blocks,
-                          first_dim(logits_indices))
-    target_qlen, target_shared_blocks, target_unique_blocks, target_logits = bucket
-
-    target_qlen += get_dp_padding_fn(target_qlen)
-    target_shared_blocks += get_dp_padding_fn(target_shared_blocks)
-    target_unique_blocks += get_dp_padding_fn(target_unique_blocks)
-    target_logits += get_dp_padding_fn(target_logits)
-
-    default_causal_width = 512
-    fmin = torch.finfo(dtype).min
-    feps = torch.finfo(dtype).tiny
-
-    return UnifiedBatch(req_ids_cpu=req_ids,
-                        token_ids=hpu_tensor(token_ids, (target_qlen, ), -1),
-                        token_positions=hpu_tensor(token_positions, (target_qlen, ), -1),
-                        new_token_positions_cpu=new_token_positions,
-                        logits_indices=hpu_tensor(logits_indices, (target_logits, ), -1),
-                        logits_groups_cpu=logits_groups,
-                        attn_metadata=HPUUnifiedAttentionMetadata(
-                            block_size=block_size,
-                            slot_mapping=hpu_tensor(token_slots, (target_qlen, ), -1),
-                            causal_bias=hpu_tensor(causal_bias, (target_qlen, target_qlen), -math.inf),
-                            causal_width=default_causal_width,
-                            shared_blocks=hpu_tensor(shared_blocks, (target_shared_blocks, ), -1),
-                            shared_bias=hpu_tensor(shared_bias, (target_qlen, target_shared_blocks, block_size),
-                                                   -math.inf),
-                            unique_blocks=target_unique_blocks,
-                            unique_block_mapping=hpu_tensor(unique_block_mapping, (target_unique_blocks, ), -1),
-                            unique_bias=hpu_tensor(unique_bias, (target_unique_blocks, block_size), -math.inf),
-                            fmin=to_hpu(fmin),
-                            feps=to_hpu(feps),
-                        ))