Skip to content

Commit 5e11004

Browse files
authored
fix: add a check for int32 indices in sampling.py (#2127)
<!-- .github/pull_request_template.md --> ## 📌 Description New function to validate that the indices type, when provided, is `int32`. To close #2115. There are now two separate functions doing checking in this file. I will move them to the C++ side later when I have some more bandwidth, probably after Thanksgiving. Just a short fix for now. You can close if you'd rather wait for that. <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> ## 🔍 Related Issues #2115 <!-- Link any related issues here --> Relevant to the issue. Now running their code: ``` (flashinfer) raayan@uril-1:~/projects/flashinfer$ python test.py tensor([1, 1, 0, 0], device='cuda:0', dtype=torch.int32) Traceback (most recent call last): File "/home/raayan/projects/flashinfer/test.py", line 15, in <module> incorrect_samples = flashinfer.sampling.top_k_top_p_sampling_from_logits( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/raayan/projects/flashinfer/flashinfer/sampling.py", line 1031, in top_k_top_p_sampling_from_logits _check_indices_dtype(indices) File "/home/raayan/projects/flashinfer/flashinfer/sampling.py", line 487, in _check_indices_dtype raise ValueError(f"indices must have dtype torch.int32, got {indices.dtype}") ValueError: indices must have dtype torch.int32, got torch.int64 ``` ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). 
## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Improvements** * Enforced that indices passed to sampling operations must use int32, adding runtime validation before sampling. * **Documentation** * Clarified docstrings to state the int32 requirement for indices parameters. * **Tests** * Updated and expanded tests to cover the new dtype validation paths and related error cases. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Raayan Dhar <raayan.dhar@gmail.com>
1 parent 5acb57b commit 5e11004

File tree

2 files changed

+38
-14
lines changed

2 files changed

+38
-14
lines changed

flashinfer/sampling.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,12 @@ def _to_tensor_scalar_tuple(x):
481481
return (None, x)
482482

483483

484+
def _check_indices_dtype(indices: Optional[torch.Tensor]) -> None:
485+
"""Validate indices dtype."""
486+
if indices is not None and indices.dtype != torch.int32:
487+
raise ValueError(f"indices must have dtype torch.int32, got {indices.dtype}")
488+
489+
484490
def _check_tensor_param(param: Any, tensor: torch.Tensor) -> None:
485491
"""Validate sampling parameters."""
486492
if isinstance(param, torch.Tensor):
@@ -576,7 +582,7 @@ def sampling_from_logits(
576582
shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
577583
probability distributions.
578584
indices: Optional[torch.Tensor]
579-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in logits.
585+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in logits.
580586
For example, if indices[i] = j, then the i-th output will be sampled from logits[j].
581587
This allows reusing the same probability distribution for multiple outputs.
582588
If indices is not provided, the i-th output will be sampled from the i-th row of logits.
@@ -612,6 +618,7 @@ def sampling_from_logits(
612618
if check_nan:
613619
if torch.any(torch.isnan(logits)):
614620
raise ValueError("Input logits contains NaN.")
621+
_check_indices_dtype(indices)
615622
return get_sampling_module().sampling_from_logits(
616623
logits, indices, deterministic, generator
617624
)
@@ -634,7 +641,7 @@ def sampling_from_probs(
634641
shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
635642
probability distributions.
636643
indices: Optional[torch.Tensor]
637-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
644+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in probs.
638645
For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
639646
This allows reusing the same probability distribution for multiple outputs.
640647
If indices is not provided, the i-th output will be sampled from the i-th row of probs.
@@ -676,6 +683,7 @@ def sampling_from_probs(
676683
if check_nan:
677684
if torch.any(torch.isnan(probs)):
678685
raise ValueError("Input probs contains NaN.")
686+
_check_indices_dtype(indices)
679687
return get_sampling_module().sampling_from_probs(
680688
probs, indices, deterministic, generator
681689
)
@@ -708,7 +716,7 @@ def top_p_sampling_from_probs(
708716
If a float, the same threshold is used for all requests.
709717
If a tensor, each request has its own threshold.
710718
indices: Optional[torch.Tensor]
711-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
719+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in probs.
712720
For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
713721
This allows reusing the same probability distribution for multiple outputs.
714722
If indices is not provided, the i-th output will be sampled from the i-th row of probs.
@@ -758,6 +766,7 @@ def top_p_sampling_from_probs(
758766
if check_nan:
759767
if torch.any(torch.isnan(probs)):
760768
raise ValueError("Input probs contains NaN.")
769+
_check_indices_dtype(indices)
761770
_check_tensor_param(top_p, probs)
762771
return get_sampling_module().top_p_sampling_from_probs(
763772
probs, indices, *_to_tensor_scalar_tuple(top_p), deterministic, generator
@@ -791,7 +800,7 @@ def top_k_sampling_from_probs(
791800
If a scalar, the same threshold is used for all requests.
792801
If a tensor, each request has its own threshold.
793802
indices: Optional[torch.Tensor]
794-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
803+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in probs.
795804
For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
796805
This allows reusing the same probability distribution for multiple outputs.
797806
If indices is not provided, the i-th output will be sampled from the i-th row of probs.
@@ -841,6 +850,7 @@ def top_k_sampling_from_probs(
841850
if check_nan:
842851
if torch.any(torch.isnan(probs)):
843852
raise ValueError("Input probs contains NaN.")
853+
_check_indices_dtype(indices)
844854
_check_tensor_param(top_k, probs)
845855
return get_sampling_module().top_k_sampling_from_probs(
846856
probs, indices, *_to_tensor_scalar_tuple(top_k), deterministic, generator
@@ -875,7 +885,7 @@ def min_p_sampling_from_probs(
875885
If a scalar, the same threshold is used for all requests.
876886
If a tensor, each request has its own threshold.
877887
indices: Optional[torch.Tensor]
878-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
888+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in probs.
879889
For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
880890
This allows reusing the same probability distribution for multiple outputs.
881891
If indices is not provided, the i-th output will be sampled from the i-th row of probs.
@@ -920,6 +930,7 @@ def min_p_sampling_from_probs(
920930
if check_nan:
921931
if torch.any(torch.isnan(probs)):
922932
raise ValueError("Input probs contains NaN.")
933+
_check_indices_dtype(indices)
923934
_check_tensor_param(min_p, probs)
924935
return get_sampling_module().min_p_sampling_from_probs(
925936
probs, indices, *_to_tensor_scalar_tuple(min_p), deterministic, generator
@@ -960,7 +971,7 @@ def top_k_top_p_sampling_from_logits(
960971
If a scalar, the same threshold is used for all requests.
961972
If a tensor, each request has its own threshold.
962973
indices: Optional[torch.Tensor]
963-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
974+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in probs.
964975
For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
965976
This allows reusing the same probability distribution for multiple outputs.
966977
If indices is not provided, the i-th output will be sampled from the i-th row of probs.
@@ -1018,6 +1029,7 @@ def top_k_top_p_sampling_from_logits(
10181029
top_k_mask_logits
10191030
top_p_sampling_from_probs
10201031
"""
1032+
_check_indices_dtype(indices)
10211033
_check_tensor_param(top_k, logits)
10221034
_check_tensor_param(top_p, logits)
10231035
if filter_apply_order == "top_k_first":
@@ -1082,7 +1094,7 @@ def top_k_top_p_sampling_from_probs(
10821094
If a scalar, the same threshold is used for all requests.
10831095
If a tensor, each request has its own threshold.
10841096
indices: Optional[torch.Tensor]
1085-
Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
1097+
Optional indices tensor of shape ``(batch_size,)``, dtype ``torch.int32`` that maps each output to a row in probs.
10861098
For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
10871099
This allows reusing the same probability distribution for multiple outputs.
10881100
If indices is not provided, the i-th output will be sampled from the i-th row of probs.
@@ -1135,6 +1147,7 @@ def top_k_top_p_sampling_from_probs(
11351147
top_p_renorm_probs
11361148
top_k_mask_logits
11371149
"""
1150+
_check_indices_dtype(indices)
11381151
_check_tensor_param(top_k, probs)
11391152
_check_tensor_param(top_p, probs)
11401153
if filter_apply_order == "top_k_first":

tests/utils/test_sampling.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ def test_chain_speculative_sampling(
572572
@pytest.mark.parametrize("batch_size", [1, 99, 989])
573573
@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
574574
@pytest.mark.parametrize("p", [0.05, 0.1, 0.2, 0.7, 1])
575-
def test_check_tensor_param_min_p(batch_size, vocab_size, p):
575+
def test_tensor_validation_min_p(batch_size, vocab_size, p):
576576
pre_norm_prob = torch.rand(batch_size, vocab_size, device="cuda:0")
577577
normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
578578

@@ -587,7 +587,7 @@ def test_check_tensor_param_min_p(batch_size, vocab_size, p):
587587
flashinfer.sampling.min_p_sampling_from_probs(
588588
normalized_prob,
589589
torch.tensor(
590-
[[p] * vocab_size] * batch_size, dtype=torch.int, device="cuda:0"
590+
[[p] * vocab_size] * batch_size, dtype=torch.float32, device="cuda:0"
591591
),
592592
)
593593

@@ -597,22 +597,33 @@ def test_check_tensor_param_min_p(batch_size, vocab_size, p):
597597
match=r"Expected a 1D tensor of shape \(batch_size,\) or scalar.*got a 0-dimensional tensor",
598598
):
599599
flashinfer.sampling.min_p_sampling_from_probs(
600-
normalized_prob, torch.tensor(p, dtype=torch.int, device="cuda:0")
600+
normalized_prob, torch.tensor(p, dtype=torch.float32, device="cuda:0")
601601
)
602602

603-
# 4: 1D tensor with a broken batch size raises error (only when batch_size > 1).
603+
# 4: non-int32 indices raises error.
604+
with pytest.raises(
605+
ValueError,
606+
match=r"indices must have dtype torch\.int32, got torch\.int64",
607+
):
608+
flashinfer.sampling.min_p_sampling_from_probs(
609+
normalized_prob,
610+
torch.tensor([p] * batch_size, dtype=torch.float32, device="cuda:0"),
611+
torch.tensor([p] * batch_size, dtype=torch.int64, device="cuda:0"),
612+
)
613+
614+
# 5: 1D tensor with a broken batch size raises error (only when batch_size > 1).
604615
if batch_size > 1:
605616
with pytest.raises(
606617
ValueError, match="Sampling parameter tensor batch size mismatch"
607618
):
608619
flashinfer.sampling.min_p_sampling_from_probs(
609-
normalized_prob, torch.tensor([p], dtype=torch.int, device="cuda:0")
620+
normalized_prob, torch.tensor([p], dtype=torch.float32, device="cuda:0")
610621
)
611622

612-
# 5: 1D tensor with the correct batch size works.
623+
# 6: 1D tensor with the correct batch size works.
613624
samples = flashinfer.sampling.min_p_sampling_from_probs(
614625
normalized_prob,
615-
torch.tensor([p] * batch_size, dtype=torch.int, device="cuda:0"),
626+
torch.tensor([p] * batch_size, dtype=torch.float32, device="cuda:0"),
616627
)
617628
assert samples.shape == (batch_size,)
618629

0 commit comments

Comments
 (0)