@@ -71,6 +71,8 @@ class ParallelConfig:
7171 """Number of pipeline parallel groups."""
7272 tensor_parallel_size : int = 1
7373 """Number of tensor parallel groups."""
74+ prefill_context_parallel_size : int = 1
75+ """Number of prefill context parallel groups."""
7476 data_parallel_size : int = 1
7577 """Number of data parallel groups. MoE layers will be sharded according to
7678 the product of the tensor parallel size and data parallel size."""
@@ -239,14 +241,25 @@ class is dynamically inherited by the worker class. This is used to inject
239241 needs to be divisible by dcp_size."""
240242
241243 dcp_kv_cache_interleave_size : int = 1
242- """Interleave size of kv_cache storage while using dcp or cp > 1,
243- store interleave_size tokens on (d)cp i,
244- then store next interleave_size tokens on (d)cp i+1.
245- Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
246- Interleave_size=block_size: block-level align, first fill the block on first rank,
247- token is stored on rank i+1 block j after rank i block j is full.
248- Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
249- Block_size should be divisible by dcp_kv_cache_interleave_size.
244+ """
245+ Interleave size of kv_cache storage while using DCP.
246+ dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size
247+ and will be removed once PCP is fully supported.
248+
249+ """
250+ cp_kv_cache_interleave_size : int = 1
251+ """Interleave size of kv_cache storage while using DCP or PCP.
252+ For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
253+ and `total_cp_world_size = pcp_world_size * dcp_world_size`.
254+ store interleave_size tokens on total_cp_rank i,
255+ then store next interleave_size tokens on total_cp_rank i+1.
256+ Interleave_size=1: token-level alignment, where token `i` is stored on
257+ total_cp_rank `i % total_cp_world_size`.
258+ Interleave_size=block_size: block-level alignment, where tokens are
259+ first populated to the preceding ranks. Tokens are then stored
260+ in (rank i+1, block j) only after (rank i, block j) is fully occupied.
261+ Block_size should be greater than or equal to cp_kv_cache_interleave_size.
262+ Block_size should be divisible by cp_kv_cache_interleave_size.
250263 """
251264
252265 _api_process_count : int = Field (default = 1 , gt = 0 )
@@ -311,6 +324,11 @@ def _validate_parallel_config(self) -> Self:
311324 "num_redundant_experts."
312325 )
313326
327+ if self .prefill_context_parallel_size > 1 :
328+ raise ValueError (
329+ "Prefill context parallelism is not fully supported. "
330+ "Please set prefill_context_parallel_size to 1."
331+ )
314332 return self
315333
316334 @property
@@ -529,7 +547,11 @@ def __post_init__(self) -> None:
529547 )
530548
531549 # Continue with the rest of the initialization
532- self .world_size = self .pipeline_parallel_size * self .tensor_parallel_size
550+ self .world_size = (
551+ self .pipeline_parallel_size
552+ * self .tensor_parallel_size
553+ * self .prefill_context_parallel_size
554+ )
533555
534556 if self .distributed_executor_backend == "external_launcher" :
535557 logger .info ("Using external launcher for distributed inference." )
0 commit comments