huggingface
diff --git a/timm/layers/__init__.py b/timm/layers/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/timm/layers/drop.py b/timm/layers/drop.py
Lines changed: 43 additions & 0 deletions
diff --git a/timm/models/beit.py b/timm/models/beit.py
Lines changed: 2 additions & 2 deletions
diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py
Lines changed: 2 additions & 2 deletions
diff --git a/timm/models/coat.py b/timm/models/coat.py
Lines changed: 0 additions & 2 deletions
diff --git a/timm/models/convit.py b/timm/models/convit.py
Lines changed: 2 additions & 2 deletions
diff --git a/timm/models/convnext.py b/timm/models/convnext.py
Lines changed: 2 additions & 2 deletions
diff --git a/timm/models/crossvit.py b/timm/models/crossvit.py
Lines changed: 2 additions & 2 deletions
diff --git a/timm/models/cspnet.py b/timm/models/cspnet.py
Lines changed: 2 additions & 2 deletions
diff --git a/timm/models/davit.py b/timm/models/davit.py
Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@
 from .create_conv2d import create_conv2d
 from .create_norm import get_norm_layer, create_norm_layer
 from .create_norm_act import get_norm_act_layer, create_norm_act_layer, get_norm_act_layer
-from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
+from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path, calculate_drop_path_rates
 from .eca import EcaModule, CecaModule, EfficientChannelAttn, CircularEfficientChannelAttn
 from .evo_norm import (
     EvoNorm2dB0,
 
@@ -14,6 +14,8 @@
 
 Hacked together by / Copyright 2020 Ross Wightman
 """
+from typing import List, Union
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -180,3 +182,44 @@ def forward(self, x):
 
     def extra_repr(self):
         return f'drop_prob={round(self.drop_prob,3):0.3f}'
+
+
+def calculate_drop_path_rates(
+        drop_path_rate: float,
+        depths: Union[int, List[int]],
+        stagewise: bool = False,
+) -> Union[List[float], List[List[float]]]:
+    """Generate drop path rates for stochastic depth.
+
+    This function handles two common patterns for drop path rate scheduling:
+    1. Per-block: Linear increase from 0 to drop_path_rate across all blocks
+    2. Stage-wise: The same per-block linear increase, split into one list of rates per stage
+
+    Args:
+        drop_path_rate: Maximum drop path rate (at the end).
+        depths: Either a single int for total depth (per-block mode only) or a
+                list of ints for depths per stage (either mode, see stagewise).
+        stagewise: If True, use stage-wise pattern. If False (the default), use per-block pattern.
+                   stagewise=True requires depths to be a list, otherwise ValueError is raised.
+
+    Returns:
+        For per-block mode: List of drop rates, one per block.
+        For stage-wise mode: List of lists, one inner list per stage with that stage's per-block rates.
+    """
+    if isinstance(depths, int):
+        # Single depth value - per-block pattern
+        if stagewise:
+            raise ValueError("stagewise=True requires depths to be a list of stage depths")
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depths, device='cpu')]
+        return dpr
+    else:
+        # List of depths - can be either pattern
+        total_depth = sum(depths)
+        if stagewise:
+            # Stage-wise pattern: per-block linear ramp, chunked into one list per stage
+            dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, total_depth, device='cpu').split(depths)]
+            return dpr
+        else:
+            # Per-block pattern across all stages
+            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, total_depth, device='cpu')]
+            return dpr
@@ -46,7 +46,7 @@
 import torch.nn.functional as F
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import PatchEmbed, Mlp, SwiGLU, LayerNorm, DropPath, trunc_normal_, use_fused_attn
+from timm.layers import PatchEmbed, Mlp, SwiGLU, LayerNorm, DropPath, calculate_drop_path_rates, trunc_normal_, use_fused_attn
 from timm.layers import resample_patch_embed, resample_abs_pos_embed, resize_rel_pos_bias_table, ndgrid
 
 from ._builder import build_model_with_cfg
@@ -448,7 +448,7 @@ def __init__(
         else:
             self.rel_pos_bias = None
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = calculate_drop_path_rates(drop_path_rate, depth)  # stochastic depth decay rule
         self.blocks = nn.ModuleList([
             Block(
                 dim=embed_dim,
 
@@ -39,7 +39,7 @@
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from timm.layers import (
     ClassifierHead, NormMlpClassifierHead, ConvNormAct, BatchNormAct2d, EvoNorm2dS0a,
-    AttentionPool2d, RotAttentionPool2d, DropPath, AvgPool2dSame,
+    AttentionPool2d, RotAttentionPool2d, DropPath, calculate_drop_path_rates, AvgPool2dSame,
     create_conv2d, get_act_layer, get_norm_act_layer, get_attn, make_divisible, to_2tuple,
 )
 from ._builder import build_model_with_cfg
@@ -1212,7 +1212,7 @@ def create_byob_stages(
     feature_info = []
     block_cfgs = [expand_blocks_cfg(s) for s in cfg.blocks]
     depths = [sum([bc.d for bc in stage_bcs]) for stage_bcs in block_cfgs]
-    dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+    dpr = calculate_drop_path_rates(drop_path_rate, depths, stagewise=True)
     dilation = 1
     net_stride = stem_feat['reduction']
     prev_chs = stem_feat['num_chs']
 
@@ -417,9 +417,7 @@ def __init__(
         self.crpe3 = ConvRelPosEnc(head_chs=embed_dims[2] // num_heads, num_heads=num_heads, window=crpe_window)
         self.crpe4 = ConvRelPosEnc(head_chs=embed_dims[3] // num_heads, num_heads=num_heads, window=crpe_window)
 
-        # Disable stochastic depth.
         dpr = drop_path_rate
-        assert dpr == 0.0
         skwargs = dict(
             num_heads=num_heads,
             qkv_bias=qkv_bias,
 
@@ -27,7 +27,7 @@
 import torch.nn as nn
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import DropPath, trunc_normal_, PatchEmbed, Mlp, LayerNorm, HybridEmbed
+from timm.layers import DropPath, calculate_drop_path_rates, trunc_normal_, PatchEmbed, Mlp, LayerNorm, HybridEmbed
 from ._builder import build_model_with_cfg
 from ._features_fx import register_notrace_module
 from ._registry import register_model, generate_default_cfgs
@@ -292,7 +292,7 @@ def __init__(
             self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
             trunc_normal_(self.pos_embed, std=.02)
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = calculate_drop_path_rates(drop_path_rate, depth)  # stochastic depth decay rule
         self.blocks = nn.ModuleList([
             Block(
                 dim=embed_dim,
 
@@ -44,7 +44,7 @@
 import torch.nn as nn
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, Mlp, GlobalResponseNormMlp, \
+from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, calculate_drop_path_rates, Mlp, GlobalResponseNormMlp, \
     LayerNorm2d, LayerNorm, RmsNorm2d, RmsNorm, create_conv2d, get_act_layer, get_norm_layer, make_divisible, to_ntuple
 from timm.layers import SimpleNorm2d, SimpleNorm
 from timm.layers import NormMlpClassifierHead, ClassifierHead
@@ -377,7 +377,7 @@ def __init__(
             stem_stride = 4
 
         self.stages = nn.Sequential()
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = calculate_drop_path_rates(drop_path_rate, depths, stagewise=True)
         stages = []
         prev_chs = dims[0]
         curr_stride = stem_stride
 
@@ -27,7 +27,7 @@
 import torch.nn as nn
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import DropPath, to_2tuple, trunc_normal_, _assert
+from timm.layers import DropPath, calculate_drop_path_rates, to_2tuple, trunc_normal_, _assert
 from ._builder import build_model_with_cfg
 from ._features_fx import register_notrace_function
 from ._registry import register_model, generate_default_cfgs
@@ -346,7 +346,7 @@ def __init__(
         self.pos_drop = nn.Dropout(p=pos_drop_rate)
 
         total_depth = sum([sum(x[-2:]) for x in depth])
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, total_depth)]  # stochastic depth decay rule
+        dpr = calculate_drop_path_rates(drop_path_rate, total_depth)  # stochastic depth decay rule
         dpr_ptr = 0
         self.blocks = nn.ModuleList()
         for idx, block_cfg in enumerate(depth):
 
@@ -20,7 +20,7 @@
 import torch.nn as nn
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import ClassifierHead, ConvNormAct, DropPath, get_attn, create_act_layer, make_divisible
+from timm.layers import ClassifierHead, ConvNormAct, DropPath, calculate_drop_path_rates, get_attn, create_act_layer, make_divisible
 from ._builder import build_model_with_cfg
 from ._manipulate import named_apply, MATCH_PREV_GROUP
 from ._registry import register_model, generate_default_cfgs
@@ -569,7 +569,7 @@ def create_csp_stages(
     cfg_dict = asdict(cfg.stages)
     num_stages = len(cfg.stages.depth)
     cfg_dict['block_dpr'] = [None] * num_stages if not drop_path_rate else \
-        [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.stages.depth)).split(cfg.stages.depth)]
+        calculate_drop_path_rates(drop_path_rate, cfg.stages.depth, stagewise=True)
     stage_args = [dict(zip(cfg_dict.keys(), values)) for values in zip(*cfg_dict.values())]
     block_kwargs = dict(
         act_layer=cfg.act_layer,
 
@@ -20,7 +20,7 @@
 from torch import Tensor
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import DropPath, to_2tuple, trunc_normal_, Mlp, LayerNorm2d, get_norm_layer, use_fused_attn
+from timm.layers import DropPath, calculate_drop_path_rates, to_2tuple, trunc_normal_, Mlp, LayerNorm2d, get_norm_layer, use_fused_attn
 from timm.layers import NormMlpClassifierHead, ClassifierHead
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
@@ -555,7 +555,7 @@ def __init__(
         self.stem = Stem(in_chans, embed_dims[0], norm_layer=norm_layer)
         in_chs = embed_dims[0]
 
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = calculate_drop_path_rates(drop_path_rate, depths, stagewise=True)
         stages = []
         for i in range(num_stages):
             out_chs = embed_dims[i]