Ema momentum (#2333)

Yawei Li · vfdev-5 · Ishan-Kumar2 · commit 8ed6c294fc33 · 2021-12-27T01:20:33.000+05:30
* deprecate warmup function in ema_handler.py

Signed-off-by: sandylaker <yawei.li@tum.de>

* keep the previous API but throw warnings

Signed-off-by: sandylaker <yawei.li@tum.de>

* revert changes to state_param_scheduler.py and test_state_param_scheduler.py

Signed-off-by: sandylaker <yawei.li@tum.de>

* use `LambdaStateScheduler` to schedule EMA momentum

* fix docs

* fix mypy

* Apply suggestions from code review

Co-authored-by: vfdev <vfdev.5@gmail.com>
diff --git a/ignite/handlers/ema_handler.py b/ignite/handlers/ema_handler.py
@@ -1,40 +1,53 @@
+import warnings
 from copy import deepcopy
 from typing import Optional, Union
 
 import torch.nn as nn
 
 from ignite.engine import CallableEventWithFilter, Engine, Events, EventsList
+from ignite.handlers.param_scheduler import BaseParamScheduler
+from ignite.handlers.state_param_scheduler import LambdaStateScheduler
 
 __all__ = ["EMAHandler"]
 
 
+class EMAWarmUp:
+    def __init__(self, momentum_warmup: float, warmup_iters: int, momentum: float) -> None:
+        self.momentum_warmup = momentum_warmup
+        self.warmup_iters = warmup_iters
+        self.momentum = momentum
+
+    def __call__(self, event_index: int) -> float:
+        denominator = max(1, self.warmup_iters - 1)
+        curr_momentum = self.momentum_warmup + (self.momentum - self.momentum_warmup) * (event_index - 1) / denominator
+        if self.momentum >= self.momentum_warmup:
+            return min(self.momentum, curr_momentum)
+        else:
+            return max(self.momentum, curr_momentum)
+
+
 class EMAHandler:
     r"""Exponential moving average (EMA) handler can be used to compute a smoothed version of model.
     The EMA model is updated as follows:
 
     .. math:: \theta_{\text{EMA}, t+1} = (1 - \lambda) \cdot \theta_{\text{EMA}, t} + \lambda \cdot \theta_{t}
 
     where :math:`\theta_{\text{EMA}, t}` and :math:`\theta_{t}` are the EMA weights and online model weights at
-    :math:`t`-th iteration, respectively; :math:`\lambda` is the update momentum. The handler allows for linearly
-    warming up the momentum in the beginning when training process is not stable. Current momentum can be retrieved
+    :math:`t`-th iteration, respectively; :math:`\lambda` is the update momentum. Current momentum can be retrieved
     from ``Engine.state.ema_momentum``.
 
     Args:
           model: the online model for which an EMA model will be computed. If ``model`` is ``DataParallel`` or
               ``DistributedDataParallel``, the EMA smoothing will be applied to ``model.module`` .
           momentum: the update momentum after warmup phase, should be float in range :math:`\left(0, 1 \right)`.
-          momentum_warmup: the initial update momentum during warmup phase, the value should be smaller than
-              ``momentum``. Momentum will linearly increase from this value to ``momentum`` in ``warmup_iters``
-              iterations. If ``None``, no warmup will be performed.
-          warmup_iters: iterations of warmup. If ``None``, no warmup will be performed.
+          momentum_warmup: the initial update momentum during warmup phase.
+          warmup_iters: iterations of warmup.
 
     Attributes:
           ema_model: the exponential moving averaged model.
           model: the online model that is tracked by EMAHandler. It is ``model.module`` if ``model`` in
               the initialization method is an instance of ``DistributedDataParallel``.
-          momentum: the update momentum after warmup phase.
-          momentum_warmup: the initial update momentum.
-          warmup_iters: number of warmup iterations.
+          momentum: the update momentum.
 
     Note:
           The EMA model is already in ``eval`` mode. If model in the arguments is an ``nn.Module`` or
@@ -56,8 +69,7 @@ class EMAHandler:
               device = torch.device("cuda:0")
               model = nn.Linear(2, 1).to(device)
               # update the ema every 5 iterations
-              ema_handler = EMAHandler(
-                  model, momentum=0.0002, momentum_warmup=0.0001, warmup_iters=10000)
+              ema_handler = EMAHandler(model, momentum=0.0002)
               # get the ema model, which is an instance of nn.Module
               ema_model = ema_handler.ema_model
               trainer = Engine(train_step_fn)
@@ -89,6 +101,19 @@ def run_validation(engine):
 
               trainer.run(...)
 
+          The following example shows how to perform warm-up to the EMA momentum:
+
+          .. code-block:: python
+
+              device = torch.device("cuda:0")
+              model = nn.Linear(2, 1).to(device)
+              # linearly change the EMA momentum from 0.2 to 0.002 in the first 100 iterations,
+              # then keep a constant EMA momentum of 0.002 afterwards
+              ema_handler = EMAHandler(model, momentum=0.002, momentum_warmup=0.2, warmup_iters=100)
+              engine = Engine(step_fn)
+              ema_handler.attach(engine, name="ema_momentum")
+              engine.run(...)
+
           The following example shows how to attach two handlers to the same trainer:
 
           .. code-block:: python
@@ -125,25 +150,19 @@ def __init__(
         momentum_warmup: Optional[float] = None,
         warmup_iters: Optional[int] = None,
     ) -> None:
-        if momentum_warmup is not None and not 0 < momentum_warmup < 1:
-            raise ValueError(f"Invalid momentum_warmup: {momentum_warmup}")
         if not 0 < momentum < 1:
             raise ValueError(f"Invalid momentum: {momentum}")
-        if momentum_warmup is not None and not momentum_warmup <= momentum:
-            raise ValueError(
-                f"momentum_warmup should be less than or equal to momentum, but got "
-                f"momentum_warmup: {momentum_warmup} and momentum: {momentum}"
-            )
-        if warmup_iters is not None and not (isinstance(warmup_iters, int) and warmup_iters > 0):
-            raise ValueError(f"Invalid warmup_iters: {warmup_iters}")
+        self.momentum = momentum
+        self._momentum_lambda_obj: Optional[EMAWarmUp] = None
+        if momentum_warmup is not None and warmup_iters is not None:
+            self.momentum_scheduler: Optional[BaseParamScheduler] = None
+            self._momentum_lambda_obj = EMAWarmUp(momentum_warmup, warmup_iters, momentum)
+
         if not isinstance(model, nn.Module):
             raise ValueError(
                 f"model should be an instance of nn.Module or its subclasses, but got"
                 f"model: {model.__class__.__name__}"
             )
-        self.momentum_warmup = momentum_warmup
-        self.momentum = momentum
-        self.warmup_iters = warmup_iters
 
         if isinstance(model, nn.parallel.DistributedDataParallel):
             model = model.module
@@ -154,22 +173,6 @@ def __init__(
             param.detach_()
         self.ema_model.eval()
 
-    def _get_momentum(self, curr_iter: int) -> float:
-        """Get current momentum, `curr_iter` should be 1-based. When `curr_iter = 1`, `momentum =
-        self.momentum_warmup`; when `curr_iter >= self.warmup_iters`, `momentum = self.momentum`"""
-
-        # TODO: use ignite's parameter scheduling, see also GitHub issue #2090
-        if curr_iter < 1:
-            raise ValueError(f"curr_iter should be at least 1, but got {curr_iter}.")
-
-        # no warmup
-        if self.momentum_warmup is None or self.warmup_iters is None:
-            return self.momentum
-
-        denominator = max(1, self.warmup_iters - 1)
-        momentum = self.momentum_warmup + (self.momentum - self.momentum_warmup) * (curr_iter - 1) / denominator
-        return min(self.momentum, momentum)
-
     def _update_ema_model(self, engine: Engine, name: str) -> None:
         """Update weights of ema model"""
         momentum = getattr(engine.state, name)
@@ -179,36 +182,47 @@ def _update_ema_model(self, engine: Engine, name: str) -> None:
         for ema_b, model_b in zip(self.ema_model.buffers(), self.model.buffers()):
             ema_b.data = model_b.data
 
-    def _update_ema_momentum(self, engine: Engine, name: str) -> None:
-        """Update momentum in engine.state"""
-        curr_iter = engine.state.iteration
-        momentum = self._get_momentum(curr_iter)
-        setattr(engine.state, name, momentum)
-
     def attach(
         self,
         engine: Engine,
         name: str = "ema_momentum",
+        warn_if_exists: bool = True,
         event: Union[str, Events, CallableEventWithFilter, EventsList] = Events.ITERATION_COMPLETED,
     ) -> None:
         """Attach the handler to engine. After the handler is attached, the ``Engine.state`` will add an new attribute
-        with name ``name``. Then, current momentum can be retrieved by from ``Engine.state`` when the engine runs.
+        with name ``name`` if the attribute does not exist. Then, the current momentum can be retrieved from
+        ``Engine.state`` when the engine runs.
+
+
+        Note:
+            There are two cases where a momentum with name ``name`` already exists: 1. the engine has loaded its
+            state dict after resuming. In this case, there is no need to initialize the momentum again, and users
+            can set ``warn_if_exists`` to False to suppress the warning message; 2. another handler has created
+            a state attribute with the same name. In this case, users should choose another name for the ema momentum.
+
 
         Args:
             engine: trainer to which the handler will be attached.
             name: attribute name for retrieving EMA momentum from ``Engine.state``. It should be a unique name since a
                 trainer can have multiple EMA handlers.
+            warn_if_exists: if True, a warning will be thrown if the momentum with name ``name`` already exists.
             event: event when the EMA momentum and EMA model are updated.
 
         """
         if hasattr(engine.state, name):
-            raise ValueError(
-                f"Attribute: '{name}' is already in Engine.state. Thus it might be "
-                f"overridden by other EMA handlers. Please select another name."
-            )
-
-        setattr(engine.state, name, 0.0)
-
-        # first update momentum, then update ema model
-        engine.add_event_handler(event, self._update_ema_momentum, name)
+            if warn_if_exists:
+                warnings.warn(
+                    f"Attribute '{name}' already exists in Engine.state. It might because 1. the engine has loaded its "
+                    f"state dict or 2. {name} is already created by other handlers. Turn off this warning by setting"
+                    f"warn_if_exists to False.",
+                    category=UserWarning,
+                )
+        else:
+            setattr(engine.state, name, self.momentum)
+
+        if self._momentum_lambda_obj is not None:
+            self.momentum_scheduler = LambdaStateScheduler(self._momentum_lambda_obj, param_name="ema_momentum")
+
+            # first update the momentum and then update the EMA model
+            self.momentum_scheduler.attach(engine, event)
         engine.add_event_handler(event, self._update_ema_model, name)
diff --git a/ignite/handlers/state_param_scheduler.py b/ignite/handlers/state_param_scheduler.py
@@ -4,7 +4,7 @@
 from typing import Any, List, Sequence, Tuple, Union
 
 from ignite.engine import CallableEventWithFilter, Engine, Events, EventsList
-from ignite.handlers import BaseParamScheduler
+from ignite.handlers.param_scheduler import BaseParamScheduler
 
 
 class StateParamScheduler(BaseParamScheduler):
diff --git a/tests/ignite/handlers/test_ema_handler.py b/tests/ignite/handlers/test_ema_handler.py
@@ -47,18 +47,51 @@ def test_ema_invalid_momentum(get_dummy_model, momentum):
         EMAHandler(get_dummy_model(), momentum=momentum)
 
 
-@pytest.mark.parametrize("momentum_warmup", [-1, 2])
-def test_ema_invalid_momentum_warmup(get_dummy_model, momentum_warmup):
-    with pytest.raises(ValueError, match="Invalid momentum_warmup"):
-        EMAHandler(get_dummy_model, momentum_warmup=momentum_warmup)
-
+def test_has_momentum_scheduler(get_dummy_model):
+    """Test the handler has attribute `momentum_scheduler` and `_momentum_lambda_obj`"""
+    momentum_warmup = 0.0
+    warmup_iters = 10
+    ema_handler = EMAHandler(get_dummy_model(), momentum_warmup=momentum_warmup, warmup_iters=warmup_iters)
+    assert hasattr(ema_handler, "momentum_scheduler")
+    assert hasattr(ema_handler, "_momentum_lambda_obj")
+
+
+def test_ema_warmup_func(get_dummy_model):
+    """Test the built-in linear warmup function for the EMA momentum"""
+    momentum = 0.5
+    momentum_warmup_1 = 0.0
+    momentum_warmup_2 = 1.0
+    warmup_iters = 5
+
+    def check_ema_momentum(engine: Engine, momentum_warmup, final_momentum, warmup_iters):
+        if engine.state.iteration == 1:
+            assert engine.state.ema_momentum == momentum_warmup
+        elif engine.state.iteration >= 1 + warmup_iters:
+            assert engine.state.ema_momentum == final_momentum
+        else:
+            min_momentum = min(momentum, momentum_warmup)
+            max_momentum = max(momentum, momentum_warmup)
+            assert min_momentum <= engine.state.ema_momentum <= max_momentum
 
-def test_ema_invalid_momentum_start_end(get_dummy_model):
-    """Test momentum_end > momentum_start"""
-    momentum = 0.001
-    momentum_warmup = 0.1
-    with pytest.raises(ValueError, match="momentum_warmup should be less than or equal to momentum"):
-        EMAHandler(get_dummy_model(), momentum_warmup=momentum_warmup, momentum=momentum)
+    # momentum_warmup < momentum
+    model_1 = get_dummy_model()
+    engine_1 = Engine(_get_dummy_step_fn(model_1))
+    ema_handler_1 = EMAHandler(model_1, momentum, momentum_warmup_1, warmup_iters)
+    ema_handler_1.attach(engine_1)
+    engine_1.add_event_handler(
+        Events.ITERATION_COMPLETED, check_ema_momentum, momentum_warmup_1, momentum, warmup_iters
+    )
+    engine_1.run(range(10))
+
+    # momentum_warmup > momentum
+    model_2 = get_dummy_model()
+    engine_2 = Engine(_get_dummy_step_fn(model_2))
+    ema_handler_2 = EMAHandler(model_2, momentum, momentum_warmup_2, warmup_iters)
+    ema_handler_2.attach(engine_2)
+    engine_2.add_event_handler(
+        Events.ITERATION_COMPLETED, check_ema_momentum, momentum_warmup_2, momentum, warmup_iters
+    )
+    engine_2.run(range(10))
 
 
 def test_ema_invalid_model():
@@ -98,54 +131,19 @@ def test_ema_load_state_dict(get_dummy_model):
     assert ema_model.weight.data.allclose(model_1.weight.data)
 
 
-def test_ema_no_warmup_momentum(get_dummy_model):
+def test_ema_get_const_momentum(get_dummy_model):
+    """Test if momentum retrieved from the engine is constant and equal to the handler's momentum"""
     model = get_dummy_model()
     step_fn = _get_dummy_step_fn(model)
     engine = Engine(step_fn)
 
     def assert_const_momentum(engine: Engine, const_momentum):
         assert engine.state.ema_momentum == const_momentum
 
-    # no momentum_warmup
-    ema_handler = EMAHandler(model, momentum=0.002, momentum_warmup=None, warmup_iters=1)
-    ema_handler.attach(engine)
-    # attach the assertion handler after ema_handler, so the momentum is first updated and then tested
-    engine.add_event_handler(Events.ITERATION_COMPLETED, assert_const_momentum, ema_handler.momentum)
-    engine.run(range(2))
-
-    # no warmup_iters
-    engine = Engine(step_fn)
-    ema_handler = EMAHandler(model, momentum=0.002, momentum_warmup=0.001, warmup_iters=None)
+    ema_handler = EMAHandler(model, momentum=0.002)
     ema_handler.attach(engine)
-    # attach the assertion handler after ema_handler, so the momentum is first updated and then tested
     engine.add_event_handler(Events.ITERATION_COMPLETED, assert_const_momentum, ema_handler.momentum)
-    engine.run(range(2))
-
-
-def test_ema_update_ema_momentum(get_dummy_model):
-    model = get_dummy_model()
-    step_fn = _get_dummy_step_fn(model)
-    engine = Engine(step_fn)
-
-    warmup_iters = 4
-    momentum_warmup = 0.1
-    momentum = 0.2
-    ema_handler = EMAHandler(model, momentum_warmup=momentum_warmup, momentum=momentum, warmup_iters=warmup_iters)
-    ema_handler.attach(engine)
-
-    # add handlers to check momentum at each iteration
-    @engine.on(Events.ITERATION_COMPLETED)
-    def assert_momentum(engine: Engine):
-        curr_iter = engine.state.iteration
-        curr_momentum = engine.state.ema_momentum
-        if curr_iter == 1:
-            assert curr_momentum == momentum_warmup
-        elif 1 < curr_iter < warmup_iters:
-            assert momentum_warmup < curr_momentum < momentum
-        else:
-            assert curr_momentum == momentum
-
-    engine.run(range(2), max_epochs=5)
+    engine.run(range(10))
 
 
 def test_ema_buffer():
@@ -180,11 +178,10 @@ def check_buffers():
 def test_ema_two_handlers(get_dummy_model):
     """Test when two EMA handlers are attached to a trainer"""
     model_1 = get_dummy_model()
-    # momentum will be constantly 0.5
-    ema_handler_1 = EMAHandler(model_1, momentum_warmup=0.5, momentum=0.5, warmup_iters=1)
+    ema_handler_1 = EMAHandler(model_1, momentum=0.5)
 
     model_2 = get_dummy_model()
-    ema_handler_2 = EMAHandler(model_2, momentum_warmup=0.5, momentum=0.5, warmup_iters=1)
+    ema_handler_2 = EMAHandler(model_2, momentum=0.5)
 
     def _step_fn(engine: Engine, batch: Any):
         model_1.weight.data.add_(1)
@@ -214,8 +211,8 @@ def _step_fn(engine: Engine, batch: Any):
 
     model_3 = get_dummy_model()
     ema_handler_3 = EMAHandler(model_3)
-    with pytest.raises(ValueError, match="Please select another name"):
-        ema_handler_3.attach(engine, "ema_momentum_2")
+    with pytest.warns(UserWarning, match="Attribute 'ema_momentum_1' already exists"):
+        ema_handler_3.attach(engine, name="ema_momentum_1")
 
 
 def _test_ema_final_weight(model, device=None, ddp=False, interval=1):
@@ -231,8 +228,7 @@ def _test_ema_final_weight(model, device=None, ddp=False, interval=1):
     step_fn = _get_dummy_step_fn(model)
     engine = Engine(step_fn)
 
-    # momentum will be constantly 0.5
-    ema_handler = EMAHandler(model, momentum_warmup=0.5, momentum=0.5, warmup_iters=1)
+    ema_handler = EMAHandler(model, momentum=0.5)
     ema_handler.attach(engine, "model", event=Events.ITERATION_COMPLETED(every=interval))
 
     # engine will run 4 iterations