
Commit a88a404

[BUG] fixed memory leak in BaseModel by detaching some tensors (#1924)
#### Reference Issues/PRs

#1369, #1461

#### What does this implement/fix? Explain your changes.

1. Detached the tensors in the log dictionary before appending them to the `training/validation/testing_step_outputs` lists. This fixes a memory leak caused by retaining the computation graph of every batch for the entire epoch.
2. Detached the loss tensor within the `step()` method before logging it.
3. Moved prediction results to the CPU to prevent VRAM growth.

#### Did you add any tests for the change?

I ran my training code for 5 epochs using a memory profiler. Here are two comparison plots:

before

![before](https://github.com/user-attachments/assets/45a6696a-efe6-4f06-897e-d80daee79977)

after

![after](https://github.com/user-attachments/assets/c3ba5187-97ee-4b6b-b4cb-2d641b2d0d88)
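For context, the leak follows the standard PyTorch pattern: any tensor that still carries a `grad_fn` keeps the computation graph of its batch alive for as long as it is referenced, so accumulating un-detached losses over an epoch grows memory linearly with the number of batches. A minimal sketch of the pattern (illustrative code, not taken from the patch):

```python
import torch

layer = torch.nn.Linear(64, 1)
step_outputs = []

for _ in range(1000):  # one "epoch" of batches
    loss = layer(torch.randn(32, 64)).sum()
    # Leaks: each appended loss pins its batch's entire autograd graph
    # until the list is cleared at epoch end.
    step_outputs.append(loss)
    # Fix: keep only the value, drop the graph.
    # step_outputs.append(loss.detach())
```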
1 parent 81b5303

File tree

1 file changed (+8, −4 lines)


pytorch_forecasting/models/base/_base_model.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -54,8 +54,10 @@
     apply_to_list,
     concat_sequences,
     create_mask,
+    detach,
     get_embedding_size,
     groupby_apply,
+    move_to_device,
     to_list,
 )
 from pytorch_forecasting.utils._classproperty import classproperty
```
```diff
@@ -308,6 +310,8 @@ def on_predict_batch_end(
         else:
             raise ValueError(f"Unknown mode {self.mode} - see docs for valid arguments")

+        out = move_to_device(detach(out), "cpu")
+        x = move_to_device(detach(x), "cpu")
         self._output.append(out)
         out = dict(output=out)
         if self.return_x:
```
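`detach` and `move_to_device` here are utilities from `pytorch_forecasting.utils` that walk the (possibly nested) output structure. Their exact implementations are not shown in this diff; a minimal sketch of the idea, assuming outputs nest via plain dicts, lists, and tuples (`detach_nested`/`move_nested` are hypothetical names, not the library's):

```python
import torch

def detach_nested(obj):
    """Recursively detach tensors in nested dicts/lists/tuples (sketch)."""
    if isinstance(obj, torch.Tensor):
        return obj.detach()
    if isinstance(obj, dict):
        return {k: detach_nested(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(detach_nested(v) for v in obj)
    return obj

def move_nested(obj, device):
    """Recursively move tensors to a device, e.g. "cpu" (sketch)."""
    if isinstance(obj, torch.Tensor):
        return obj.to(device)
    if isinstance(obj, dict):
        return {k: move_nested(v, device) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(move_nested(v, device) for v in obj)
    return obj
```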
```diff
@@ -720,7 +724,7 @@ def training_step(self, batch, batch_idx):
         """
         x, y = batch
         log, out = self.step(x, y, batch_idx)
-        self.training_step_outputs.append(log)
+        self.training_step_outputs.append(detach(log))
         return log

     def on_train_epoch_end(self):
```
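The same append-a-detached-copy pattern repeats in `training_step`, `validation_step`, and `test_step` below: store a detached copy for end-of-epoch bookkeeping, but return the live `log` so Lightning can still backpropagate its loss. A condensed, hypothetical module showing that shape (not the library's actual code, and assuming the standalone `lightning` package):

```python
import torch
import lightning.pytorch as pl  # assumption: standalone `lightning` package

class SketchModule(pl.LightningModule):  # hypothetical, for illustration only
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 1)
        self.training_step_outputs = []

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.layer(x), y)
        log = {"loss": loss}
        # Store a detached copy: keeps the scalar, drops the graph.
        self.training_step_outputs.append({k: v.detach() for k, v in log.items()})
        return log  # the live loss is what Lightning backpropagates

    def on_train_epoch_end(self):
        losses = torch.stack([o["loss"] for o in self.training_step_outputs])
        self.training_step_outputs.clear()  # free the per-epoch buffer
        self.log("train_loss_epoch_mean", losses.mean())

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)
```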
```diff
@@ -739,7 +743,7 @@ def validation_step(self, batch, batch_idx):
         x, y = batch
         log, out = self.step(x, y, batch_idx)
         log.update(self.create_log(x, y, out, batch_idx))
-        self.validation_step_outputs.append(log)
+        self.validation_step_outputs.append(detach(log))
         return log

     def on_validation_epoch_end(self):
```
```diff
@@ -750,7 +754,7 @@ def test_step(self, batch, batch_idx):
         x, y = batch
         log, out = self.step(x, y, batch_idx)
         log.update(self.create_log(x, y, out, batch_idx))
-        self.testing_step_outputs.append(log)
+        self.testing_step_outputs.append(detach(log))
         return log

     def on_test_epoch_end(self):
```
```diff
@@ -934,7 +938,7 @@ def step(
         loss.requires_grad_(True)
         self.log(
             f"{self.current_stage}_loss",
-            loss,
+            detach(loss),
             on_step=self.training,
             on_epoch=True,
             prog_bar=True,
```
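Detaching what goes to `self.log` is essentially free: `detach()` returns a tensor sharing the same storage but stripped of autograd history, so the live `loss` remains available for backpropagation. A quick check of that property (illustrative):

```python
import torch

pred = torch.randn(8, requires_grad=True)
loss = (pred ** 2).mean()

logged = loss.detach()           # what gets logged
assert logged.grad_fn is None    # no graph attached
assert loss.grad_fn is not None  # original can still backpropagate
assert logged.item() == loss.item()
```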
