update torch.compile tutorial

williamwen42 · williamwen42 · commit 434020fa3c69 · 2025-10-13T14:41:52.000-07:00
diff --git a/index.rst b/index.rst
@@ -536,6 +536,13 @@ Welcome to PyTorch Tutorials
    :link: intermediate/torch_compile_tutorial.html
    :tags: Model-Optimization
 
+.. customcarditem::
+   :header: torch.compile End-to-End Tutorial
+   :card_description: An example of applying torch.compile to a real model, demonstrating speedups.
+   :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: intermediate/torch_compile_full_example.html
+   :tags: Model-Optimization
+
 .. customcarditem::
    :header: Building a Convolution/Batch Norm fuser in torch.compile
    :card_description: Build a simple pattern matcher pass that fuses batch norm into convolution to improve performance during inference.
diff --git a/intermediate_source/torch_compile_full_example.py b/intermediate_source/torch_compile_full_example.py
@@ -6,27 +6,40 @@
 **Author:** William Wen
 """
 
-import warnings
-
 ######################################################################
 # ``torch.compile`` is the new way to speed up your PyTorch code!
 # ``torch.compile`` makes PyTorch code run faster by
 # JIT-compiling PyTorch code into optimized kernels,
 # while requiring minimal code changes.
 #
 # This tutorial covers an end-to-end example of training and evaluating a
-# real model with ``torch.compile``. For a gentler introduction to ``torch.compile``,
-# please check out our ```torch.compile`` tutorial <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`__.
+# real model with ``torch.compile``. For a gentle introduction to ``torch.compile``,
+# please check out `the introduction to ``torch.compile`` tutorial <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`__.
 #
 # **Required pip Dependencies**
 #
 # - ``torch >= 2.0``
 # - ``torchvision``
+#
+# .. grid:: 2
+#
+#     .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+#        :class-card: card-prerequisites
+#
+#        * How to apply ``torch.compile`` to a real model
+#        * ``torch.compile`` speedups on a real model
+#        * Why ``torch.compile``'s first few iterations are expected to be slower (compilation overhead)
+#
+#     .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+#        :class-card: card-prerequisites
+#
+#        * `Introduction to ``torch.compile`` <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`__
 
 # NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in
 # order to reproduce the speedup numbers shown below and documented elsewhere.
 
 import torch
+import warnings
 
 gpu_ok = False
 if torch.cuda.is_available():
@@ -88,7 +101,10 @@ def init_model():
 
 model = init_model()
 
-model_opt = torch.compile(model, mode="reduce-overhead")
+# Note that we generally recommend directly compiling a torch.nn.Module by calling
+# its .compile() method.
+model_opt = init_model()
+model_opt.compile(mode="reduce-overhead")
 
 inp = generate_data(16)[0]
 with torch.no_grad():
@@ -175,6 +191,9 @@ def train(mod, data):
 
 model = init_model()
 opt = torch.optim.Adam(model.parameters())
+
+# Note that because we are compiling a regular Python function, we do not
+# call any .compile() method.
 train_opt = torch.compile(train, mode="reduce-overhead")
 
 compile_times = []
@@ -202,3 +221,20 @@ def train(mod, data):
 # We remark that the speedup numbers presented in this tutorial are for
 # demonstration purposes only. Official speedup values can be seen at the
 # `TorchInductor performance dashboard <https://hud.pytorch.org/benchmark/compilers>`__.
+
+######################################################################
+# Conclusion
+# ------------
+#
+# In this tutorial, we applied ``torch.compile`` to training and inference on a real model,
+# demonstrating speedups.
+#
+# Importantly, we note that the first few iterations of a compiled model
+# are slower than eager mode due to compilation overhead, but subsequent iterations are expected to
+# run faster than eager mode.
+#
+# For a gentle introduction to ``torch.compile``, please check out `the introduction to ``torch.compile`` tutorial <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`__.
+#
+# To troubleshoot issues and to gain a deeper understanding of how to apply ``torch.compile`` to your code, check out `the ``torch.compile`` programming model <https://docs.pytorch.org/docs/main/compile/programming_model.html>`__.
+#
+# We hope that you will give ``torch.compile`` a try!
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
@@ -6,6 +6,17 @@
 **Author:** William Wen
 """
 
+# sphinx_gallery_start_ignore
+# Clear the torch logs format (TORCH_LOGS_FORMAT) and re-initialize logging so the change takes effect
+import torch
+import os
+os.environ["TORCH_LOGS_FORMAT"] = ""
+torch._logging._internal.DEFAULT_FORMATTER = (
+    torch._logging._internal._default_formatter()
+)
+torch._logging._internal._init_logs()
+# sphinx_gallery_end_ignore
+
 ######################################################################
 # ``torch.compile`` is the new way to speed up your PyTorch code!
 # ``torch.compile`` makes PyTorch code run faster by
@@ -27,6 +38,8 @@
 #
 # For an end-to-end example on a real model, check out our `end-to-end ``torch.compile`` tutorial <https://pytorch.org/tutorials/intermediate/torch_compile_full_example.html>`__.
 #
+# To troubleshoot issues and to gain a deeper understanding of how to apply ``torch.compile`` to your code, check out `the ``torch.compile`` programming model <https://docs.pytorch.org/docs/main/compile/programming_model.html>`__.
+#
 # **Contents**
 #
 # .. contents::
@@ -128,7 +141,7 @@ def forward(self, x):
 # -----------------------
 #
 # Now let's demonstrate how ``torch.compile`` speeds up a simple PyTorch example.
-# For a demonstration on a more complex model, see <TODO link>.
+# For a demonstration on a more complex model, see our `end-to-end ``torch.compile`` tutorial <https://pytorch.org/tutorials/intermediate/torch_compile_full_example.html>`__.
 
 
 def foo3(x):
@@ -161,7 +174,7 @@ def timed(fn):
 ######################################################################
 # Notice that ``torch.compile`` appears to take a lot longer to complete
 # compared to eager. This is because ``torch.compile`` takes extra time to compile
-# the model on the first execution.
+# the model on the first few executions.
 # ``torch.compile`` re-uses compiled code whenever possible,
 # so if we run our optimized model several more times, we should
 # see a significant improvement compared to eager.
@@ -281,43 +294,6 @@ def f2(x, y):
 print("compile 2:", test_fns(f2, compile_f2, (inp1, inp2)))
 print("~" * 10)
 
-######################################################################
-# Another case that ``torch.compile`` handles well compared to
-# both TorchScript tracing and scripting is the usage of third-party library functions.
-
-import scipy
-
-
-def f3(x):
-    x = x * 2
-    x = scipy.fft.dct(x.numpy())
-    x = torch.from_numpy(x)
-    x = x * 2
-    return x
-
-
-######################################################################
-# TorchScript tracing treats results from non-PyTorch function calls
-# as constants, and so our results can be silently wrong.
-# TorchScript scripting disallows non-PyTorch function calls.
-# On the other hand, ``torch.compile`` is easily able to handle
-# the non-PyTorch function call.
-
-
-inp1 = torch.randn(5, 5)
-inp2 = torch.randn(5, 5)
-traced_f3 = torch.jit.trace(f3, (inp1,))
-print("traced 3:", test_fns(f3, traced_f3, (inp2,)))
-
-try:
-    torch.jit.script(f3)
-except:
-    tb.print_exc()
-
-compile_f3 = torch.compile(f3)
-print("compile 3:", test_fns(f3, compile_f3, (inp2,)))
-
-
 ######################################################################
 # Graph Breaks
 # ------------------------------------
@@ -418,6 +394,9 @@ def false_branch(y):
 # One important restriction is that ``torch.export`` does not support graph breaks. Please check
 # `this tutorial <https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__
 # for more details on ``torch.export``.
+#
+# Check out our `section on graph breaks in the ``torch.compile`` programming model <https://docs.pytorch.org/docs/main/compile/programming_model.graph_breaks_index.html>`__
+# for tips on how to work around graph breaks.
 
 ######################################################################
 # Troubleshooting
@@ -427,7 +406,7 @@ def false_branch(y):
 # Are you looking for tips on how to best use ``torch.compile``?
 # Or maybe you simply want to learn more about the inner workings of ``torch.compile``?
 #
-# Check out `the ``torch.compile`` troubleshooting guide <https://pytorch.org/docs/stable/torch.compiler_troubleshooting.html>`__!
+# Check out `the ``torch.compile`` programming model <https://docs.pytorch.org/docs/main/compile/programming_model.html>`__.
 
 ######################################################################
 # Conclusion
@@ -439,4 +418,6 @@ def false_branch(y):
 #
 # For an end-to-end example on a real model, check out our `end-to-end ``torch.compile`` tutorial <https://pytorch.org/tutorials/intermediate/torch_compile_full_example.html>`__.
 #
+# To troubleshoot issues and to gain a deeper understanding of how to apply ``torch.compile`` to your code, check out `the ``torch.compile`` programming model <https://docs.pytorch.org/docs/main/compile/programming_model.html>`__.
+#
 # We hope that you will give ``torch.compile`` a try!