@@ -260,7 +260,7 @@ def forward(self,
260260
261261###############################################################################
262262# Utilities
263- # =========
263+ # ~~~~~~~~~
264264# In this section, we include a utility to generate semi-realistic data using
265265# a Zipf distribution for sentence lengths. This is used to generate the nested
266266# query, key and value tensors. We also include a benchmark utility.
@@ -400,11 +400,13 @@ def benchmark(func, *args, **kwargs):
400400######################################################################################
401401# For reference some sample outputs on A100:
402402#
403- # padded_time=0.03454, padded_peak_memory=4.14 GB
404- # nested_time=0.00612, nested_peak_memory=0.76 GB
405- # Difference between vanilla and nested result 0.0
406- # Nested speedup: 5.65
407- # Nested peak memory reduction 3.39 GB
403+ # .. code::
404+ #
405+ #     padded_time=0.03454, padded_peak_memory=4.14 GB
406+ #     nested_time=0.00612, nested_peak_memory=0.76 GB
407+ #     Difference between vanilla and nested result 0.0
408+ #     Nested speedup: 5.65
409+ #     Nested peak memory reduction 3.39 GB
408410#
409411# We can also see the same for the backward pass.
410412
@@ -428,14 +430,16 @@ def benchmark(func, *args, **kwargs):
428430##################################################################################
429431# Sample outputs on A100:
430432#
431- # ``padded_bw_time``=2.09337, ``padded_bw_peak_mem``=5.10 GB
432- # ``nested_bw_time``=0.01452, ``nested_bw_peak_mem``=3.24 GB
433- # Nested backward speedup: 144.13
434- # Nested backward peak memory reduction 1.86 GB
435- # Difference in ``out_proj.weight.grad`` 0.000244140625
436- # Difference in ``packed_proj.weight.grad`` 0.001556396484375
437- # Difference in ``out_proj.bias.grad`` 0.0
438- # Difference in ``packed_proj.bias.grad`` 0.001953125
433+ # .. code::
434+ #
435+ #     padded_bw_time=2.09337, padded_bw_peak_mem=5.10 GB
436+ #     nested_bw_time=0.01452, nested_bw_peak_mem=3.24 GB
437+ #     Nested backward speedup: 144.13
438+ #     Nested backward peak memory reduction 1.86 GB
439+ #     Difference in out_proj.weight.grad 0.000244140625
440+ #     Difference in packed_proj.weight.grad 0.001556396484375
441+ #     Difference in out_proj.bias.grad 0.0
442+ #     Difference in packed_proj.bias.grad 0.001953125
439443#
440444
441445##################################################################################
0 commit comments