@@ -170,7 +170,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 # ``apply_weight_only_int8_quant`` instead as drop in replacement for the two
 # above (no replacement for int4).
 #
-# The difference between the two APIs is that ``change_linear_weights`` API
+# The difference between the two APIs is that ``int8_dynamic_activation`` API
 # alters the weight tensor of the linear module so instead of doing a
 # normal linear, it does a quantized operation. This is helpful when you
 # have non-standard linear ops that do more than one thing. The ``apply``
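For readers reviewing this change, here is a minimal sketch (not part of the commit) of the tensor-subclass behavior described in the comment above. It assumes a recent torchao where ``quantize_`` and ``int8_dynamic_activation_int8_weight`` are importable from ``torchao.quantization``, and a CUDA device as in the tutorial:

import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

# A toy module standing in for the SAM encoder blocks quantized in this tutorial.
toy = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().to(torch.bfloat16)

# quantize_ rewrites the Linear's weight in place into a quantized tensor
# subclass, so the unchanged nn.Linear now runs an int8 dynamic-quant matmul.
quantize_(toy, int8_dynamic_activation_int8_weight())
print(type(toy[0].weight))  # inspect the weight after the in-place swap

Because the swap happens on the weight tensor rather than by replacing the module, it composes with non-standard linear wrappers, which is the distinction the comment above draws against the module-level ``apply`` style API.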
@@ -220,7 +220,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the fused quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -251,7 +251,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.coordinate_descent_check_all_directions = True
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the final quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -280,7 +280,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
     model, image = get_sam_model(False, batchsize)
     model = model.to(torch.bfloat16)
     image = image.to(torch.bfloat16)
-    change_linear_weights_to_int8_dqtensors(model)
+    quantize_(model, int8_dynamic_activation_int8_weight())
     model_c = torch.compile(model, mode='max-autotune')
     quant_res = benchmark(model_c, image)
     print(f"bf16 compiled runtime of the quantized full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")