@@ -170,7 +170,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 # ``apply_weight_only_int8_quant`` instead as drop in replacement for the two
 # above (no replacement for int4).
 #
-# The difference between the two APIs is that ``change_linear_weights`` API
+# The difference between the two APIs is that ``int8_dynamic_activation`` API
 # alters the weight tensor of the linear module so instead of doing a
 # normal linear, it does a quantized operation. This is helpful when you
 # have non-standard linear ops that do more than one thing. The ``apply``
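For readers reviewing this change, here is a minimal sketch (not part of the commit) of the tensor-subclass behavior described in the comment above. It assumes a recent torchao where ``quantize_`` and ``int8_dynamic_activation_int8_weight`` are importable from ``torchao.quantization``, and a CUDA device as in the tutorial:

import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

# A toy module standing in for the SAM encoder blocks quantized in this tutorial.
toy = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().to(torch.bfloat16)

# quantize_ rewrites the Linear's weight in place into a quantized tensor
# subclass, so the unchanged nn.Linear now runs an int8 dynamic-quant matmul.
quantize_(toy, int8_dynamic_activation_int8_weight())
print(type(toy[0].weight))  # inspect the weight after the in-place swap

Because the swap happens on the weight tensor rather than by replacing the module, it composes with non-standard linear wrappers, which is the distinction the comment above draws against the module-level ``apply`` style API.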
@@ -220,7 +220,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the fused quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -251,7 +251,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.coordinate_descent_check_all_directions = True
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the final quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -280,7 +280,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
     model, image = get_sam_model(False, batchsize)
     model = model.to(torch.bfloat16)
     image = image.to(torch.bfloat16)
-    change_linear_weights_to_int8_dqtensors(model)
+    quantize_(model, int8_dynamic_activation_int8_weight())
     model_c = torch.compile(model, mode='max-autotune')
     quant_res = benchmark(model_c, image)
     print(f"bf16 compiled runtime of the quantized full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")