diff --git a/awq/models/base.py b/awq/models/base.py index e5691ae0..e09336fa 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -136,6 +136,12 @@ def quantize( "This argument avoids real quantization by only applying the scales without quantizing down to FP16." ), ] = False, + apply_clip: Annotated[ + bool, + Doc( + "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False." + ), + ] = True, ): """ The main quantization function that you can use to quantize your model. @@ -173,6 +179,7 @@ def quantize( duo_scaling, modules_to_not_convert=self.quant_config.modules_to_not_convert, export_compatible=export_compatible, + apply_clip=apply_clip, ) self.quantizer.quantize() diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py index 37b1dbdc..6a4574e6 100644 --- a/awq/quantize/quantizer.py +++ b/awq/quantize/quantizer.py @@ -40,6 +40,7 @@ def __init__( duo_scaling, modules_to_not_convert=None, export_compatible=False, + apply_clip=True, ) -> None: self.awq_model = awq_model self.model = model @@ -53,6 +54,7 @@ def __init__( self.text_column = text_column self.duo_scaling = duo_scaling self.export_compatible = export_compatible + self.apply_clip = apply_clip self.modules_to_not_convert = ( modules_to_not_convert if modules_to_not_convert is not None else [] ) @@ -161,13 +163,14 @@ def quantize(self): ) # [STEP 3]: Compute and apply clipping list - clip_list = self._search_best_clip( - self.modules[i], named_linears, input_feat - ) - apply_clip(self.modules[i], clip_list) - clip_list = append_str_prefix( - clip_list, get_op_name(self.model, self.modules[i]) + "." - ) + if self.apply_clip: + clip_list = self._search_best_clip( + self.modules[i], named_linears, input_feat + ) + apply_clip(self.modules[i], clip_list) + clip_list = append_str_prefix( + clip_list, get_op_name(self.model, self.modules[i]) + "." + ) # [STEP 4]: Quantize weights if not self.export_compatible: