Batched quantization (#516)
casper-hansen authored Jul 2, 2024
1 parent 1768bac commit c025b15
Showing 4 changed files with 321 additions and 76 deletions.
79 changes: 63 additions & 16 deletions awq/models/base.py
@@ -81,7 +81,7 @@
"phi3": "AutoModelForCausalLM",
"cohere": "AutoModelForCausalLM",
"deepseek_v2": "AutoModelForCausalLM",
"minicpm":"AutoModelForCausalLM",
"minicpm": "AutoModelForCausalLM",
}


@@ -156,6 +156,34 @@ def quantize(
"Whether to apply clipping to the model during quantization. Some models may perform better with this set to False."
),
] = True,
n_parallel_calib_samples: Annotated[
int,
Doc(
"The number of parallel samples to run through the model. "
"A high number of parallel samples can result in OOM during quantization if max_calib_samples is high enough. "
"If None, runs through all samples at the same time. "
"You can set this to a low number for more memory efficient quantization."
),
] = None,
max_calib_samples: Annotated[
int, Doc("The maximum number of samples to run through the model.")
] = 128,
max_calib_seq_len: Annotated[
int,
Doc(
"The maximum sequence length of the calibration dataset. Discard samples greater than max_calib_seq_len."
),
] = 512,
max_chunk_memory: Annotated[
int,
Doc(
"The loss computation and per-channel mean is optimized into chunked computations."
" Adjust this parameter to increase or decrease memory usage for these computations."
" Default is 1GB (1024 * 1024 * 1024)."
),
] = 1024
* 1024
* 1024,
):
"""
The main quantization function that you can use to quantize your model.
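
For context, the four new calibration controls are ordinary keyword arguments on this public quantize() call. A minimal usage sketch follows; the model path, tokenizer handling, and quant_config values are illustrative placeholders, not taken from this commit:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "path/to/fp16-model"  # placeholder
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

model.quantize(
    tokenizer,
    quant_config=quant_config,
    n_parallel_calib_samples=32,          # batch calibration samples instead of running all at once
    max_calib_samples=128,                # cap on the number of calibration samples
    max_calib_seq_len=512,                # drop calibration samples longer than this
    max_chunk_memory=1024 * 1024 * 1024,  # ~1 GB budget for the chunked loss/mean computations
)
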
@@ -194,6 +222,10 @@ def quantize(
modules_to_not_convert=self.quant_config.modules_to_not_convert,
export_compatible=export_compatible,
apply_clip=apply_clip,
n_parallel_calib_samples=n_parallel_calib_samples,
max_calib_samples=max_calib_samples,
max_calib_seq_len=max_calib_seq_len,
max_chunk_memory=max_chunk_memory,
)
self.quantizer.quantize()
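
The max_chunk_memory doc described above covers chunked loss and per-channel mean computations. The snippet below only illustrates that chunking idea under assumed tensor shapes and a float32 element size; it is not the quantizer's actual implementation:

import torch

def chunked_channel_mean(x: torch.Tensor, max_chunk_memory: int = 1024 * 1024 * 1024) -> torch.Tensor:
    # x: calibration activations flattened to [n_tokens, hidden_dim].
    x = x.abs().reshape(-1, x.shape[-1])
    n_tokens, hidden_dim = x.shape
    # Size each chunk so one float32 slice stays under the max_chunk_memory byte budget.
    chunk_tokens = max(1, max_chunk_memory // (hidden_dim * 4))
    total = torch.zeros(hidden_dim, dtype=torch.float32, device=x.device)
    for start in range(0, n_tokens, chunk_tokens):
        # Accumulate per-channel sums chunk by chunk instead of materialising one big buffer.
        total += x[start:start + chunk_tokens].to(torch.float32).sum(dim=0)
    return total / n_tokens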

@@ -312,7 +344,8 @@ def from_pretrained(
),
] = None,
download_kwargs: Annotated[
Dict, Doc("Used to configure the model download"),
Dict,
Doc("Used to configure the model download"),
] = None,
**model_init_kwargs: Annotated[
Dict,
@@ -324,9 +357,12 @@
"""A method for initialization of pretrained models, usually in FP16."""
# Get weights path and quant config
model_weights_path, config, quant_config = self._load_config(
self, model_path, "", safetensors,
self,
model_path,
"",
safetensors,
trust_remote_code=trust_remote_code,
download_kwargs=download_kwargs
download_kwargs=download_kwargs,
)

target_cls_name = TRANSFORMERS_AUTO_MAPPING_DICT[config.model_type]
@@ -409,7 +445,7 @@ def from_quantized(
),
] = "balanced",
max_memory: Annotated[
Dict[Union[int, str], Union[int, str]],
Dict[Union[int, str], Union[int, str]],
Doc(
'A dictionary mapping device identifiers to maximum memory, passed to the model loading method from transformers. For example: {0: "4GB", 1: "10GB"}'
),
@@ -419,7 +455,8 @@
Doc("The folder ot offload the model to."),
] = None,
download_kwargs: Annotated[
Dict, Doc("Used to configure the model download"),
Dict,
Doc("Used to configure the model download"),
] = None,
**config_kwargs: Annotated[
Dict,
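
The max_memory, offload_folder, and download_kwargs arguments touched in the two hunks above are all forwarded from from_quantized. A usage sketch with placeholder values (the repo path, memory limits, and revision are illustrative only):

from awq import AutoAWQForCausalLM

model = AutoAWQForCausalLM.from_quantized(
    "org/quantized-model-awq",             # placeholder repo or local path
    max_memory={0: "4GB", "cpu": "16GB"},  # per-device caps handed to the transformers loader
    offload_folder="./offload",            # spill weights that exceed max_memory to disk
    download_kwargs={"revision": "main"},  # extra arguments forwarded to snapshot_download
)
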
@@ -455,11 +492,15 @@
use_cpu_qbits = use_qbits or get_best_device() == "cpu"
if use_cpu_qbits:
if not qbits_available:
raise ImportError("Please install intel-extension-for-transformers with "
"`pip install intel-extension-for-transformers` for 'qbits' kernel!")
raise ImportError(
"Please install intel-extension-for-transformers with "
"`pip install intel-extension-for-transformers` for 'qbits' kernel!"
)

fuse_layers = False
logging.warn("Unsupport fuse_layers featrue for CPU device with QBits backend!")
logging.warn(
"Unsupport fuse_layers featrue for CPU device with QBits backend!"
)
# Prepare WQLinear layers, replace nn.Linear
self._load_quantized_modules(
self,
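
On a machine without a GPU, get_best_device() returns "cpu", so the branch above selects the QBits kernels: intel-extension-for-transformers must be installed up front and layer fusion is switched off. A small sketch of that setup (the model path is a placeholder):

# pip install intel-extension-for-transformers   # provides the 'qbits' kernel on CPU
from awq import AutoAWQForCausalLM

# fuse_layers is forced to False on this code path, so do not rely on fused modules here.
model = AutoAWQForCausalLM.from_quantized("org/quantized-model-awq", fuse_layers=False)
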
@@ -547,7 +588,9 @@ def _load_config(
elif isinstance(download_kwargs_ignore_patterns, list):
ignore_patterns.extend(download_kwargs_ignore_patterns)

model_path = snapshot_download(model_path, ignore_patterns=ignore_patterns, **download_kwargs)
model_path = snapshot_download(
model_path, ignore_patterns=ignore_patterns, **download_kwargs
)

if model_filename != "":
model_weights_path = model_path + f"/{model_filename}"
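
For reference, huggingface_hub's snapshot_download accepts ignore_patterns and the extra download_kwargs entries directly, as in this sketch (the repo id, patterns, and revision are placeholder values):

from huggingface_hub import snapshot_download

local_path = snapshot_download(
    "org/some-model",                       # placeholder repo id
    ignore_patterns=["*.msgpack", "*.h5"],  # skip weight formats that are never loaded
    revision="main",                        # example of an entry passed through download_kwargs
)
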
@@ -621,13 +664,17 @@ def _load_quantized_modules(
q_linear_module = WQLinear_GEMVFast

if use_qbits:
q_linear = q_linear_module.from_linear(module,
quant_config.w_bit,
quant_config.q_group_size,
True,
has_zero_points=quant_config.zero_point)
q_linear = q_linear_module.from_linear(
module,
quant_config.w_bit,
quant_config.q_group_size,
True,
has_zero_points=quant_config.zero_point,
)
else:
q_linear = q_linear_module.from_linear(module, quant_config.w_bit, quant_config.q_group_size, True)
q_linear = q_linear_module.from_linear(
module, quant_config.w_bit, quant_config.q_group_size, True
)
q_linear.to(next(layer.parameters()).device)
set_op_by_name(layer, name, q_linear)
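
set_op_by_name itself is not part of this diff; a hypothetical stand-in that performs the same swap of a dotted submodule path for the new quantized module could look like this (simplified, not the library's actual helper):

import torch.nn as nn

def set_op_by_name(layer: nn.Module, name: str, new_module: nn.Module) -> None:
    # Walk a dotted path such as "mlp.down_proj" or "0.self_attn.q_proj" and
    # replace the final attribute with the quantized module.
    parts = name.split(".")
    parent = layer
    for part in parts[:-1]:
        parent = parent[int(part)] if part.isdigit() else getattr(parent, part)
    setattr(parent, parts[-1], new_module)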
