Support AWQ models #1049
Changes from 9 commits
@@ -247,7 +247,10 @@ def main_export(
             trust_remote_code=trust_remote_code,
         )
         quantization_config = getattr(config, "quantization_config", None)
-        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
+        supported_quant_methods = ["gptq"]
+        if is_openvino_version(">=", "2024.6.0"):
+            supported_quant_methods.append("awq")
+        do_gptq_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods
         model_type = config.model_type.replace("_", "-")
         if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
             custom_architecture = True

Review comment: why patch?
Reply: separated GPTQ-specific patching.
@@ -296,7 +299,6 @@ def main_export(
         if (
             dtype is None
             and framework == "pt"
-            and not do_gptq_patching
             and (
                 task.startswith("text-generation")
                 or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
@@ -316,7 +318,6 @@ def main_export(
             loading_kwargs["torch_dtype"] = dtype
         # Patch the modules to export of GPTQ models w/o GPU
         if do_gptq_patching:
-            torch.set_default_dtype(torch.float32)
             orig_cuda_check = torch.cuda.is_available
             torch.cuda.is_available = lambda: True
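For context, the hunk above works because the exporter temporarily patches torch.cuda.is_available so GPTQ/AWQ checkpoints can be loaded on a machine without a GPU. A minimal sketch of that patch-and-restore pattern, assuming a generic export_fn placeholder rather than the actual export call in this file:

```python
import torch


def export_with_fake_cuda(export_fn, *args, **kwargs):
    # Pretend CUDA is available so GPTQ/AWQ checkpoint loading does not bail out
    # on a CPU-only host, then restore the original check afterwards.
    orig_cuda_check = torch.cuda.is_available
    torch.cuda.is_available = lambda: True
    try:
        return export_fn(*args, **kwargs)
    finally:
        torch.cuda.is_available = orig_cuda_check
```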
@@ -62,7 +62,7 @@
 )
 from transformers.onnx.utils import get_preprocessor
 from transformers.testing_utils import slow
-from utils_tests import MODEL_NAMES, TEST_IMAGE_URL
+from utils_tests import MODEL_NAMES, TEST_IMAGE_URL, mock_torch_cuda_is_available, patch_awq_for_inference

 from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
 from optimum.intel import (
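The two helpers added to this import are defined in the shared test utilities, whose implementation is not part of this diff. As a rough sketch of what mock_torch_cuda_is_available might look like (patch_awq_for_inference is assumed to analogously swap AWQ's CUDA-only kernels for a CPU-friendly forward during inference):

```python
# Hypothetical sketch; the real helper in utils_tests may differ.
from contextlib import contextmanager
from unittest import mock


@contextmanager
def mock_torch_cuda_is_available(to_patch: bool):
    # GPTQ/AWQ checkpoints refuse to load on a CUDA-less host, so pretend a GPU
    # is present only for the architectures that actually need it.
    if to_patch:
        with mock.patch("torch.cuda.is_available", return_value=True):
            yield
    else:
        yield
```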
@@ -872,7 +872,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "gpt_neo",
         "gpt_neox",
         "llama",
-        # "llama_gptq",
         "marian",
         "minicpm",
         "mistral",
@@ -915,8 +914,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "exaone",
         "mistral-nemo",
         "minicpm3",
+        "opt_gptq",
     )

+    if is_openvino_version(">=", "2024.6.0"):
+        SUPPORTED_ARCHITECTURES += ("mixtral_awq",)
+
     GENERATION_LENGTH = 100
     REMOTE_CODE_MODELS = (
         "chatglm",
@@ -949,9 +952,6 @@ def test_compare_to_transformers(self, model_arch):
         if is_openvino_version("<", "2024.1"):
             not_stateful.extend(["llama", "gemma", "gpt_bigcode"])

-        if "gptq" in model_arch:
-            self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM")
-
         set_seed(SEED)

         model_kwargs = {}
@@ -978,20 +978,30 @@ def test_compare_to_transformers(self, model_arch):
         if is_stateful:
             self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)

+        if "awq" in model_arch or "gptq" in model_arch:
+            # infer in FP32
+            model_kwargs["torch_dtype"] = torch.float32
+
         set_seed(SEED)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+        with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch):
+            transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
         if model_arch in ["qwen", "arctic", "glm4"]:
             transformers_model.to(torch.float32)

         with torch.no_grad():
-            transformers_outputs = transformers_model(**tokens)
+            with patch_awq_for_inference("awq" in model_arch):
+                transformers_outputs = transformers_model(**tokens)

         # Compare tensor outputs
         atol = 1e-3 if model_arch == "minicpm" else 1e-4
+        # quantized models have higher tolerance
+        if "awq" in model_arch:
+            atol = 1e-2
+        elif "gptq" in model_arch:
+            atol = 0.6
         self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol))

         # Qwen tokenizer does not support padding

         if model_arch in ["qwen"]:
             return

Review thread on the relaxed tolerance:

Reviewer: These logits can't be considered "allclose" if this is the atol, in my opinion. Does generation return the same output?
Author: Possibly it is an issue with the model itself; let me check.
Author: The slow beam search test and the generated-sequence checks in this same test produce equal outputs. Logits can be in a denormalized range, and it is hard to say which threshold is meaningful when we do not know the range of values. If I apply softmax to the output, I see roughly a 1e-5 difference between the torch and OV results, and argsort of both tensors shows only small permutations beyond the first 1000 tokens of the probabilities (tokens 1027 and 1028 are swapped), so I believe it is accurate enough not to affect the generated strings.
Author: Added skipping of the logits comparison for quantized models.
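To illustrate the comparison strategy described in the thread above: rather than asserting allclose on raw, possibly denormalized logits, one can compare softmax probabilities and the ranking of the most likely tokens. A small sketch, with the function name and the k value chosen purely for illustration:

```python
import torch


def compare_quantized_logits(ov_logits: torch.Tensor, pt_logits: torch.Tensor, k: int = 10):
    # Raw logits of quantized models live in an unknown range, so an absolute
    # tolerance on them is hard to interpret; probabilities are comparable.
    ov_probs = torch.softmax(ov_logits, dim=-1)
    pt_probs = torch.softmax(pt_logits, dim=-1)
    max_prob_diff = (ov_probs - pt_probs).abs().max().item()

    # Check whether the most likely tokens are ranked the same way, which is
    # what actually determines the generated text.
    ov_top = torch.topk(ov_probs, k=k, dim=-1).indices
    pt_top = torch.topk(pt_probs, k=k, dim=-1).indices
    top_k_agreement = (ov_top == pt_top).float().mean().item()
    return max_prob_diff, top_k_agreement
```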
@@ -1025,7 +1035,12 @@ def test_compare_to_transformers(self, model_arch):
             from transformers.cache_utils import DynamicCache

             additional_inputs = {"past_key_values": DynamicCache()}
-            transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs)
+            with patch_awq_for_inference("awq" in model_arch):
+                transformers_outputs = transformers_model.generate(
+                    **tokens, generation_config=gen_config, **additional_inputs
+                )
+        print(f"ov_outputs: {ov_outputs}")
+        print(f"transformers_outputs: {transformers_outputs}")
         self.assertTrue(
             torch.allclose(ov_outputs, transformers_outputs),
             "OV output {ov_outputs}\nTransformers output {transformers_output}",
@@ -1261,8 +1276,13 @@ def test_beam_search(self, model_arch):
         ov_model_stateless = OVModelForCausalLM.from_pretrained(
             model_id, export=True, use_cache=True, stateful=False, **model_kwargs
         )
+        if "awq" in model_arch or "gptq" in model_arch:
+            # infer in FP32
+            model_kwargs["torch_dtype"] = torch.float32
+
         set_seed(SEED)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+        with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch):
+            transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

         if model_arch == "arctic":
             transformers_model.to(torch.float32)
@@ -1288,9 +1308,10 @@ def test_beam_search(self, model_arch):

         if model_arch == "gemma2":
             additional_inputs = {"past_key_values": DynamicCache()}
-        transformers_outputs = transformers_model.generate(
-            **tokens, generation_config=gen_config, **additional_inputs
-        )
+        with patch_awq_for_inference("awq" in model_arch):
+            transformers_outputs = transformers_model.generate(
+                **tokens, generation_config=gen_config, **additional_inputs
+            )
         set_seed(SEED)
         ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config)
         self.assertTrue(
Review thread on the --extra-index-url entries for auto-gptq and awq:

Reviewer: These are not valid extra index URLs for auto-gptq and awq.
Author: This is needed to prevent reinstalling torch with CUDA while installing these third-party packages; the packages themselves are installed from the regular source. The difference between --index-url and --extra-index-url is that the first replaces the package index completely, while the second is used only as an additional source when a library is available there, so the torch-dependent libraries will try to pull torch from the torch CPU URL.
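For illustration only (the exact requirement lines are not shown in this diff, and the URL here is the standard PyTorch CPU wheel index rather than necessarily the one used in the PR): an install such as `pip install autoawq --extra-index-url https://download.pytorch.org/whl/cpu` keeps autoawq coming from the regular index while letting its torch dependency resolve against the CPU-only wheels, which is the behaviour the author describes.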