From 9bb4334a8bf99449154490201f618dd3f081a59d Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Fri, 3 May 2024 10:59:28 +0200
Subject: [PATCH] Adding more INC tests (#698)

* added bert static test

* fix test for models that require position ids

* remove io_binding

* optimum refuses io binding without kv cache
---
 tests/neural_compressor/test_optimization.py | 33 ++++++++++----------
 tests/neural_compressor/utils_tests.py       | 11 +++++++
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 5d99306df1..da42586139 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -70,12 +70,13 @@ class QuantizationTest(INCTestMixin):
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
+    SUPPORTED_ARCHITECTURES_STATIC = (
+        ("text-generation", "gpt_neo", 17),
         ("text-classification", "bert", 21),
-        # ("text-generation", "bloom", 21),
+        ("text-generation", "bloom", 21),
     )
 
-    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + (
+    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_STATIC + (
         ("fill-mask", "bert", 22),
         ("token-classification", "albert", 26),
     )
@@ -88,12 +89,14 @@ class QuantizationTest(INCTestMixin):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC)
     def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls):
         model_name = MODEL_NAMES[model_arch]
-        quantization_config = PostTrainingQuantConfig(approach="dynamic")
         model_class = ORT_SUPPORTED_TASKS[task]["class"][0]
         tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-        save_onnx_model = False
+        quantized_model = None
+        save_onnx_model = False
         model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
+        quantization_config = PostTrainingQuantConfig(approach="dynamic")
+
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
                 if backend == "torch":
@@ -104,8 +107,8 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 quantizer.quantize(
                     quantization_config=quantization_config,
-                    save_directory=tmp_dir,
                     save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                 )
                 if backend == "torch":
                     quantized_model = quantizer._quantized_model
@@ -121,7 +124,7 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                 load_inc_model=True,
             )
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_STATIC)
     def test_static_quantization(self, task, model_arch, expected_quantized_matmuls):
         num_samples = 10
         model_name = MODEL_NAMES[model_arch]
@@ -130,28 +133,26 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        save_onnx_model = False
-        op_type_dict = (
-            {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
-            if save_onnx_model
-            else None
-        )
-        quantization_config = PostTrainingQuantConfig(approach="static", op_type_dict=op_type_dict)
         quantized_model = None
+        save_onnx_model = False
+        quantization_config = PostTrainingQuantConfig(approach="static")
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
                 if backend == "torch":
                     model = model_class.auto_model_class.from_pretrained(model_name)
                 else:
-                    model = model_class.from_pretrained(model_name, export=True)
+                    model = model_class.from_pretrained(model_name, export=True, **model_kwargs)
+
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
+
                 quantizer.quantize(
                     quantization_config=quantization_config,
                     calibration_dataset=calibration_dataset,
-                    save_directory=tmp_dir,
                     save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                 )
                 if backend == "torch":
                     quantized_model = quantizer._quantized_model
diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py
index c91270355a..a6d09954f5 100644
--- a/tests/neural_compressor/utils_tests.py
+++ b/tests/neural_compressor/utils_tests.py
@@ -47,6 +47,7 @@
 from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME
 from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification
 from optimum.pipelines import ORT_SUPPORTED_TASKS
+from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS
 
 if is_ipex_available():
     from optimum.intel import (
@@ -135,6 +136,13 @@ def _generate_dataset(quantizer, tokenizer, num_samples=10):
         num_samples=num_samples,
         dataset_split="train",
     )
+    model_type = quantizer._original_model.config.model_type.replace("_", "-")
+    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
+        dataset = dataset.map(
+            lambda x: {
+                "position_ids": np.arange(len(x["input_ids"])),
+            }
+        )
     return dataset
 
 
@@ -187,6 +195,9 @@ def check_model_outputs(
         self.assertEqual(expected_quantized_matmuls, num_quantized_matmul)
 
         ort_model = ORT_SUPPORTED_TASKS[task]["class"][0].from_pretrained(save_directory, **model_kwargs)
+        model_type = ort_model.config.model_type.replace("_", "-")
+        if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
+            tokens["position_ids"] = torch.arange(len(tokens["input_ids"])).unsqueeze(0)
         ort_outputs = ort_model(**tokens)
         self.assertTrue("logits" in ort_outputs)
         # self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2))
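For reference, the snippet below is a minimal standalone sketch of how the position_ids handling added to _generate_dataset behaves; it is not part of the patch. The toy input_ids values and the two-sample dataset are hypothetical, and only the `datasets` and `numpy` packages are assumed; the model-type check against MODEL_TYPES_REQUIRING_POSITION_IDS is omitted here.

# Sketch (not from the patch): the same dataset.map transform as in
# _generate_dataset, applied to a hypothetical pre-tokenized dataset.
import numpy as np
from datasets import Dataset

# Stand-in for the tokenized calibration dataset.
dataset = Dataset.from_dict({"input_ids": [[101, 2023, 102], [101, 2003, 2307, 102]]})

# One position id per token, starting at 0, mirroring the patch's lambda.
dataset = dataset.map(lambda x: {"position_ids": np.arange(len(x["input_ids"]))})

print(dataset[0]["position_ids"])  # [0, 1, 2]
print(dataset[1]["position_ids"])  # [0, 1, 2, 3]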