Adding more INC tests (#698)
* added bert static test

* fix test for models that require position ids

* remove io_binding

* optimum refuses io binding without kv cache
IlyasMoutawwakil authored May 3, 2024
1 parent e1b6a59 commit 9bb4334
Showing 2 changed files with 28 additions and 16 deletions.
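
A note on the last two commit bullets before the diff: for text-generation models the tests below load the ONNX Runtime model with the KV cache disabled, and optimum rejects IO binding in that configuration, so both flags are switched off together. A minimal sketch of the constraint, assuming a tiny stand-in checkpoint (the model id is illustrative, not taken from this commit):

from optimum.onnxruntime import ORTModelForCausalLM

# Stand-in checkpoint; any small causal-LM behaves the same way.
model_id = "hf-internal-testing/tiny-random-gpt2"

# use_cache=False together with use_io_binding=True is the combination
# optimum refuses, hence the tests pass both flags as False.
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    export=True,          # export the PyTorch checkpoint to ONNX on the fly
    use_cache=False,      # no KV cache, matching the quantization tests
    use_io_binding=False, # required once the cache is disabled
)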
33 changes: 17 additions & 16 deletions tests/neural_compressor/test_optimization.py
@@ -70,12 +70,13 @@


 class QuantizationTest(INCTestMixin):
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
+    SUPPORTED_ARCHITECTURES_STATIC = (
         ("text-generation", "gpt_neo", 17),
+        ("text-classification", "bert", 21),
-        # ("text-generation", "bloom", 21),
+        ("text-generation", "bloom", 21),
     )
 
-    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + (
+    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_STATIC + (
         ("fill-mask", "bert", 22),
         ("token-classification", "albert", 26),
     )
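
Each (task, model_arch, expected_quantized_matmuls) tuple above becomes one generated test case. For readers unfamiliar with the pattern, a minimal illustrative sketch of how parameterized.expand unpacks such tuples (the test class below is a throwaway, not from this repository):

import unittest

from parameterized import parameterized


class DemoTest(unittest.TestCase):
    # Each tuple is unpacked into the test method's positional arguments.
    @parameterized.expand([("text-generation", "gpt_neo", 17)])
    def test_tuple_unpacking(self, task, model_arch, expected_quantized_matmuls):
        self.assertEqual(expected_quantized_matmuls, 17)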
@@ -88,12 +89,14 @@ class QuantizationTest(INCTestMixin):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC)
     def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls):
         model_name = MODEL_NAMES[model_arch]
-        quantization_config = PostTrainingQuantConfig(approach="dynamic")
         model_class = ORT_SUPPORTED_TASKS[task]["class"][0]
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        save_onnx_model = False
+
         quantized_model = None
+        save_onnx_model = False
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
+        quantization_config = PostTrainingQuantConfig(approach="dynamic")
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
                 if backend == "torch":
@@ -104,8 +107,8 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 quantizer.quantize(
                     quantization_config=quantization_config,
-                    save_directory=tmp_dir,
                     save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                 )
                 if backend == "torch":
                     quantized_model = quantizer._quantized_model
@@ -121,7 +124,7 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                 load_inc_model=True,
             )
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_STATIC)
     def test_static_quantization(self, task, model_arch, expected_quantized_matmuls):
         num_samples = 10
         model_name = MODEL_NAMES[model_arch]
@@ -130,28 +133,26 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        save_onnx_model = False
-        op_type_dict = (
-            {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
-            if save_onnx_model
-            else None
-        )
-        quantization_config = PostTrainingQuantConfig(approach="static", op_type_dict=op_type_dict)
         quantized_model = None
+        save_onnx_model = False
+        quantization_config = PostTrainingQuantConfig(approach="static")
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
                 if backend == "torch":
                     model = model_class.auto_model_class.from_pretrained(model_name)
                 else:
-                    model = model_class.from_pretrained(model_name, export=True)
+                    model = model_class.from_pretrained(model_name, export=True, **model_kwargs)
 
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
 
                 quantizer.quantize(
                     quantization_config=quantization_config,
                     calibration_dataset=calibration_dataset,
-                    save_directory=tmp_dir,
                     save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                 )
                 if backend == "torch":
                     quantized_model = quantizer._quantized_model
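
Taken together, the ORT-backend path of the static test above boils down to roughly the following. A condensed sketch, not a verbatim extract: the checkpoint id and the glue/sst2 calibration set are assumptions standing in for the test's MODEL_NAMES and _generate_dataset helpers:

import tempfile

from neural_compressor.config import PostTrainingQuantConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from optimum.intel import INCQuantizer

model_id = "hf-internal-testing/tiny-random-bert"  # stand-in checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantizer = INCQuantizer.from_pretrained(model, task="text-classification")
# Static post-training quantization needs calibration data; num_samples=10
# matches the test above.
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=lambda ex: tokenizer(ex["sentence"], padding="max_length", max_length=128),
    num_samples=10,
    dataset_split="train",
)

with tempfile.TemporaryDirectory() as tmp_dir:
    quantizer.quantize(
        quantization_config=PostTrainingQuantConfig(approach="static"),
        calibration_dataset=calibration_dataset,
        save_onnx_model=False,  # the diff keeps ONNX export of the quantized model off
        save_directory=tmp_dir,
    )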
11 changes: 11 additions & 0 deletions tests/neural_compressor/utils_tests.py
@@ -47,6 +47,7 @@
 from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME
 from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification
 from optimum.pipelines import ORT_SUPPORTED_TASKS
+from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS
 
 if is_ipex_available():
     from optimum.intel import (
@@ -135,6 +136,13 @@ def _generate_dataset(quantizer, tokenizer, num_samples=10):
         num_samples=num_samples,
         dataset_split="train",
     )
+    model_type = quantizer._original_model.config.model_type.replace("_", "-")
+    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
+        dataset = dataset.map(
+            lambda x: {
+                "position_ids": np.arange(len(x["input_ids"])),
+            }
+        )
     return dataset
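
This hunk is the fix for the position-ids bullet: architectures whose ONNX export takes a position_ids input (gpt_neo, bloom, ...) need that column in the calibration data as well. A self-contained sketch of the same map call on a toy dataset (the token values are made up):

import numpy as np
from datasets import Dataset

# Toy calibration set with only an input_ids column.
dataset = Dataset.from_dict({"input_ids": [[101, 2023, 102], [101, 2003, 1037, 102]]})

# Same transform as above: one monotonically increasing position per token.
dataset = dataset.map(lambda x: {"position_ids": np.arange(len(x["input_ids"]))})

print(dataset[0]["position_ids"])  # [0, 1, 2]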


@@ -187,6 +195,9 @@ def check_model_outputs(

         self.assertEqual(expected_quantized_matmuls, num_quantized_matmul)
         ort_model = ORT_SUPPORTED_TASKS[task]["class"][0].from_pretrained(save_directory, **model_kwargs)
+        model_type = ort_model.config.model_type.replace("_", "-")
+        if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
+            tokens["position_ids"] = torch.arange(len(tokens["input_ids"])).unsqueeze(0)
         ort_outputs = ort_model(**tokens)
         self.assertTrue("logits" in ort_outputs)
         # self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2))
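
And the inference-side counterpart in check_model_outputs: before the ONNX Runtime forward pass, the token dict gains a batched position_ids tensor. A minimal sketch with a stand-in tokenizer, building one position per token along the sequence dimension:

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # stand-in
tokens = tokenizer("This is a sample input", return_tensors="pt")

# Shape (1, seq_len): one position per token, plus a leading batch dimension.
tokens["position_ids"] = torch.arange(tokens["input_ids"].shape[1]).unsqueeze(0)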