From 6ea6b5d23ce7664f11afba3c45f337ceb3b64412 Mon Sep 17 00:00:00 2001 From: kaixuanliu Date: Tue, 10 Dec 2024 02:38:05 +0800 Subject: [PATCH] ipex Page attn xpu support bug fix (#1053) * fix ipex xpu support issues Signed-off-by: Liu, Kaixuan * use `device_map` Signed-off-by: Liu, Kaixuan * small adjust Signed-off-by: Liu, Kaixuan * to compatible with openvino Signed-off-by: Liu, Kaixuan * fix format Signed-off-by: Liu, Kaixuan * refine code Signed-off-by: Liu, Kaixuan * Update tests/ipex/test_modeling.py * update code Signed-off-by: Liu, Kaixuan --------- Signed-off-by: Liu, Kaixuan Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/exporters/ipex/modeling_utils.py | 22 +++-- optimum/intel/pipelines/pipeline_base.py | 9 +- tests/ipex/test_modeling.py | 107 ++++++++++++----------- tests/ipex/test_pipelines.py | 45 ++++++---- 4 files changed, 104 insertions(+), 79 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index ccd98ce2e9..e741575edd 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -744,13 +744,13 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.module_device = next(module.parameters()).device.type - if self.module_device == "cpu": + self.module_device = next(module.parameters()).device + if self.module_device.type == "cpu": # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: self.mlp_linear_add = LinearAdd(module.down_proj) self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) - elif self.module_device == "xpu": + elif self.module_device.type == "xpu": # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: self.mlp_linear_add = XPULinearAdd(module.down_proj) @@ -777,15 +777,15 @@ def __init__(self, module, config) -> None: _setattr_from_module(self, module) self.config = config # LinearAllreduce and LinearLayer cannot use fused op LinearAdd - self.module_device = next(module.parameters()).device.type - if self.module_device == "cpu": + self.module_device = next(module.parameters()).device + if self.module_device.type == "cpu": self.linear_gelu = LinearGelu(module.dense_h_to_4h) - elif self.module_device == "xpu": + elif self.module_device.type == "xpu": self.linear_gelu = XPULinearGelu(module.dense_h_to_4h) if module.dense_4h_to_h.__class__.__name__ not in ["LinearAllreduce"]: - if self.module_device == "cpu": + if self.module_device.type == "cpu": self.linear_add_add = LinearAddAdd(module.dense_4h_to_h) - elif self.module_device == "xpu": + elif self.module_device.type == "xpu": self.linear_add_add = XPUlinearAddAdd(module.dense_4h_to_h) def forward( @@ -870,7 +870,11 @@ class _IPEXIntermediate(nn.Module): def __init__(self, module, config): super().__init__() _setattr_from_module(self, module) - self.linear_gelu = LinearGelu(module.dense) + self.module_device = next(module.parameters()).device + if self.module_device.type == "cpu": + self.linear_gelu = LinearGelu(module.dense) + elif self.module_device.type == "xpu": + self.linear_gelu = XPULinearGelu(module.dense) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.linear_gelu(hidden_states) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index d26d8c42b6..5b8531c674 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -246,6 +246,7 @@ def load_ipex_model( SUPPORTED_TASKS, hub_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[Dict[str, Any]] = None, + device_map: Optional[torch.device] = None, ): hub_kwargs = hub_kwargs or {} model_kwargs = model_kwargs or {} @@ -253,7 +254,9 @@ def load_ipex_model( if model is None: model_id = SUPPORTED_TASKS[targeted_task]["default"] - model = ipex_model_class.from_pretrained(model_id, export=True, **hub_kwargs, **model_kwargs) + model = ipex_model_class.from_pretrained( + model_id, export=True, **hub_kwargs, **model_kwargs, device_map=device_map + ) elif isinstance(model, str): model_id = model try: @@ -262,7 +265,9 @@ def load_ipex_model( except RuntimeError: logger.warning("We will use IPEXModel with export=True to export the model") export = True - model = ipex_model_class.from_pretrained(model, export=export, **hub_kwargs, **model_kwargs) + model = ipex_model_class.from_pretrained( + model, export=export, **hub_kwargs, **model_kwargs, device_map=device_map + ) elif isinstance(model, IPEXModel): model_id = getattr(model.config, "name_or_path", None) else: diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 1216d8d83d..3a6abd9c35 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -54,6 +54,7 @@ SEED = 42 torch.use_deterministic_algorithms(True) +DEVICE = "xpu:0" if IS_XPU_AVAILABLE else "cpu" class Timer(object): @@ -84,15 +85,14 @@ class IPEXModelTest(unittest.TestCase): def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id) + ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, device_map=DEVICE) if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES: self.assertTrue(ipex_model.add_patch) - device = ipex_model.device self.assertIsInstance(ipex_model.config, PretrainedConfig) - transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device) + transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id, device_map=DEVICE) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = "This is a sample input" - tokens = tokenizer(inputs, return_tensors="pt").to(device) + tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) outputs = ipex_model(**tokens) @@ -100,7 +100,7 @@ def test_compare_to_transformers(self, model_arch): # Test re-load model with tempfile.TemporaryDirectory() as tmpdirname: ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) loaded_model_outputs = loaded_model(**tokens) # Test init method init_model = self.IPEX_MODEL_CLASS(transformers_model) @@ -116,7 +116,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] - model = self.IPEX_MODEL_CLASS.from_pretrained(model_id) + model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, device_map=DEVICE) tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline(self.IPEX_MODEL_CLASS.export_feature, model=model, tokenizer=tokenizer) text = "This restaurant is awesome" @@ -151,13 +151,12 @@ class IPEXModelForQuestionAnsweringTest(unittest.TestCase): def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - ipex_model = IPEXModelForQuestionAnswering.from_pretrained(model_id) - device = ipex_model.device + ipex_model = IPEXModelForQuestionAnswering.from_pretrained(model_id, device_map=DEVICE) self.assertIsInstance(ipex_model.config, PretrainedConfig) - transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device) + transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id, device_map=DEVICE) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = "This is a sample input" - tokens = tokenizer(inputs, return_tensors="pt").to(device) + tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) outputs = ipex_model(**tokens) @@ -165,7 +164,7 @@ def test_compare_to_transformers(self, model_arch): # Test re-load model with tempfile.TemporaryDirectory() as tmpdirname: ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) loaded_model_outputs = loaded_model(**tokens) # Test init method @@ -185,7 +184,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] - model = IPEXModelForQuestionAnswering.from_pretrained(model_id) + model = IPEXModelForQuestionAnswering.from_pretrained(model_id, device_map=DEVICE) tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) question = "What's my name?" @@ -196,11 +195,15 @@ def test_pipeline(self, model_arch): self.assertIsInstance(outputs["answer"], str) def test_patched_model(self): - ipex_model = IPEXModelForQuestionAnswering.from_pretrained("Intel/tiny-random-bert_ipex_model") - transformers_model = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-bert") + ipex_model = IPEXModelForQuestionAnswering.from_pretrained( + "Intel/tiny-random-bert_ipex_model", device_map=DEVICE + ) + transformers_model = AutoModelForQuestionAnswering.from_pretrained( + "hf-internal-testing/tiny-random-bert", device_map=DEVICE + ) tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") inputs = "This is a sample input" - tokens = tokenizer(inputs, return_tensors="pt") + tokens = tokenizer(inputs, return_tensors="pt").to(DEVICE) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) outputs = ipex_model(**tokens) @@ -239,27 +242,27 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 # Test model forward do not need cache. - ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) self.assertIsInstance(ipex_model.config, PretrainedConfig) tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer( "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch in ("llama", "llama2") else None, - ) + ).to(DEVICE) inputs = ipex_model.prepare_inputs_for_generation(**tokens) outputs = ipex_model(**inputs) self.assertIsInstance(outputs.logits, torch.Tensor) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) # Test re-load model with tempfile.TemporaryDirectory() as tmpdirname: ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype, device_map=DEVICE) loaded_model_outputs = loaded_model(**inputs) # Test init method @@ -277,9 +280,8 @@ def test_pipeline(self, model_arch): dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) + model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) model.config.encoder_no_repeat_ngram_size = 0 - # model.to("cpu") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) outputs = pipe("This is a sample", max_new_tokens=10) self.assertEqual(pipe.device, model.device) @@ -294,10 +296,9 @@ def test_assisted_decoding(self, model_arch): model_id = MODEL_NAMES[model_arch] dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 tokenizer = AutoTokenizer.from_pretrained(model_id) - ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) - device = ipex_model.device - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device) - tokens = tokenizer("This is a sample input", return_tensors="pt").to(device) + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) + tokens = tokenizer("This is a sample input", return_tensors="pt").to(DEVICE) ipex_output = ipex_model.generate(**tokens, do_sample=False, max_new_tokens=4) ipex_output_assisted = ipex_model.generate( **tokens, do_sample=False, assistant_model=transformers_model, max_new_tokens=4 @@ -325,11 +326,12 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): model_id = MODEL_NAMES[model_arch] set_seed(SEED) dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - model = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=use_cache, torch_dtype=dtype) + model = IPEXModelForCausalLM.from_pretrained( + model_id, use_cache=use_cache, torch_dtype=dtype, device_map=DEVICE + ) if use_cache and model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES: self.assertTrue(model.add_patch) - device = model.device - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) self.assertEqual(model.use_cache, use_cache) tokenizer = AutoTokenizer.from_pretrained(model_id) tokenizer.pad_token = tokenizer.eos_token @@ -345,7 +347,7 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): ), ) for text in texts: - tokens = tokenizer(text, padding=True, return_tensors="pt").to(device) + tokens = tokenizer(text, padding=True, return_tensors="pt").to(DEVICE) for generation_config in generation_configs: outputs = model.generate(**tokens, generation_config=generation_config) transformers_outputs = transformers_model.generate(**tokens, generation_config=generation_config) @@ -355,17 +357,20 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): def test_compare_with_and_without_past_key_values(self): model_id = "Intel/tiny_random_llama2_ipex_model" dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True, torch_dtype=dtype) - device = model_with_pkv.device + model_with_pkv = IPEXModelForCausalLM.from_pretrained( + model_id, use_cache=True, torch_dtype=dtype, device_map=DEVICE + ) tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer("This is a sample input", return_tensors="pt").to(device) + tokens = tokenizer("This is a sample input", return_tensors="pt").to(DEVICE) # Warmup model_with_pkv.generate(**tokens) with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=dtype) + model_without_pkv = IPEXModelForCausalLM.from_pretrained( + model_id, use_cache=False, torch_dtype=dtype, device_map=DEVICE + ) # Warmup model_without_pkv.generate(**tokens) with Timer() as without_pkv_timer: @@ -380,10 +385,10 @@ def test_compare_with_and_without_past_key_values(self): def test_patched_model(self, model_arch): model_id = MODEL_NAMES[model_arch] patched_model_id = MODEL_NAMES["patched_" + model_arch] - ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) - exported_model = IPEXModelForCausalLM.from_pretrained(patched_model_id) + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, device_map=DEVICE) + exported_model = IPEXModelForCausalLM.from_pretrained(patched_model_id, device_map=DEVICE) tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer("This is a sample", return_tensors="pt") + tokens = tokenizer("This is a sample", return_tensors="pt").to(DEVICE) ipex_outputs = ipex_model.generate( **tokens, max_new_tokens=1, return_dict_in_generate=True, output_logits=True ) @@ -410,12 +415,11 @@ def _generate_random_audio_data(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] - ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id) - device = ipex_model.device + ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, device_map=DEVICE) self.assertIsInstance(ipex_model.config, PretrainedConfig) - transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device) + transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id, device_map=DEVICE) preprocessor = AutoFeatureExtractor.from_pretrained(model_id) - inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt").to(device) + inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt").to(DEVICE) with torch.no_grad(): transformers_outputs = transformers_model(**inputs) outputs = ipex_model(**inputs) @@ -423,7 +427,7 @@ def test_compare_to_transformers(self, model_arch): # Test re-load model with tempfile.TemporaryDirectory() as tmpdirname: ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) loaded_model_outputs = loaded_model(**inputs) # Test init method @@ -438,7 +442,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] - model = self.IPEX_MODEL_CLASS.from_pretrained(model_id) + model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, device_map=DEVICE) preprocessor = AutoFeatureExtractor.from_pretrained(model_id) pipe = pipeline("audio-classification", model=model, feature_extractor=preprocessor) outputs = pipe([np.random.random(16000)]) @@ -462,16 +466,15 @@ class IPEXModelForImageClassificationIntegrationTest(unittest.TestCase): def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id) + ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, device_map=DEVICE) if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES: self.assertTrue(ipex_model.add_patch) - device = ipex_model.device self.assertIsInstance(ipex_model.config, PretrainedConfig) - transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device) + transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id, device_map=DEVICE) preprocessor = AutoFeatureExtractor.from_pretrained(model_id) url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) - inputs = preprocessor(images=image, return_tensors="pt").to(device) + inputs = preprocessor(images=image, return_tensors="pt").to(DEVICE) with torch.no_grad(): transformers_outputs = transformers_model(**inputs) outputs = ipex_model(**inputs) @@ -479,7 +482,7 @@ def test_compare_to_transformers(self, model_arch): # Test re-load model with tempfile.TemporaryDirectory() as tmpdirname: ipex_model.save_pretrained(tmpdirname) - loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, device_map=DEVICE) loaded_model_outputs = loaded_model(**inputs) # Test init method @@ -495,7 +498,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] - model = self.IPEX_MODEL_CLASS.from_pretrained(model_id) + model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, device_map=DEVICE) preprocessor = AutoFeatureExtractor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) outputs = pipe("http://images.cocodataset.org/val2017/000000039769.jpg") @@ -504,8 +507,12 @@ def test_pipeline(self, model_arch): self.assertTrue(isinstance(outputs[0]["label"], str)) def test_patched_model(self): - ipex_model = IPEXModelForImageClassification.from_pretrained("Intel/tiny-random-vit_ipex_model") - transformers_model = self.IPEX_MODEL_CLASS.from_pretrained("hf-internal-testing/tiny-random-vit") + ipex_model = IPEXModelForImageClassification.from_pretrained( + "Intel/tiny-random-vit_ipex_model", device_map=DEVICE + ) + transformers_model = self.IPEX_MODEL_CLASS.from_pretrained( + "hf-internal-testing/tiny-random-vit", device_map=DEVICE + ) preprocessor = AutoFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-vit") url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 77790e19f4..d9ddaf2586 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -35,6 +35,7 @@ torch.use_deterministic_algorithms(True) +DEVICE = "xpu:0" if IS_XPU_AVAILABLE else "cpu" class PipelinesIntegrationTest(unittest.TestCase): @@ -86,8 +87,8 @@ class PipelinesIntegrationTest(unittest.TestCase): @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_token_classification_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] - transformers_generator = transformers_pipeline("token-classification", model_id) - ipex_generator = ipex_pipeline("token-classification", model_id, accelerator="ipex") + transformers_generator = transformers_pipeline("token-classification", model_id, device_map=DEVICE) + ipex_generator = ipex_pipeline("token-classification", model_id, accelerator="ipex", device_map=DEVICE) inputs = "Hello I'm Omar and I live in Zürich." with torch.inference_mode(): transformers_output = transformers_generator(inputs) @@ -101,8 +102,8 @@ def test_token_classification_pipeline_inference(self, model_arch): @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_sequence_classification_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] - transformers_generator = transformers_pipeline("text-classification", model_id) - ipex_generator = ipex_pipeline("text-classification", model_id, accelerator="ipex") + transformers_generator = transformers_pipeline("text-classification", model_id, device_map=DEVICE) + ipex_generator = ipex_pipeline("text-classification", model_id, accelerator="ipex", device_map=DEVICE) inputs = "This restaurant is awesome" with torch.inference_mode(): transformers_output = transformers_generator(inputs) @@ -116,8 +117,8 @@ def test_sequence_classification_pipeline_inference(self, model_arch): def test_fill_mask_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] inputs = "The Milky Way is a galaxy." - transformers_generator = transformers_pipeline("fill-mask", model_id) - ipex_generator = ipex_pipeline("fill-mask", model_id, accelerator="ipex") + transformers_generator = transformers_pipeline("fill-mask", model_id, device_map=DEVICE) + ipex_generator = ipex_pipeline("fill-mask", model_id, accelerator="ipex", device_map=DEVICE) mask_token = transformers_generator.tokenizer.mask_token inputs = inputs.replace("", mask_token) with torch.inference_mode(): @@ -134,8 +135,12 @@ def test_fill_mask_pipeline_inference(self, model_arch): def test_text_generation_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - transformers_generator = transformers_pipeline("text-generation", model_id, torch_dtype=dtype) - ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex", torch_dtype=dtype) + transformers_generator = transformers_pipeline( + "text-generation", model_id, torch_dtype=dtype, device_map=DEVICE + ) + ipex_generator = ipex_pipeline( + "text-generation", model_id, accelerator="ipex", torch_dtype=dtype, device_map=DEVICE + ) inputs = "Describe a real-world application of AI." with torch.inference_mode(): transformers_output = transformers_generator(inputs, do_sample=False, max_new_tokens=10) @@ -147,8 +152,8 @@ def test_text_generation_pipeline_inference(self, model_arch): @parameterized.expand(QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES) def test_question_answering_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] - transformers_generator = transformers_pipeline("question-answering", model_id) - ipex_generator = ipex_pipeline("question-answering", model_id, accelerator="ipex") + transformers_generator = transformers_pipeline("question-answering", model_id, device_map=DEVICE) + ipex_generator = ipex_pipeline("question-answering", model_id, accelerator="ipex", device_map=DEVICE) question = "How many programming languages does BLOOM support?" context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." with torch.inference_mode(): @@ -163,8 +168,8 @@ def test_question_answering_pipeline_inference(self, model_arch): @parameterized.expand(AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES) def test_audio_classification_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] - transformers_generator = transformers_pipeline("audio-classification", model_id) - ipex_generator = ipex_pipeline("audio-classification", model_id, accelerator="ipex") + transformers_generator = transformers_pipeline("audio-classification", model_id, device_map=DEVICE) + ipex_generator = ipex_pipeline("audio-classification", model_id, accelerator="ipex", device_map=DEVICE) inputs = [np.random.random(16000)] with torch.inference_mode(): transformers_output = transformers_generator(inputs) @@ -177,8 +182,8 @@ def test_audio_classification_pipeline_inference(self, model_arch): @parameterized.expand(IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES) def test_image_classification_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] - transformers_generator = transformers_pipeline("image-classification", model_id) - ipex_generator = ipex_pipeline("image-classification", model_id, accelerator="ipex") + transformers_generator = transformers_pipeline("image-classification", model_id, device_map=DEVICE) + ipex_generator = ipex_pipeline("image-classification", model_id, accelerator="ipex", device_map=DEVICE) inputs = "http://images.cocodataset.org/val2017/000000039769.jpg" with torch.inference_mode(): transformers_output = transformers_generator(inputs) @@ -193,9 +198,11 @@ def test_image_classification_pipeline_inference(self, model_arch): @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_pipeline_load_from_ipex_model(self, model_arch): model_id = MODEL_NAMES[model_arch] - model = IPEXModelForSequenceClassification.from_pretrained(model_id) + model = IPEXModelForSequenceClassification.from_pretrained(model_id, device_map=DEVICE) tokenizer = AutoTokenizer.from_pretrained(model_id) - ipex_generator = ipex_pipeline("text-classification", model, tokenizer=tokenizer, accelerator="ipex") + ipex_generator = ipex_pipeline( + "text-classification", model, tokenizer=tokenizer, accelerator="ipex", device_map=DEVICE + ) inputs = "This restaurant is awesome" with torch.inference_mode(): ipex_output = ipex_generator(inputs) @@ -205,11 +212,13 @@ def test_pipeline_load_from_ipex_model(self, model_arch): @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_pipeline_load_from_jit_model(self, model_arch): model_id = MODEL_NAMES[model_arch] - model = IPEXModelForSequenceClassification.from_pretrained(model_id) + model = IPEXModelForSequenceClassification.from_pretrained(model_id, device_map=DEVICE) save_dir = TemporaryDirectory().name model.save_pretrained(save_dir) tokenizer = AutoTokenizer.from_pretrained(model_id) - ipex_generator = ipex_pipeline("text-classification", save_dir, tokenizer=tokenizer, accelerator="ipex") + ipex_generator = ipex_pipeline( + "text-classification", save_dir, tokenizer=tokenizer, accelerator="ipex", device_map=DEVICE + ) inputs = "This restaurant is awesome" with torch.inference_mode(): ipex_output = ipex_generator(inputs)