diff --git a/python/llm/src/ipex_llm/optimize.py b/python/llm/src/ipex_llm/optimize.py
index 2af7ccc5e4a..0bf1c41023a 100644
--- a/python/llm/src/ipex_llm/optimize.py
+++ b/python/llm/src/ipex_llm/optimize.py
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                                   torch_dtype=torch_dtype,
                                   optimize_model=optimize_llm,
                                   modules_to_not_convert=modules_to_not_convert,
-                                  cpu_embedding=cpu_embedding)
+                                  cpu_embedding=cpu_embedding,
+                                  disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                                  False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index f18eac4b08f..169b7102ff4 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
-                         mixed_precision=False):
+                         mixed_precision=False,
+                         disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model
 
-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)
 
     act_order = False
diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
index f9b0757842d..28e4c0834ae 100644
--- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py
+++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -764,6 +764,7 @@ def __init__(self, input_features, output_features, bias=True,
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False
 
     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@ def forward(self, x: torch.Tensor):
             self.weight.data = self.weight.data.to(x.dtype)
 
         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@ def forward(self, x: torch.Tensor):
 
     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py
index 323b73cbce9..7411afde974 100644
--- a/python/llm/src/ipex_llm/transformers/model.py
+++ b/python/llm/src/ipex_llm/transformers/model.py
@@ -445,6 +445,7 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
                                     torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                     imatrix_data=imatrix_data,
                                     embedding_qtype=embedding_qtype,
-                                    mixed_precision=mixed_precision)
+                                    mixed_precision=mixed_precision,
+                                    disable_optimize_pre=disable_optimize_pre)
         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
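
Usage sketch (not part of the diff above): how the new knobs could be exercised from user code. disable_optimize_pre is the kwarg added in optimize.py and model.py above and is popped from **kwargs on both entry points; the model id below is a placeholder, and the per-module toggling of disable_fp16_opt is an assumption, since this patch only initializes the attribute to False and does not add a public switch for it.

from ipex_llm.transformers import AutoModelForCausalLM

# from_pretrained() path: the kwarg is popped in load_convert() and forwarded
# to ggml_convert_low_bit(), which then skips _optimize_pre().
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",   # placeholder model id
    load_in_4bit=True,
    disable_optimize_pre=True,
)

# optimize_model() path in optimize.py: the same kwarg is popped from **kwargs
# and forwarded with the same meaning.
# from ipex_llm import optimize_model
# model = optimize_model(model, low_bit='sym_int4', disable_optimize_pre=True)

# disable_fp16_opt: this patch only adds the attribute (default False) to
# FP16Linear; flipping it per module, as below, is an assumed way to force the
# non-esimd fp16 fallback path added in forward()/use_esimd_kernel().
for module in model.modules():
    if hasattr(module, "disable_fp16_opt"):
        module.disable_fp16_opt = True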