add disable opts for awq (#12641)
cyita authored Jan 2, 2025
1 parent 6231896 commit 8e5328e
Showing 4 changed files with 17 additions and 6 deletions.
python/llm/src/ipex_llm/optimize.py (3 additions & 1 deletion)
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                          torch_dtype=torch_dtype,
                          optimize_model=optimize_llm,
                          modules_to_not_convert=modules_to_not_convert,
-                         cpu_embedding=cpu_embedding)
+                         cpu_embedding=cpu_embedding,
+                         disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                         False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
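
With this change, optimize_model pops a new disable_optimize_pre keyword (default False) and forwards it to ggml_convert_low_bit. A minimal usage sketch, assuming a standard Hugging Face model load; the model id and torch_dtype are illustrative, not part of this commit:

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

# Load the model as usual, then quantize it with ipex-llm while skipping
# the pre-conversion rewrites gated by the new flag.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto")
model = optimize_model(model, low_bit="sym_int4", disable_optimize_pre=True)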
python/llm/src/ipex_llm/transformers/convert.py (3 additions & 2 deletions)
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
-                         mixed_precision=False):
+                         mixed_precision=False,
+                         disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model
 
-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)
 
     act_order = False
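
ggml_convert_low_bit itself now accepts the flag and skips _optimize_pre when it is set. A hedged sketch of a direct call on an already-loaded model; the import paths are assumptions based on where these names appear in the diff, and most users reach this through optimize_model or from_pretrained instead:

from ipex_llm.transformers.convert import ggml_convert_low_bit
from ipex_llm.ggml.quantize import ggml_tensor_qtype

# Quantize an already-loaded model to sym_int4 but leave its module
# structure untouched (do not run the _optimize_pre rewrites).
model = ggml_convert_low_bit(model,
                             ggml_tensor_qtype["sym_int4"],
                             optimize_model=True,
                             disable_optimize_pre=True)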
python/llm/src/ipex_llm/transformers/low_bit_linear.py (8 additions & 2 deletions)
@@ -764,6 +764,7 @@ def __init__(self, input_features, output_features, bias=True,
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False
 
     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@ def forward(self, x: torch.Tensor):
             self.weight.data = self.weight.data.to(x.dtype)
 
         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@ def forward(self, x: torch.Tensor):
 
     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
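
The new disable_fp16_opt attribute defaults to False, and nothing in this diff sets it to True; the loop below is only a hypothetical way a caller might force the fallback path on a model already converted by ipex-llm:

# Hypothetical: turn off the FP16/ESIMD fast path on every converted layer
# that carries the attribute added by this commit.
for module in model.modules():
    if hasattr(module, "disable_fp16_opt"):
        module.disable_fp16_opt = True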
python/llm/src/ipex_llm/transformers/model.py (3 additions & 1 deletion)
@@ -445,6 +445,7 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
                                      torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                      imatrix_data=imatrix_data,
                                      embedding_qtype=embedding_qtype,
-                                     mixed_precision=mixed_precision)
+                                     mixed_precision=mixed_precision,
+                                     disable_optimize_pre=disable_optimize_pre)
 
         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
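
Because load_convert pops the keyword from kwargs, it can be passed straight through from_pretrained. A sketch for the AWQ case the commit title targets; the checkpoint id and trust_remote_code flag are illustrative, not prescribed by this commit:

from ipex_llm.transformers import AutoModelForCausalLM

# Load an AWQ checkpoint in 4-bit and skip the pre-conversion rewrites.
model = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-AWQ",
                                             load_in_4bit=True,
                                             disable_optimize_pre=True,
                                             trust_remote_code=True)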
