From ca54dea707d46f7cb0010a0a9708fbe2d05c88a5 Mon Sep 17 00:00:00 2001
From: Casper
Date: Tue, 23 Jul 2024 16:45:51 +0200
Subject: [PATCH] Better CLI + RunPod Script (#552)

---
 examples/cli.py            | 13 ++++++++++---
 scripts/runpod_quantize.py | 37 +++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/examples/cli.py b/examples/cli.py
index 4537bf7d..a52d1193 100644
--- a/examples/cli.py
+++ b/examples/cli.py
@@ -20,8 +20,9 @@ def main():
     parser.add_argument("--no-low_cpu_mem_usage", action="store_false", dest="low_cpu_mem_usage", help="Don't use low CPU memory")
     parser.add_argument("--use_cache", action="store_true", help="Use cache")
     parser.add_argument("--no-use_cache", action="store_false", dest="use_cache", help="Don't use cache")
+    parser.add_argument("--device_map", type=str, default=None, help="Device map for loading the pretrained model")
 
-    parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=False)
+    parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=None)
 
     args = parser.parse_args()
 
@@ -34,11 +35,17 @@ def main():
 
     model_config = {
         "low_cpu_mem_usage": args.low_cpu_mem_usage,
-        "use_cache": args.use_cache
     }
 
+    if args.use_cache is not None:
+        model_config["use_cache"] = args.use_cache
+
     print(f"Loading model from: {args.hf_model_path}")
-    model = AutoAWQForCausalLM.from_pretrained(args.hf_model_path, **model_config)
+    model = AutoAWQForCausalLM.from_pretrained(
+        args.hf_model_path,
+        device_map=args.device_map,
+        **model_config
+    )
     tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path, trust_remote_code=True)
 
     print(f"Quantizing model with config: {quant_config}")
diff --git a/scripts/runpod_quantize.py b/scripts/runpod_quantize.py
index 0d447f8d..754cf71f 100644
--- a/scripts/runpod_quantize.py
+++ b/scripts/runpod_quantize.py
@@ -23,14 +23,19 @@
 gpu_id = gpu_ids["4090"]
 num_gpus = 1
 system_memory_gb = 100
-system_storage_gb = 20 # fp16 model is downloaded here
-volume_storage_gb = 20 # quantized model is saved here
+system_storage_gb = 150 # fp16 model is downloaded here
+volume_storage_gb = 50 # quantized model is saved here
 
 # Quantization Parameters
 hf_model_path = "Qwen/Qwen2-0.5B-Instruct"
 quant_name = "qwen2-0.5b-instruct-awq"
 local_save_path = f"/workspace/{quant_name}"
 hf_upload_path = f"casperhansen/{quant_name}"
+INSTALL_TRANSFORMERS_MAIN = False
+USE_HF_TRANSFER = False
+
+if USE_HF_TRANSFER:
+    env_variables["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 cli_args = dict(
     hf_model_path = hf_model_path,
@@ -45,18 +50,22 @@
 )
 cli_args = " ".join([f"--{k}" if isinstance(v, bool) else f"--{k} {v}" for k,v in cli_args.items()])
 
-docker_command = (
-    "bash -c '" +
-    "cd /workspace && " +
-    "git clone https://github.com/casper-hansen/AutoAWQ.git && " +
-    "cd AutoAWQ && " +
-    "pip install -e . && " +
-    "huggingface-cli login --token $HF_TOKEN && " +
-    f"python examples/cli.py {cli_args} && " +
-    f"huggingface-cli upload {hf_upload_path} {local_save_path} ./ && " +
-    "runpodctl stop pod $RUNPOD_POD_ID" +
-    "'"
-)
+commands = [
+    "cd /workspace",
+    "git clone https://github.com/casper-hansen/AutoAWQ.git",
+    "cd AutoAWQ",
+    "pip install -e .",
+    "pip install -U git+https://github.com/huggingface/transformers.git" if INSTALL_TRANSFORMERS_MAIN else "",
+    "pip install hf-transfer" if USE_HF_TRANSFER else "",
+    "huggingface-cli login --token $HF_TOKEN",
+    f"python examples/cli.py {cli_args}",
+    f"huggingface-cli upload {hf_upload_path} {local_save_path} ./",
+    "runpodctl stop pod $RUNPOD_POD_ID",
+]
+commands = [cmd for cmd in commands if cmd]
+commands = " && ".join(commands)
+
+docker_command = "bash -c '" + commands + "'"
 
 template = runpod.create_template(
     name=template_name,