From ca54dea707d46f7cb0010a0a9708fbe2d05c88a5 Mon Sep 17 00:00:00 2001
From: Casper
Date: Tue, 23 Jul 2024 16:45:51 +0200
Subject: [PATCH] Better CLI + RunPod Script (#552)

---
 examples/cli.py            | 13 ++++++++++---
 scripts/runpod_quantize.py | 37 +++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/examples/cli.py b/examples/cli.py
index 4537bf7d..a52d1193 100644
--- a/examples/cli.py
+++ b/examples/cli.py
@@ -20,8 +20,9 @@ def main():
     parser.add_argument("--no-low_cpu_mem_usage", action="store_false", dest="low_cpu_mem_usage", help="Don't use low CPU memory")
     parser.add_argument("--use_cache", action="store_true", help="Use cache")
     parser.add_argument("--no-use_cache", action="store_false", dest="use_cache", help="Don't use cache")
+    parser.add_argument("--device_map", type=str, default=None, help="Device map for loading the pretrained model")
 
-    parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=False)
+    parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=None)
 
     args = parser.parse_args()
 
@@ -34,11 +35,17 @@ def main():
 
     model_config = {
         "low_cpu_mem_usage": args.low_cpu_mem_usage,
-        "use_cache": args.use_cache
     }
 
+    if args.use_cache is not None:
+        model_config["use_cache"] = args.use_cache
+
     print(f"Loading model from: {args.hf_model_path}")
-    model = AutoAWQForCausalLM.from_pretrained(args.hf_model_path, **model_config)
+    model = AutoAWQForCausalLM.from_pretrained(
+        args.hf_model_path,
+        device_map=args.device_map,
+        **model_config
+    )
     tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path, trust_remote_code=True)
 
     print(f"Quantizing model with config: {quant_config}")
diff --git a/scripts/runpod_quantize.py b/scripts/runpod_quantize.py
index 0d447f8d..754cf71f 100644
--- a/scripts/runpod_quantize.py
+++ b/scripts/runpod_quantize.py
@@ -23,14 +23,19 @@
 gpu_id = gpu_ids["4090"]
 num_gpus = 1
 system_memory_gb = 100
-system_storage_gb = 20 # fp16 model is downloaded here
-volume_storage_gb = 20 # quantized model is saved here
+system_storage_gb = 150 # fp16 model is downloaded here
+volume_storage_gb = 50 # quantized model is saved here
 
 # Quantization Parameters
 hf_model_path = "Qwen/Qwen2-0.5B-Instruct"
 quant_name = "qwen2-0.5b-instruct-awq"
 local_save_path = f"/workspace/{quant_name}"
 hf_upload_path = f"casperhansen/{quant_name}"
+INSTALL_TRANSFORMERS_MAIN = False
+USE_HF_TRANSFER = False
+
+if USE_HF_TRANSFER:
+    env_variables["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 cli_args = dict(
     hf_model_path = hf_model_path,
@@ -45,18 +50,22 @@
 )
 cli_args = " ".join([f"--{k}" if isinstance(v, bool) else f"--{k} {v}" for k,v in cli_args.items()])
 
-docker_command = (
-    "bash -c '" +
-    "cd /workspace && " +
-    "git clone https://github.com/casper-hansen/AutoAWQ.git && " +
-    "cd AutoAWQ && " +
-    "pip install -e . && " +
-    "huggingface-cli login --token $HF_TOKEN && " +
-    f"python examples/cli.py {cli_args} && " +
-    f"huggingface-cli upload {hf_upload_path} {local_save_path} ./ && " +
-    "runpodctl stop pod $RUNPOD_POD_ID" +
-    "'"
-)
+commands = [
+    "cd /workspace",
+    "git clone https://github.com/casper-hansen/AutoAWQ.git",
+    "cd AutoAWQ",
+    "pip install -e .",
+    "pip install -U git+https://github.com/huggingface/transformers.git" if INSTALL_TRANSFORMERS_MAIN else "",
+    "pip install hf-transfer" if USE_HF_TRANSFER else "",
+    "huggingface-cli login --token $HF_TOKEN",
+    f"python examples/cli.py {cli_args}",
+    f"huggingface-cli upload {hf_upload_path} {local_save_path} ./",
+    "runpodctl stop pod $RUNPOD_POD_ID",
+]
+commands = [cmd for cmd in commands if cmd]
+commands = " && ".join(commands)
+
+docker_command = "bash -c '" + commands + "'"
 
 template = runpod.create_template(
     name=template_name,