Commit cc7657d

Merge branch 'main' into ean-sd-fp16

2 parents 8ba19cc + 6e3adb3

7 files changed: +151 -55 lines changed

.github/workflows/test_sdxl.yml

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+name: SDXL Models Nightly
+
+on:
+  schedule:
+    - cron: '30 6 * * *'
+
+jobs:
+  test-sdxl-models:
+    strategy:
+      matrix:
+        version: [3.11]
+        os: [nodai-amdgpu-w7900-x86-64]
+
+    runs-on: ${{matrix.os}}
+    steps:
+      - name: "Setting up Python"
+        uses: actions/setup-python@75f3110429a8c05be0e1bf360334e4cced2b63fa # v2.3.3
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@v2
+        with:
+          ref: ean-sd-fp16
+
+      - name: Sync source deps
+        # build IREE from source with -DIREE_BUILD_TRACY=ON if getting tracy profile
+        run: |
+          python -m pip install --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --index-url https://download.pytorch.org/whl/cpu \
+            -r core/pytorch-cpu-requirements.txt \
+            -r core/torchvision-requirements.txt
+          pip install --upgrade -r core/requirements.txt
+          pip install -e core[testing,torch-cpu-nightly]
+          pip install --upgrade -r models/requirements.txt
+          pip install -e models
+
+      - name: Show current free memory
+        run: |
+          free -mh
+
+      - name: Run sdxl tests
+        run: |
+          pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
+          pytest models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu
+          pytest models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux
+          pytest models/turbine_models/tests/sdxl_test.py --device rocm --rt_device rocm --iree_target_triple gfx90a
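For local debugging, a minimal sketch of reproducing the CPU leg of this nightly job outside CI, assuming the repository checkout is the working directory and the dependencies from the "Sync source deps" step are already installed (the Vulkan and ROCm legs differ only in the flag values):

    # Minimal local-reproduction sketch of the CPU leg of the nightly job above.
    # Assumes the deps installed by "Sync source deps" and the repo root as cwd.
    import sys
    import pytest

    exit_code = pytest.main(
        [
            "models/turbine_models/tests/sdxl_test.py",
            "--device", "cpu",
            "--rt_device", "local-task",
            "--iree_target_triple", "x86_64-linux-gnu",
        ]
    )
    sys.exit(exit_code)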

README.md

Lines changed: 7 additions & 7 deletions

@@ -10,8 +10,8 @@ is intended to be a general purpose model compilation and execution tool.
 Turbine provides three primary tools:
 
 * *AOT Export*: For compiling one or more `nn.Module`s to compiled, deployment
-ready artifacts. This operates via both a [simple one-shot export API](https://github.com/nod-ai/SHARK-Turbine/blob/main/python/shark_turbine/aot/exporter.py)
-for simple models and an underlying [advanced API](https://github.com/nod-ai/SHARK-Turbine/blob/main/python/shark_turbine/aot/compiled_module.py) for complicated models
+ready artifacts. This operates via both a simple one-shot export API (Already upstreamed to [torch-mlir](https://github.com/llvm/torch-mlir/blob/main/python/torch_mlir/extras/fx_importer.py))
+for simple models and an underlying [advanced API](https://github.com/nod-ai/SHARK-Turbine/blob/main/core/shark_turbine/aot/compiled_module.py) for complicated models
 and accessing the full features of the runtime.
 * *Eager Execution*: A `torch.compile` backend is provided and a Turbine Tensor/Device
 is available for more native, interactive use within a PyTorch session.
@@ -39,7 +39,7 @@ please reach out to us on the `#turbine` channel of the
 
 ```
 pip install shark-turbine
-# Or editable: pip install -e core
+# Or for editable: see instructions under developers
 ```
 
 The above does install some unecessary cuda/cudnn packages for cpu use. To avoid this you
@@ -62,11 +62,11 @@ compiler, these should be compilable via IREE with `--iree-input-type=torch` for
 end to end execution. Dynamic shape support in torch-mlir is a work in progress,
 and not everything works at head with release binaries at present.
 
-* [AOT MLP With Static Shapes](https://github.com/nod-ai/SHARK-Turbine/blob/main/examples/aot_mlp/mlp_export_simple.py)
-* [AOT MLP with a dynamic batch size](https://github.com/nod-ai/SHARK-Turbine/blob/main/examples/aot_mlp/mlp_export_dynamic.py)
-* [AOT llama2](https://github.com/nod-ai/SHARK-Turbine/blob/main/examples/llama2_inference/llama2.ipynb):
+* [AOT MLP With Static Shapes](https://github.com/nod-ai/SHARK-Turbine/blob/main/core/examples/aot_mlp/mlp_export_simple.py)
+* [AOT MLP with a dynamic batch size](https://github.com/nod-ai/SHARK-Turbine/blob/main/core/examples/aot_mlp/mlp_export_dynamic.py)
+* [AOT llama2](https://github.com/nod-ai/SHARK-Turbine/blob/main/core/examples/llama2_inference/llama2.ipynb):
 Dynamic sequence length custom compiled module with state management internal to the model.
-* [Eager MNIST with `torch.compile`](https://github.com/nod-ai/SHARK-Turbine/blob/main/examples/eager_mlp/mlp_eager_simple.py)
+* [Eager MNIST with `torch.compile`](https://github.com/nod-ai/SHARK-Turbine/blob/main/core/examples/eager_mlp/mlp_eager_simple.py)
 
 ## Developers

core/iree-requirements.txt

Lines changed: 2 additions & 2 deletions

@@ -1,2 +1,2 @@
-iree-compiler>=20240306.822
-iree-runtime>=20240306.822
+iree-compiler==20240311.828
+iree-runtime==20240311.828

models/turbine_models/custom_models/llm_runner.py

Lines changed: 14 additions & 20 deletions

@@ -168,12 +168,14 @@ def run_llm(
     streaming_llm=False,
     chat_mode=False,
     chat_sys_prompt=DEFAULT_CHAT_SYS_PROMPT,
+    tokenizer=None,
 ):
-    tokenizer = AutoTokenizer.from_pretrained(
-        hf_model_name,
-        use_fast=False,
-        token=hf_auth_token,
-    )
+    if tokenizer == None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            hf_model_name,
+            use_fast=False,
+            token=hf_auth_token,
+        )
     llm = SharkLLM(
         device=device,
         vmfb_path=vmfb_path,
@@ -204,43 +206,35 @@ def run_torch_llm(
     prompt,
     streaming_llm=False,
     chat_sys_prompt=DEFAULT_CHAT_SYS_PROMPT,
+    model=None,
+    tokenizer=None,
 ):
-    from turbine_models.model_builder import HFTransformerBuilder
-    from transformers import AutoModelForCausalLM
-
-    model_builder = HFTransformerBuilder(
-        example_input=None,
-        hf_id=hf_model_name,
-        auto_model=AutoModelForCausalLM,
-        hf_auth_token=hf_auth_token,
-        auto_tokenizer=AutoTokenizer,
-    )
     if streaming_llm is True:
-        enable_llama_pos_shift_attention(model_builder.model)
+        enable_llama_pos_shift_attention(model)
 
     def get_token_from_logits(logits):
         return torch.argmax(logits[:, -1, :], dim=1)
 
     prompt = append_user_prompt(chat_sys_prompt, prompt)
-    initial_input = model_builder.tokenizer(prompt, return_tensors="pt")
+    initial_input = tokenizer(prompt, return_tensors="pt")
     example_input_id = initial_input.input_ids
 
-    model_results = model_builder.model.forward(example_input_id)
+    model_results = model.forward(example_input_id)
     model_token = get_token_from_logits(model_results.logits)
 
     pkv = model_results.past_key_values
 
     torch_results = []
    torch_results.append(int(model_token))
    while model_token != 2:
-        model_results = model_builder.model.forward(
+        model_results = model.forward(
            torch.unsqueeze(model_token, 0), past_key_values=pkv
        )
        model_token = get_token_from_logits(model_results.logits)
        pkv = model_results.past_key_values
        torch_results.append(int(model_token[0]))
 
-    return model_builder.tokenizer.decode(torch_results)
+    return tokenizer.decode(torch_results)
 
 
 if __name__ == "__main__":
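The net effect is that both entry points accept pre-built Hugging Face objects instead of reconstructing them on every call. A hedged caller sketch; the import path and prompt text are illustrative, and only the run_torch_llm keywords visible in this commit's test changes are relied on:

    # Sketch: build the tokenizer/model once and inject them, so run_torch_llm no
    # longer constructs an HFTransformerBuilder (or re-downloads weights) per call.
    # Import path and prompt text are assumptions for illustration.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from turbine_models.custom_models import llm_runner

    hf_model_name = "Trelis/Llama-2-7b-chat-hf-function-calling-v2"
    tokenizer = AutoTokenizer.from_pretrained(hf_model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(hf_model_name, torch_dtype=torch.float)

    torch_str = llm_runner.run_torch_llm(
        hf_model_name,
        None,                # hf_auth_token
        "hi what are you?",  # prompt
        model=model,
        tokenizer=tokenizer,
    )
    print(torch_str)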

models/turbine_models/custom_models/stateless_llama.py

Lines changed: 16 additions & 12 deletions

@@ -121,18 +121,21 @@ def export_transformer_model(
     streaming_llm=False,
     vmfb_path=None,
     upload_ir=False,
+    mod=None,
+    tokenizer=None,
 ):
-    tokenizer = AutoTokenizer.from_pretrained(
-        hf_model_name,
-        use_fast=False,
-        token=hf_auth_token,
-    )
-
-    mod = AutoModelForCausalLM.from_pretrained(
-        hf_model_name,
-        torch_dtype=torch.float,
-        token=hf_auth_token,
-    )
+    if tokenizer == None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            hf_model_name,
+            use_fast=False,
+            token=hf_auth_token,
+        )
+    if mod == None:
+        mod = AutoModelForCausalLM.from_pretrained(
+            hf_model_name,
+            torch_dtype=torch.float,
+            token=hf_auth_token,
+        )
     schema_json = generate_schema(mod.config.num_hidden_layers)
     state_schema = pytree.treespec_loads(schema_json)
     if streaming_llm:
@@ -165,7 +168,8 @@ def export_transformer_model(
         for name in mod_params:
             mapper["params." + name] = name
         if external_weight_file:
-            safetensors.torch.save_file(mod_params, external_weight_file)
+            if os.path.exists(external_weight_file) == False:
+                safetensors.torch.save_file(mod_params, external_weight_file)
 
     elif external_weights == "gguf":
         tensor_mapper = remap_gguf.TensorNameMap(remap_gguf.MODEL_ARCH.LLAMA, HEADS)
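The second hunk makes the external weight export idempotent: the safetensors file is written only when it is not already on disk, so repeated exports in one session reuse the first file. A standalone sketch of the same guard with toy tensors; the file name and contents are placeholders, not values from this commit:

    # Standalone illustration of the save-once guard added above: skip the write
    # when the external weight file already exists. File name and tensors are
    # placeholders for illustration only.
    import os

    import safetensors.torch
    import torch

    external_weight_file = "example_params.safetensors"
    mod_params = {"layer.weight": torch.zeros(2, 2)}

    if not os.path.exists(external_weight_file):
        safetensors.torch.save_file(mod_params, external_weight_file)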

models/turbine_models/model_builder.py

Lines changed: 2 additions & 1 deletion

@@ -30,6 +30,7 @@ def __init__(
         model=None,
         model_type: str = None,
         compile_to_vmfb: bool = None,
+        tokenizer=None,
     ) -> None:
         self.example_input = example_input
         self.hf_id = hf_id
@@ -38,7 +39,7 @@ def __init__(
         self.auto_config = auto_config
         self.hf_auth_token = hf_auth_token
         self.model = model
-        self.tokenizer = None
+        self.tokenizer = tokenizer
         self.upload_ir = upload_ir
         self.model_type = model_type
         self.compile_to_vmfb = compile_to_vmfb
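With the constructor storing the caller's tokenizer, an HFTransformerBuilder can now be seeded with an object that already exists. A hedged construction sketch; the keyword names are taken from this diff and from the HFTransformerBuilder call removed from llm_runner.py, and whether __init__ eagerly builds the model is not shown in this commit:

    # Sketch: seed the builder with an existing tokenizer rather than relying on
    # the old hard-coded `self.tokenizer = None`. Keywords mirror those visible in
    # this commit; any other builder behavior is assumed unchanged.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from turbine_models.model_builder import HFTransformerBuilder

    hf_id = "Trelis/Llama-2-7b-chat-hf-function-calling-v2"
    tokenizer = AutoTokenizer.from_pretrained(hf_id, use_fast=False)

    builder = HFTransformerBuilder(
        example_input=None,
        hf_id=hf_id,
        auto_model=AutoModelForCausalLM,
        hf_auth_token=None,
        tokenizer=tokenizer,  # new in this commit: stored as self.tokenizer
    )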

models/turbine_models/tests/stateless_llama_test.py

Lines changed: 60 additions & 13 deletions

@@ -9,6 +9,11 @@
 import os
 import unittest
 import difflib
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+import torch
+from accelerate import init_empty_weights
+from transformers.modeling_utils import load_sharded_checkpoint
+import tempfile
 
 os.environ["TORCH_LOGS"] = "dynamic"
 from shark_turbine.aot import *
@@ -18,18 +23,6 @@
     gen_external_params,
 )
 
-quantization = "unquantized"
-precision = "f32"
-gen_external_params(
-    hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
-    quantization=quantization,
-    hf_auth_token=None,
-    precision=precision,
-)
-DEFAULT_PROMPT = """<s>[INST] <<SYS>>
-Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> hi what are you? [/INST]
-"""
-
 
 def check_output_string(reference, output):
     # Calculate and print diff
@@ -43,7 +36,45 @@ def check_output_string(reference, output):
     assert reference == output, "".join(diff)
 
 
+quantization = "unquantized"
+precision = "f32"
+
+DEFAULT_PROMPT = """<s>[INST] <<SYS>>
+Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> hi what are you? [/INST]
+"""
+
+
 class StatelessLlamaChecks(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        gen_external_params(
+            hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            quantization=quantization,
+            hf_auth_token=None,
+            precision=precision,
+        )
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            use_fast=False,
+        )
+
+        # The model is first created on the Meta device (with empty weights) and the state dict
+        # is then loaded inside it (shard by shard in the case of a sharded checkpoint).
+        # This avoids using twice the size of model with creating whole model with random weights,
+        # then loading pretrained weights.
+        cls.mod = AutoModelForCausalLM.from_pretrained(
+            "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            torch_dtype=torch.float,
+            low_cpu_mem_usage=True,
+            device_map="auto",
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.tokenizer = None
+        cls.mod = None
+
     def test_vmfb_comparison(self):
         """
         Test that the vmfb model produces the same output as the torch model
@@ -66,6 +97,8 @@ def test_vmfb_comparison(self):
             device="llvm-cpu",
             target_triple="host",
             upload_ir=upload_ir_var == "upload",
+            mod=self.mod,
+            tokenizer=self.tokenizer,
         )
 
         torch_str_cache_path = (
@@ -77,7 +110,11 @@ def test_vmfb_comparison(self):
                 torch_str = f.read()
         else:
             torch_str = llm_runner.run_torch_llm(
-                "Trelis/Llama-2-7b-chat-hf-function-calling-v2", None, DEFAULT_PROMPT
+                "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+                None,
+                self.DEFAULT_PROMPT,
+                model=self.mod,
+                tokenizer=self.tokenizer,
             )
 
             with open(torch_str_cache_path, "w") as f:
@@ -90,6 +127,7 @@ def test_vmfb_comparison(self):
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             None,
             f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors",
+            tokenizer=self.tokenizer,
         )
         check_output_string(torch_str, turbine_str)
 
@@ -109,6 +147,8 @@ def test_streaming_vmfb_comparison(self):
             target_triple="host",
             streaming_llm=True,
             vmfb_path="streaming_llama.vmfb",
+            mod=self.mod,
+            tokenizer=self.tokenizer,
         )
 
         torch_str_cache_path = (
@@ -124,6 +164,8 @@ def test_streaming_vmfb_comparison(self):
                 None,
                 DEFAULT_PROMPT,
                 streaming_llm=True,
+                model=self.mod,
+                tokenizer=self.tokenizer,
             )
 
             with open(torch_str_cache_path, "w") as f:
@@ -137,6 +179,7 @@ def test_streaming_vmfb_comparison(self):
             None,
             f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors",
             streaming_llm=True,
+            tokenizer=self.tokenizer,
         )
         check_output_string(torch_str, turbine_str)
 
@@ -145,12 +188,16 @@ def test_rerotated_torch_comparison(self):
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             None,
             DEFAULT_PROMPT,
+            model=self.mod,
+            tokenizer=self.tokenizer,
         )
         rotated_torch_str = llm_runner.run_torch_llm(
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             None,
             DEFAULT_PROMPT,
             streaming_llm=True,
+            model=self.mod,
+            tokenizer=self.tokenizer,
        )
        check_output_string(torch_str, rotated_torch_str)
 
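The test refactor above moves the expensive model and tokenizer construction out of import time and into class-level fixtures shared by every test method. A minimal, self-contained illustration of that unittest pattern, using toy objects only (no Hugging Face download involved):

    # Toy illustration of the setUpClass/tearDownClass fixture pattern used above:
    # build expensive objects once per class, share them across tests, then release.
    import unittest


    class SharedFixtureExample(unittest.TestCase):
        @classmethod
        def setUpClass(cls):
            cls.expensive_resource = {"loaded": True}  # stand-in for model/tokenizer

        @classmethod
        def tearDownClass(cls):
            cls.expensive_resource = None

        def test_uses_shared_resource(self):
            self.assertTrue(self.expensive_resource["loaded"])


    if __name__ == "__main__":
        unittest.main()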
