Commit 711403c

SD3 updates, CLI arguments for multi-device
1 parent 493f260 commit 711403c

File tree

7 files changed: +226 −34 lines changed

models/turbine_models/custom_models/pipeline_base.py

Lines changed: 7 additions & 0 deletions

@@ -101,6 +101,7 @@ def __init__(
         self.output_counter = 0
         self.dest_type = dest_type
         self.dest_dtype = dest_dtype
+        self.validate = False

     def load(
         self,
@@ -252,6 +253,10 @@ def __call__(self, function_name, inputs: list):
         if not isinstance(inputs, list):
            inputs = [inputs]
         inputs = self._validate_or_convert_inputs(function_name, inputs)
+
+        if self.validate:
+            self.save_torch_inputs(inputs)
+
         if self.benchmark:
            output = self._run_and_benchmark(function_name, inputs)
         else:
@@ -261,6 +266,8 @@ def __call__(self, function_name, inputs: list):
         output = self._output_cast(output)
         return output

+    # def _run_and_validate(self, iree_fn, torch_fn, inputs: list)
+

 class Printer:
     def __init__(self, verbose, start_time, print_time):
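The commented-out `_run_and_validate` stub pairs with the new `self.validate` flag and the `save_torch_inputs` call in `__call__`. A minimal sketch of how such a method could be filled in, assuming an eager torch reference function is available and the IREE function returns a single tensor-like output (the body below is illustrative, not part of the commit):

```python
import numpy as np
import torch


def _run_and_validate(self, iree_fn, torch_fn, inputs: list, rtol=1e-2, atol=1e-2):
    # Run the compiled IREE function and the eager torch reference on the
    # same (already converted) inputs, then assert the outputs agree within
    # a loose tolerance. Assumes a single output; multi-output functions
    # would need to iterate over the result list.
    iree_output = iree_fn(*inputs)
    with torch.no_grad():
        torch_output = torch_fn(*[torch.as_tensor(np.asarray(x)) for x in inputs])
    np.testing.assert_allclose(
        np.asarray(iree_output),
        torch_output.detach().cpu().numpy(),
        rtol=rtol,
        atol=atol,
    )
    return iree_output
```

Feeding both paths the same post-`_validate_or_convert_inputs` list keeps the comparison consistent with what `__call__` actually dispatches.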
Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+from diffusers import StableDiffusion3Pipeline
+import torch
+from datetime import datetime as dt
+
+
+def run_diffusers_cpu(
+    hf_model_name,
+    prompt,
+    negative_prompt,
+    guidance_scale,
+    seed,
+    height,
+    width,
+    num_inference_steps,
+):
+    from diffusers import StableDiffusion3Pipeline
+
+    pipe = StableDiffusion3Pipeline.from_pretrained(
+        hf_model_name, torch_dtype=torch.float32
+    )
+    pipe = pipe.to("cpu")
+    generator = torch.Generator().manual_seed(int(seed))
+
+    image = pipe(
+        prompt=prompt,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        height=height,
+        width=width,
+        generator=generator,
+    ).images[0]
+    timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S")
+    image.save(f"diffusers_reference_output_{timestamp}.png")
+
+
+if __name__ == "__main__":
+    from turbine_models.custom_models.sd_inference.sd_cmd_opts import args
+
+    run_diffusers_cpu(
+        args.hf_model_name,
+        args.prompt,
+        args.negative_prompt,
+        args.guidance_scale,
+        args.seed,
+        args.height,
+        args.width,
+        args.num_inference_steps,
+    )
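This new script is a CPU-only diffusers baseline for checking turbine SD3 outputs against a known-good reference. Since it reads the shared `sd_cmd_opts` arguments, it can also be called directly from Python; a sketch with placeholder values (the model id and prompt below are not from the commit, and the new file's module path is not shown in this diff):

```python
# Hypothetical direct call into the reference entry point; adjust the import
# to wherever this new file lands in your checkout.
run_diffusers_cpu(
    hf_model_name="stabilityai/stable-diffusion-3-medium-diffusers",  # placeholder id
    prompt="a photo of a corgi wearing sunglasses",  # placeholder prompt
    negative_prompt="",
    guidance_scale=7.0,
    seed=42,
    height=512,
    width=512,
    num_inference_steps=28,
)
```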

models/turbine_models/custom_models/sd3_inference/sd3_schedulers.py

Lines changed: 5 additions & 4 deletions

@@ -83,8 +83,8 @@ def prepare_model_input(self, sample, t, timesteps):
         latent_model_input = sample
         return latent_model_input.type(self.dtype), t.type(self.dtype)

-    def step(self, noise_pred, t, sample, guidance_scale):
-        self.model._step_index = self.index_for_timestep(t)
+    def step(self, noise_pred, t, sample, guidance_scale, i):
+        self.model._step_index = i

         if self.do_classifier_free_guidance:
             noise_preds = noise_pred.chunk(2)
@@ -299,6 +299,7 @@ def export_scheduler_model(
         torch.empty(1, dtype=dtype),
         torch.empty(sample, dtype=dtype),
         torch.empty(1, dtype=dtype),
+        torch.empty([1], dtype=torch.int64),
     ]

     fxb = FxProgramsBuilder(scheduler_module)
@@ -361,8 +362,8 @@ class CompiledScheduler(CompiledModule):
     }
     model_metadata_run_step = {
         "model_name": "sd3_scheduler_FlowEulerDiscrete",
-        "input_shapes": [noise_pred_shape, (1,), sample, (1,)],
-        "input_dtypes": [np_dtype, np_dtype, np_dtype, np_dtype],
+        "input_shapes": [noise_pred_shape, (1,), sample, (1,), (1,)],
+        "input_dtypes": [np_dtype, np_dtype, np_dtype, np_dtype, "int64"],
         "output_shapes": [sample],
         "output_dtypes": [np_dtype],
     }
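Passing the loop index `i` straight into `step` removes the `index_for_timestep` search from the exported graph and matches the extra `(1,)` int64 input registered in the run_step metadata above. A sketch of the corresponding call site, assuming a conventional denoising loop over precomputed timesteps (`scheduler`, `mmdit`, `sample`, `timesteps`, and `guidance_scale` are assumed to be set up elsewhere in the pipeline):

```python
# Illustrative denoising loop against the updated step() signature.
for i, t in enumerate(timesteps):
    latent_model_input, t_cast = scheduler.prepare_model_input(sample, t, timesteps)
    noise_pred = mmdit(latent_model_input, t_cast)
    # The loop index is now passed through explicitly as the scheduler's
    # step index (exported as a (1,) int64 input) instead of being
    # recovered from the timestep value inside the scheduler.
    sample = scheduler.step(noise_pred, t, sample, guidance_scale, i)
```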

models/turbine_models/custom_models/sd3_inference/sd3_text_encoders.py

Lines changed: 1 addition & 5 deletions

@@ -54,7 +54,6 @@ class TextEncoderModule(torch.nn.Module):
     @torch.no_grad()
     def __init__(
         self,
-        batch_size=1,
     ):
         super().__init__()
         self.dtype = torch.float16
@@ -89,7 +88,6 @@ def __init__(
             load_into(f, self.t5xxl.transformer, "", "cpu", self.dtype)

         self.do_classifier_free_guidance = True
-        self.batch_size = batch_size

     def get_cond(self, tokens_l, tokens_g, tokens_t5xxl):
         l_out, l_pooled = self.clip_l.forward(tokens_l)
@@ -152,9 +150,7 @@ def export_text_encoders(
             attn_spec=attn_spec,
         )
         return vmfb_path
-    model = TextEncoderModule(
-        batch_size=batch_size,
-    )
+    model = TextEncoderModule(hf_model_name)
     mapper = {}

     assert (

models/turbine_models/custom_models/sd_inference/sd_cmd_opts.py

Lines changed: 105 additions & 17 deletions

@@ -177,10 +177,43 @@ def is_valid_file(arg):
     default="fp16",
     help="Precision of Stable Diffusion weights and graph.",
 )
+
+p.add_argument(
+    "--clip_precision",
+    type=str,
+    default=None,
+    help="Precision of CLIP weights and graph.",
+)
+p.add_argument(
+    "--unet_precision",
+    type=str,
+    default=None,
+    help="Precision of UNet weights and graph.",
+)
+p.add_argument(
+    "--mmdit_precision",
+    type=str,
+    default=None,
+    help="Precision of MMDiT weights and graph.",
+)
+p.add_argument(
+    "--vae_precision",
+    type=str,
+    default=None,
+    help="Precision of VAE weights and graph.",
+)
+
 p.add_argument(
     "--max_length", type=int, default=64, help="Sequence Length of Stable Diffusion"
 )

+p.add_argument(
+    "--decomp_attn",
+    default=False,
+    action="store_true",
+    help="Decompose attention at fx graph level",
+)
+
 p.add_argument(
     "--clip_decomp_attn",
     action="store_true",
@@ -205,12 +238,6 @@ def is_valid_file(arg):
     help="Decompose attention for unet only at fx graph level",
 )

-p.add_argument(
-    "--decomp_attn",
-    default=False,
-    action="store_true",
-    help="Decompose attention at fx graph level",
-)

 p.add_argument(
     "--use_i8_punet",
@@ -270,21 +297,81 @@ def is_valid_file(arg):
 ##############################################################################
 # IREE Compiler Options
 ##############################################################################

-p.add_argument("--device", type=str, default="cpu", help="cpu, cuda, vulkan, rocm")
-
 p.add_argument(
-    "--rt_device",
+    "--device",
     type=str,
     default="local-task",
     help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.",
 )

+p.add_argument(
+    "--clip_device",
+    type=str,
+    default=None,
+    help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.",
+)
+p.add_argument(
+    "--unet_device",
+    type=str,
+    default=None,
+    help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.",
+)
+p.add_argument(
+    "--mmdit_device",
+    type=str,
+    default=None,
+    help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.",
+)
+p.add_argument(
+    "--vae_device",
+    type=str,
+    default=None,
+    help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.",
+)
+p.add_argument(
+    "--scheduler_device",
+    type=str,
+    default=None,
+    help="local-task, local-sync, vulkan://0, rocm://0, cuda://0, etc.",
+)
+
 # TODO: Bring in detection for target triple
 p.add_argument(
     "--iree_target_triple",
     type=str,
     default="x86_64-linux-gnu",
-    help="Specify vulkan target triple or rocm/cuda target device.",
+    help="Specify vulkan target triple or rocm/cuda target chip.",
+)
+
+p.add_argument(
+    "--clip_target",
+    type=str,
+    default=None,
+    help="Specify vulkan target triple or rocm/cuda target chip.",
+)
+p.add_argument(
+    "--unet_target",
+    type=str,
+    default=None,
+    help="Specify vulkan target triple or rocm/cuda target chip.",
+)
+p.add_argument(
+    "--mmdit_target",
+    type=str,
+    default=None,
+    help="Specify vulkan target triple or rocm/cuda target chip.",
+)
+p.add_argument(
+    "--vae_target",
+    type=str,
+    default=None,
+    help="Specify vulkan target triple or rocm/cuda target chip.",
+)
+p.add_argument(
+    "--scheduler_target",
+    type=str,
+    default=None,
+    help="Specify vulkan target triple or rocm/cuda target chip.",
 )

 p.add_argument("--ireec_flags", type=str, default="", help="extra iree-compile options")
@@ -296,13 +383,6 @@ def is_valid_file(arg):
     help="extra iree-compile options for models with iree_linalg_ext.attention ops.",
 )

-p.add_argument(
-    "--attn_spec",
-    type=str,
-    default=None,
-    help="extra iree-compile options for models with iree_linalg_ext.attention ops. Set this to 'default' if you are using mfma-capable hardware with ROCM.",
-)
-
 p.add_argument(
     "--clip_flags",
     type=str,
@@ -331,4 +411,12 @@ def is_valid_file(arg):
     help="extra iree-compile options to send for compiling mmdit. Only use this for testing bleeding edge flags! Any default options should be added to sd_inference/utils.py",
 )

+p.add_argument(
+    "--attn_spec",
+    type=str,
+    default=None,
+    help="extra iree-compile options for models with iree_linalg_ext.attention ops. Set this to 'default' if you are using mfma-capable hardware with ROCM.",
+)
+
 args, unknown = p.parse_known_args()
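All of the new per-submodel options default to `None`, which implies a fallback pattern: each submodel uses its own device/target/precision when set and otherwise inherits the global `--device`, `--iree_target_triple`, or `--precision`. A minimal sketch of that resolution logic (the helper and the `sd3_pipeline.py` entry point in the comment are hypothetical; only the flag names come from this commit):

```python
# Hypothetical resolution of per-submodel overrides against global defaults.
def resolve_submodel_option(args, submodel: str, option: str, global_value):
    # e.g. resolve_submodel_option(args, "vae", "device", args.device)
    # returns args.vae_device if the user set it, else the global --device.
    override = getattr(args, f"{submodel}_{option}", None)
    return override if override is not None else global_value


# Usage sketch: run the VAE on a second GPU while everything else follows
# --device, e.g.  python sd3_pipeline.py --device=rocm://0 --vae_device=rocm://1
vae_device = resolve_submodel_option(args, "vae", "device", args.device)
mmdit_target = resolve_submodel_option(args, "mmdit", "target", args.iree_target_triple)
```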
