diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py
index 65675c111..17999ab45 100644
--- a/models/turbine_models/custom_models/sd_inference/clip.py
+++ b/models/turbine_models/custom_models/sd_inference/clip.py
@@ -56,17 +56,20 @@ def export_clip_model(
     max_alloc=None,
     upload_ir=False,
 ):
+    input_len = 77
     if "google/t5" in hf_model_name:
         from transformers import T5Tokenizer, T5Model
 
         tokenizer = T5Tokenizer.from_pretrained(hf_model_name)
         text_encoder_model = T5Model.from_pretrained(hf_model_name)
+        input_len = 512
     else:
         # TODO: Add better filtering mechanism for things that require CLIPProcessor
-        if hf_model_name == "openai/clip-vit-large-patch14":
+        if "openai" in hf_model_name:
             tokenizer = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
             hf_subfolder = ""  # CLIPProcessor does not have a subfolder
+            input_len = 10
         else:
             # Load the tokenizer and text encoder to tokenize and encode the text.
             tokenizer = CLIPTokenizer.from_pretrained(
@@ -102,8 +105,8 @@ class CompiledClip(CompiledModule):
 
             def main(
                 self,
-                inp=AbstractTensor(1, 77, dtype=torch.int64),
-                decoder_input_ids=AbstractTensor(1, 77, dtype=torch.int64),
+                inp=AbstractTensor(1, input_len, dtype=torch.int64),
+                decoder_input_ids=AbstractTensor(1, input_len, dtype=torch.int64),
             ):
                 return jittable(text_encoder_model.forward)(
                     input_ids=inp, decoder_input_ids=decoder_input_ids
@@ -122,7 +125,7 @@ class CompiledClip(CompiledModule):
             else:
                 params = export_parameters(text_encoder_model)
 
-            def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)):
+            def main(self, inp=AbstractTensor(1, input_len, dtype=torch.int64)):
                 return jittable(text_encoder_model.forward)(input_ids=inp)
 
     import_to = "INPUT" if compile_to == "linalg" else "IMPORT"
diff --git a/models/turbine_models/custom_models/sd_inference/clip_runner.py b/models/turbine_models/custom_models/sd_inference/clip_runner.py
index a44e8e250..c72b5e221 100644
--- a/models/turbine_models/custom_models/sd_inference/clip_runner.py
+++ b/models/turbine_models/custom_models/sd_inference/clip_runner.py
@@ -3,6 +3,7 @@
 from transformers import CLIPTokenizer
 from iree import runtime as ireert
 import torch
+from PIL import Image
 
 parser = argparse.ArgumentParser()
 
@@ -52,21 +53,54 @@ def run_clip(
 ):
     runner = vmfbRunner(device, vmfb_path, external_weight_path)
 
-    tokenizer = CLIPTokenizer.from_pretrained(
-        hf_model_name,
-        subfolder="tokenizer",
-        token=hf_auth_token,
-    )
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
+    if "google/t5" in hf_model_name:
+        from transformers import T5Tokenizer, T5Model
+
+        tokenizer = T5Tokenizer.from_pretrained(hf_model_name)
+        text_input = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+    # TODO: Integrate with HFTransformerBuilder
+    else:
+        if "openai" in hf_model_name:
+            from transformers import CLIPProcessor
+            import requests
+
+            url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            image = Image.open(requests.get(url, stream=True).raw)
+            tokenizer = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+            text_input = tokenizer(
+                text=prompt,
+                images=image,
+                truncation=True,
+                padding=True,
+                return_tensors="pt",
+            )
+        else:
+            hf_subfolder = "tokenizer"
+
+            tokenizer = CLIPTokenizer.from_pretrained(
+                hf_model_name,
+                subfolder=hf_subfolder,
+                token=hf_auth_token,
+            )
+
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
 
     example_input = text_input.input_ids
     inp = [ireert.asdevicearray(runner.config.device, example_input)]
+    if "google/t5" in hf_model_name:
+        inp += [ireert.asdevicearray(runner.config.device, example_input)]
     results = runner.ctx.modules.compiled_clip["main"](*inp)
     return results
 
@@ -77,13 +111,38 @@ def run_torch_clip(hf_model_name, hf_auth_token, prompt):
 
         tokenizer = T5Tokenizer.from_pretrained(hf_model_name)
         model = T5Model.from_pretrained(hf_model_name)
+        text_input = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
     # TODO: Integrate with HFTransformerBuilder
     else:
         if hf_model_name == "openai/clip-vit-large-patch14":
             from transformers import CLIPProcessor
+            import requests
+
+            url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            image = Image.open(requests.get(url, stream=True).raw)
 
             tokenizer = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
             hf_subfolder = ""  # CLIPProcessor does not have a subfolder
+            from transformers import CLIPTextModel
+
+            model = CLIPTextModel.from_pretrained(
+                hf_model_name,
+                subfolder=hf_subfolder,
+                token=hf_auth_token,
+            )
+            text_input = tokenizer(
+                text=prompt,
+                images=image,
+                truncation=True,
+                padding=True,
+                return_tensors="pt",
+            )
         else:
             hf_subfolder = "text_encoder"
 
@@ -93,20 +152,20 @@ def run_torch_clip(hf_model_name, hf_auth_token, prompt):
                 token=hf_auth_token,
             )
 
-        from transformers import CLIPTextModel
+            from transformers import CLIPTextModel
 
-        model = CLIPTextModel.from_pretrained(
-            hf_model_name,
-            subfolder=hf_subfolder,
-            token=hf_auth_token,
-        )
-        text_input = tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
+            model = CLIPTextModel.from_pretrained(
+                hf_model_name,
+                subfolder=hf_subfolder,
+                token=hf_auth_token,
+            )
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
     example_input = text_input.input_ids
 
     if "google/t5" in hf_model_name:
diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py
index 2601ee904..bbefd0500 100644
--- a/models/turbine_models/tests/sd_test.py
+++ b/models/turbine_models/tests/sd_test.py
@@ -21,9 +21,11 @@
 import torch
 import unittest
 import os
+import copy
+import platform
 
 
-arguments = {
+default_arguments = {
     "hf_auth_token": None,
     "hf_model_name": "CompVis/stable-diffusion-v1-4",
     "scheduler_id": "PNDM",
@@ -42,6 +44,7 @@
     "prompt": "a photograph of an astronaut riding a horse",
     "in_channels": 4,
 }
+UPLOAD_IR = os.environ.get("TURBINE_TANK_ACTION", "not_upload") == "upload"
 
 
 unet_model = unet.UnetModel(
@@ -60,20 +63,21 @@
     # This is a public model, so no auth required
     "CompVis/stable-diffusion-v1-4",
 )
-scheduler = schedulers_dict[arguments["scheduler_id"]]
+scheduler = schedulers_dict[default_arguments["scheduler_id"]]
 scheduler_module = schedulers.Scheduler(
-    "CompVis/stable-diffusion-v1-4", arguments["num_inference_steps"], scheduler
+    "CompVis/stable-diffusion-v1-4", default_arguments["num_inference_steps"], scheduler
 )
+
 
 # TODO: this is a mess, don't share args across tests, create a copy for each test
 class StableDiffusionTest(unittest.TestCase):
     def testExportT5Model(self):
-        arguments["hf_model_name"] = "google/t5-v1_1-small"
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
-        safe_prefix = "google_t5-v1_1-small"
+        current_args = copy.deepcopy(default_arguments)
+        current_args["hf_model_name"] = "google/t5-v1_1-small"
+        safe_prefix = "t5_v1_1_small"
         with self.assertRaises(SystemExit) as cm:
             clip.export_clip_model(
-                hf_model_name=arguments["hf_model_name"],
+                hf_model_name=current_args["hf_model_name"],
                 hf_auth_token=None,
                 compile_to="vmfb",
                 external_weights=None,
@@ -81,32 +85,37 @@ def testExportT5Model(self):
                 device="cpu",
                 target_triple=None,
                 max_alloc=None,
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments["vmfb_path"] = safe_prefix + ".vmfb"
+        current_args["vmfb_path"] = safe_prefix + "_clip.vmfb"
         turbine = clip_runner.run_clip(
-            arguments["device"],
-            arguments["prompt"],
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
+            current_args["device"],
+            current_args["prompt"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
             None,
         )
         torch_output = clip_runner.run_torch_clip(
-            arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"]
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["prompt"],
         )
         err = utils.largest_error(torch_output, turbine[0])
-        assert err < 9e-5
-        os.remove(safe_prefix + ".vmfb")
+        assert err < 9e-4
+        if platform.system() != "Windows":
+            os.remove(current_args["external_weight_path"])
+            os.remove(current_args["vmfb_path"])
+        del current_args
 
     def testExportClipVitLarge14(self):
-        arguments["hf_model_name"] = "openai/clip-vit-large-patch14"
-        safe_prefix = "openai_clip_vit_large_patch14"
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
+        current_args = copy.deepcopy(default_arguments)
+        current_args["hf_model_name"] = "openai/clip-vit-large-patch14"
+        safe_prefix = "clip_vit_large_patch14"
         with self.assertRaises(SystemExit) as cm:
             clip.export_clip_model(
-                hf_model_name=arguments["hf_model_name"],
+                hf_model_name=current_args["hf_model_name"],
                 hf_auth_token=None,
                 compile_to="vmfb",
                 external_weights="safetensors",
@@ -114,30 +123,33 @@ def testExportClipVitLarge14(self):
                 device="cpu",
                 target_triple=None,
                 max_alloc=None,
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments["external_weight_path"] = safe_prefix + ".safetensors"
-        arguments["vmfb_path"] = safe_prefix + ".vmfb"
+        current_args["external_weight_path"] = safe_prefix + ".safetensors"
+        current_args["vmfb_path"] = safe_prefix + "_clip.vmfb"
         turbine = clip_runner.run_clip(
-            arguments["device"],
-            arguments["prompt"],
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
-            arguments["external_weight_path"],
+            current_args["device"],
+            current_args["prompt"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["external_weight_path"],
         )
         torch_output = clip_runner.run_torch_clip(
-            arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"]
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["prompt"],
         )
         err = utils.largest_error(torch_output, turbine[0])
         assert err < 9e-5
-        os.remove(safe_prefix + ".safetensors")
-        os.remove(safe_prefix + ".vmfb")
+        if platform.system() != "Windows":
+            os.remove(current_args["external_weight_path"])
+            os.remove(current_args["vmfb_path"])
 
     def testExportClipModel(self):
-        arguments["hf_model_name"] = "CompVis/stable-diffusion-v1-4"
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
+        current_args = copy.deepcopy(default_arguments)
+        current_args["hf_model_name"] = "CompVis/stable-diffusion-v1-4"
         with self.assertRaises(SystemExit) as cm:
             clip.export_clip_model(
                 # This is a public model, so no auth required
@@ -147,21 +159,23 @@ def testExportClipModel(self):
                 "safetensors",
                 "stable_diffusion_v1_4_clip.safetensors",
                 "cpu",
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors"
-        arguments["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb"
+        current_args["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors"
+        current_args["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb"
         turbine = clip_runner.run_clip(
-            arguments["device"],
-            arguments["prompt"],
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
-            arguments["external_weight_path"],
+            current_args["device"],
+            current_args["prompt"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["external_weight_path"],
         )
         torch_output = clip_runner.run_torch_clip(
-            arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"]
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["prompt"],
         )
         err = utils.largest_error(torch_output, turbine[0])
         assert err < 9e-5
@@ -169,48 +183,48 @@ def testExportClipModel(self):
         os.remove("stable_diffusion_v1_4_clip.vmfb")
 
     def testExportUnetModel(self):
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
+        current_args = copy.deepcopy(default_arguments)
         with self.assertRaises(SystemExit) as cm:
             unet.export_unet_model(
                 unet_model,
                 # This is a public model, so no auth required
                 "CompVis/stable-diffusion-v1-4",
-                arguments["batch_size"],
-                arguments["height"],
-                arguments["width"],
+                current_args["batch_size"],
+                current_args["height"],
+                current_args["width"],
                 None,
                 "vmfb",
                 "safetensors",
                 "stable_diffusion_v1_4_unet.safetensors",
                 "cpu",
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors"
-        arguments["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb"
+        current_args["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors"
+        current_args["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb"
         sample = torch.rand(
-            arguments["batch_size"],
-            arguments["in_channels"],
-            arguments["height"] // 8,
-            arguments["width"] // 8,
+            current_args["batch_size"],
+            current_args["in_channels"],
+            current_args["height"] // 8,
+            current_args["width"] // 8,
             dtype=torch.float32,
         )
         timestep = torch.zeros(1, dtype=torch.float32)
         encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32)
         turbine = unet_runner.run_unet(
-            arguments["device"],
+            current_args["device"],
             sample,
             timestep,
             encoder_hidden_states,
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
-            arguments["external_weight_path"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["external_weight_path"],
         )
         torch_output = unet_runner.run_torch_unet(
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
             sample,
             timestep,
             encoder_hidden_states,
@@ -221,44 +235,44 @@ def testExportUnetModel(self):
         os.remove("stable_diffusion_v1_4_unet.vmfb")
 
     def testExportVaeModelDecode(self):
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
+        current_args = copy.deepcopy(default_arguments)
         with self.assertRaises(SystemExit) as cm:
             vae.export_vae_model(
                 vae_model,
                 # This is a public model, so no auth required
                 "CompVis/stable-diffusion-v1-4",
-                arguments["batch_size"],
-                arguments["height"],
-                arguments["width"],
+                current_args["batch_size"],
+                current_args["height"],
+                current_args["width"],
                 None,
                 "vmfb",
                 "safetensors",
                 "stable_diffusion_v1_4_vae.safetensors",
                 "cpu",
                 variant="decode",
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
-        arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb"
+        current_args["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
+        current_args["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb"
         example_input = torch.rand(
-            arguments["batch_size"],
+            current_args["batch_size"],
             4,
-            arguments["height"] // 8,
-            arguments["width"] // 8,
+            current_args["height"] // 8,
+            current_args["width"] // 8,
             dtype=torch.float32,
         )
         turbine = vae_runner.run_vae(
-            arguments["device"],
+            current_args["device"],
             example_input,
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
-            arguments["external_weight_path"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["external_weight_path"],
         )
         torch_output = vae_runner.run_torch_vae(
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
             "decode",
             example_input,
         )
@@ -270,44 +284,44 @@ def testExportVaeModelDecode(self):
     # https://github.com/nod-ai/SHARK-Turbine/issues/536
     @unittest.expectedFailure
     def testExportVaeModelEncode(self):
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
+        current_args = copy.deepcopy(default_arguments)
         with self.assertRaises(SystemExit) as cm:
             vae.export_vae_model(
                 vae_model,
                 # This is a public model, so no auth required
                 "CompVis/stable-diffusion-v1-4",
-                arguments["batch_size"],
-                arguments["height"],
-                arguments["width"],
+                current_args["batch_size"],
+                current_args["height"],
+                current_args["width"],
                 None,
                 "vmfb",
                 "safetensors",
                 "stable_diffusion_v1_4_vae.safetensors",
                 "cpu",
                 variant="encode",
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
-        arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb"
+        current_args["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
+        current_args["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb"
         example_input = torch.rand(
-            arguments["batch_size"],
+            current_args["batch_size"],
             3,
-            arguments["height"],
-            arguments["width"],
+            current_args["height"],
+            current_args["width"],
             dtype=torch.float32,
         )
         turbine = vae_runner.run_vae(
-            arguments["device"],
+            current_args["device"],
             example_input,
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
-            arguments["external_weight_path"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["external_weight_path"],
         )
         torch_output = vae_runner.run_torch_vae(
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
             "encode",
             example_input,
         )
@@ -318,48 +332,47 @@ def testExportVaeModelEncode(self):
 
     @unittest.expectedFailure
     def testExportPNDMScheduler(self):
-        upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
+        current_args = copy.deepcopy(default_arguments)
+        safe_name = "stable_diffusion_v1_4_scheduler"
         with self.assertRaises(SystemExit) as cm:
             schedulers.export_scheduler(
                 scheduler_module,
                 # This is a public model, so no auth required
                 "CompVis/stable-diffusion-v1-4",
-                arguments["batch_size"],
-                arguments["height"],
-                arguments["width"],
+                current_args["batch_size"],
+                current_args["height"],
+                current_args["width"],
                 None,
                 "vmfb",
                 "safetensors",
                 "stable_diffusion_v1_4_scheduler.safetensors",
                 "cpu",
-                upload_ir=upload_ir_var == "upload",
+                upload_ir=UPLOAD_IR,
             )
         self.assertEqual(cm.exception.code, None)
-        arguments[
-            "external_weight_path"
-        ] = "stable_diffusion_v1_4_scheduler.safetensors"
-        arguments["vmfb_path"] = "stable_diffusion_v1_4_scheduler.vmfb"
+        current_args["external_weight_path"] = safe_name + ".safetensors"
+        current_args["vmfb_path"] = safe_name + ".vmfb"
         sample = torch.rand(
-            arguments["batch_size"],
+            current_args["batch_size"],
             4,
-            arguments["height"] // 8,
-            arguments["width"] // 8,
+            current_args["height"] // 8,
+            current_args["width"] // 8,
             dtype=torch.float32,
         )
         encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32)
         turbine = schedulers_runner.run_scheduler(
-            arguments["device"],
+            current_args["device"],
             sample,
             encoder_hidden_states,
-            arguments["vmfb_path"],
-            arguments["hf_model_name"],
-            arguments["hf_auth_token"],
-            arguments["external_weight_path"],
+            current_args["vmfb_path"],
+            current_args["hf_model_name"],
+            current_args["hf_auth_token"],
+            current_args["external_weight_path"],
         )
         torch_output = schedulers_runner.run_torch_scheduler(
-            arguments["hf_model_name"],
+            current_args["hf_model_name"],
             scheduler,
-            arguments["num_inference_steps"],
+            current_args["num_inference_steps"],
             sample,
             encoder_hidden_states,
         )