 from transformers import CLIPTokenizer
 from iree import runtime as ireert
 import torch
+from PIL import Image
 
 parser = argparse.ArgumentParser()
@@ -52,21 +53,54 @@ def run_clip(
 ):
     runner = vmfbRunner(device, vmfb_path, external_weight_path)
 
-    tokenizer = CLIPTokenizer.from_pretrained(
-        hf_model_name,
-        subfolder="tokenizer",
-        token=hf_auth_token,
-    )
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
+    if "google/t5" in hf_model_name:
+        from transformers import T5Tokenizer, T5Model
+
+        tokenizer = T5Tokenizer.from_pretrained(hf_model_name)
+        text_input = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        # TODO: Integrate with HFTransformerBuilder
+    else:
+        if "openai" in hf_model_name:
+            from transformers import CLIPProcessor
+            import requests
+
+            url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            image = Image.open(requests.get(url, stream=True).raw)
+            tokenizer = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+            text_input = tokenizer(
+                text=prompt,
+                images=image,
+                truncation=True,
+                padding=True,
+                return_tensors="pt",
+            )
+        else:
+            hf_subfolder = "tokenizer"
+
+            tokenizer = CLIPTokenizer.from_pretrained(
+                hf_model_name,
+                subfolder=hf_subfolder,
+                token=hf_auth_token,
+            )
+
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
     example_input = text_input.input_ids
     inp = [ireert.asdevicearray(runner.config.device, example_input)]
 
+    if "google/t5" in hf_model_name:
+        inp += [ireert.asdevicearray(runner.config.device, example_input)]
     results = runner.ctx.modules.compiled_clip["main"](*inp)
     return results
@@ -77,13 +111,38 @@ def run_torch_clip(hf_model_name, hf_auth_token, prompt):
         tokenizer = T5Tokenizer.from_pretrained(hf_model_name)
         model = T5Model.from_pretrained(hf_model_name)
+        text_input = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
         # TODO: Integrate with HFTransformerBuilder
     else:
         if hf_model_name == "openai/clip-vit-large-patch14":
             from transformers import CLIPProcessor
+            import requests
+
+            url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            image = Image.open(requests.get(url, stream=True).raw)
 
             tokenizer = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
             hf_subfolder = ""  # CLIPProcessor does not have a subfolder
+            from transformers import CLIPTextModel
+
+            model = CLIPTextModel.from_pretrained(
+                hf_model_name,
+                subfolder=hf_subfolder,
+                token=hf_auth_token,
+            )
+            text_input = tokenizer(
+                text=prompt,
+                images=image,
+                truncation=True,
+                padding=True,
+                return_tensors="pt",
+            )
         else:
             hf_subfolder = "text_encoder"
@@ -93,20 +152,20 @@ def run_torch_clip(hf_model_name, hf_auth_token, prompt):
                 token=hf_auth_token,
             )
 
-    from transformers import CLIPTextModel
+            from transformers import CLIPTextModel
 
-    model = CLIPTextModel.from_pretrained(
-        hf_model_name,
-        subfolder=hf_subfolder,
-        token=hf_auth_token,
-    )
-    text_input = tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    )
+            model = CLIPTextModel.from_pretrained(
+                hf_model_name,
+                subfolder=hf_subfolder,
+                token=hf_auth_token,
+            )
+            text_input = tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
     example_input = text_input.input_ids
 
     if "google/t5" in hf_model_name: