Merge branch 'vllm-project:main' into main

Alexei-V-Ivanov-AMD · May 15, 2024 · e54d228
2 parents a80119d + a5675d3
commit e54d228
Show file tree

Hide file tree

Showing 24 changed files with 882 additions and 1,073 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,19 +1,21 @@
 import contextlib
 import gc
 import os
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import pytest
 import torch
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoProcessor,
-                          LlavaForConditionalGeneration)
+from transformers import (AutoModelForCausalLM, AutoProcessor, AutoTokenizer,
+                          LlavaConfig, LlavaForConditionalGeneration)
 
 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
 from vllm.distributed import destroy_model_parallel
+from vllm.logger import init_logger
 from vllm.sequence import MultiModalData
-from vllm.transformers_utils.tokenizer import get_tokenizer
+
+logger = init_logger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
@@ -129,9 +131,7 @@ def example_long_prompts() -> List[str]:
     "float": torch.float,
 }
 
-_VISION_LANGUAGE_MODELS = {
-    "llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,
-}
+AutoModelForCausalLM.register(LlavaConfig, LlavaForConditionalGeneration)
 
 _EMBEDDING_MODELS = [
     "intfloat/e5-mistral-7b-instruct",
@@ -143,23 +143,14 @@ class HfRunner:
     def __init__(
         self,
         model_name: str,
-        tokenizer_name: Optional[str] = None,
         dtype: str = "half",
     ) -> None:
         assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
         torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+
         self.model_name = model_name
-        if model_name in _VISION_LANGUAGE_MODELS:
-            self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            ).cuda()
-            self.processor = AutoProcessor.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-            )
-        elif model_name in _EMBEDDING_MODELS:
+
+        if model_name in _EMBEDDING_MODELS:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
             self.model = SentenceTransformer(
@@ -172,10 +163,24 @@ def __init__(
                 torch_dtype=torch_dtype,
                 trust_remote_code=True,
             ).cuda()
-            self.processor = None
-        if tokenizer_name is None:
-            tokenizer_name = model_name
-        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )
+
+        try:
+            self.processor = AutoProcessor.from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+            )
+        except Exception:
+            logger.warning(
+                "Unable to auto-load processor from HuggingFace for "
+                "model %s. Using tokenizer instead.", model_name)
+            self.processor = self.tokenizer
 
     def generate(
         self,
@@ -187,19 +192,19 @@ def generate(
         if images:
             assert len(prompts) == len(images)
         for i, prompt in enumerate(prompts):
-            if self.model_name not in _VISION_LANGUAGE_MODELS:
-                input_ids = self.tokenizer(prompt,
-                                           return_tensors="pt").input_ids
-                inputs = {"input_ids": input_ids.cuda()}
-            else:
-                image = images[i] if images else None
-                inputs = self.processor(text=prompt,
-                                        images=image,
-                                        return_tensors="pt")
-                inputs = {
-                    key: value.cuda() if value is not None else None
-                    for key, value in inputs.items()
-                }
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+            inputs = {
+                key: value.cuda() if value is not None else None
+                for key, value in inputs.items()
+            }
+
             output_ids = self.model.generate(
                 **inputs,
                 use_cache=True,

diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
@@ -12,7 +12,7 @@
     # "Deci/DeciLM-7b",  # Broken
     # "tiiuae/falcon-7b",  # Broken
     "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
+    "mosaicml/mpt-7b",
     # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]
 

diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
@@ -25,18 +25,18 @@
         'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
         'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
         'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-        'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
         'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
+        'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
     ],
     "meta-llama/Meta-Llama-3-8B-Instruct": [
         'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
         'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
         'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
         'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-        'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
+        'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
         'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
         'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
         'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'