Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions examples/Optimized-LLM-Serving-Example.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,22 @@
# MAGIC %md
# MAGIC ## Prerequisites
# MAGIC * Attach a cluster to the notebook with sufficient memory to load MPT-7B. We recommend a cluster with at least 32 GB of memory.
# MAGIC * (Optional) Install the latest transformers. MPT-7B native support in transformers was added on July 25, 2023. At the time of this notebook release, MPT-7B native support in transformers has not been officially released. For full compatibility of MPT-7B with mlflow, install the latest version from github. Optimized serving will work with older versions of transformers for MPT-7B, but there may be issues with loading the model locally.
# MAGIC
# MAGIC To install the latest version of transformers off github, run:
# MAGIC ```
# MAGIC %pip install git+https://github.com/huggingface/transformers@main
# MAGIC ```
# MAGIC
# MAGIC

# COMMAND ----------

!pip install -U transformers
!pip install -U accelerate
!pip install -U mlflow
dbutils.library.restartPython()

# COMMAND ----------

from transformers import AutoModelForCausalLM, AutoTokenizer

# If you are using the latest version of transformers that has native MPT support, replace the following line with:
# model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', low_cpu_mem_usage=True)

model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', low_cpu_mem_usage=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', low_cpu_mem_usage=True)

# COMMAND ----------

Expand All @@ -56,8 +51,32 @@

# COMMAND ----------

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema


# Input schema for the llm/v1/completions API: a required prompt string
# plus optional sampling parameters.
input_schema = Schema([
    ColSpec("string", "prompt"),
    ColSpec("double", "temperature", optional=True),
    ColSpec("integer", "max_tokens", optional=True),
    ColSpec("string", "stop", optional=True),  # assumes the inner arrays only contain strings
    ColSpec("integer", "candidate_count", optional=True)
])

# Fixed typo: "ouput_schema" -> "output_schema".
output_schema = Schema([
    ColSpec('string', 'predictions')
])

# Create a model signature from both the input and output schemas.
signature = ModelSignature(inputs=input_schema, outputs=output_schema)
signature

# COMMAND ----------

import mlflow
import numpy as np
mlflow.set_registry_uri('databricks-uc')

with mlflow.start_run():
components = {
Expand All @@ -67,7 +86,8 @@
mlflow.transformers.log_model(
transformers_model=components,
artifact_path="mpt",
registered_model_name="optimized-mpt-7b-example",
#signature=signature,
registered_model_name="mpt-7b",
input_example={"prompt": np.array(["Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is Apache Spark?\n\n### Response:\n"]), "max_tokens": np.array([75]), "temperature": np.array([0.0])},
metadata = {"task": "llm/v1/completions"}
)
Expand All @@ -81,10 +101,10 @@

# COMMAND ----------

endpoint_name = "optimized-mpt-7b-example"
model_name = "optimized-mpt-7b-example"
endpoint_name = "mpt-7b"
model_name = "mpt-7b"
model_version = "1"
served_model_workload_size = "Medium"
served_model_workload_size = "Small"
served_model_scale_to_zero = False

API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
Expand Down
222 changes: 222 additions & 0 deletions examples/Optimized-Llama-Serving-Example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Optimized Llama2 Serving Example
# MAGIC
# MAGIC Optimized LLM Serving enables you to take state of the art OSS LLMs and deploy them on Databricks Model Serving with automatic optimizations for improved latency and throughput on GPUs. Currently, we support optimizations for Llama2 and Mosaic MPT class of models and will continue introducing more models with optimization support.
# MAGIC
# MAGIC This example walks through:
# MAGIC
# MAGIC 1. Downloading the model from huggingface transformers
# MAGIC 2. Logging the model in an optimized serving supported format into the Databricks Unity Catalog or Workspace Registry
# MAGIC 3. Enabling optimized serving on the model

# COMMAND ----------

# MAGIC %md
# MAGIC ## Prerequisites
# MAGIC - Attach a cluster with sufficient memory to the notebook
# MAGIC - Make sure to have MLflow version 2.7.0 or later installed
# MAGIC - Make sure to enable "Models in UC", especially when working with models larger than 7B in size
# MAGIC

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Log the model for Optimized LLM Serving

# COMMAND ----------

# Update/Install required dependencies
# mlflow >= 2.7.0 is required for optimized LLM serving (see Prerequisites).
!pip install -U mlflow
!pip install -U transformers
!pip install -U accelerate
# Restart the Python process so the freshly installed packages are picked up.
dbutils.library.restartPython()

# COMMAND ----------

import huggingface_hub
# Meta's Llama 2 checkpoints are gated on the Hugging Face Hub, so the
# download below presumably requires an authenticated session — confirm
# your account has access to meta-llama/Llama-2-7b-chat-hf.
# Skip this if you are already logged in to Hugging Face.
#huggingface_hub.login()

# COMMAND ----------

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Download the Llama-2-7B chat checkpoint and its tokenizer from the
# Hugging Face Hub. bfloat16 halves the in-memory footprint vs. float32.
_MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# COMMAND ----------

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema
import numpy as np

# Model signature for the llm/v1/completions API: a required prompt string
# plus optional sampling parameters.
input_schema = Schema(
    [
        ColSpec("string", "prompt"),
        ColSpec("double", "temperature", optional=True),
        ColSpec("integer", "max_tokens", optional=True),
        ColSpec("string", "stop", optional=True),
        ColSpec("integer", "candidate_count", optional=True),
    ]
)

output_schema = Schema([ColSpec("string", "predictions")])

signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Example request; MLflow records this alongside the model for validation
# and documentation.
_example_prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n"
    "What is Apache Spark?\n\n"
    "### Response:\n"
)
input_example = {
    "prompt": np.array([_example_prompt]),
    "max_tokens": np.array([75]),
    "temperature": np.array([0.0]),
}

# COMMAND ----------

# MAGIC %md
# MAGIC To enable optimized serving, when logging the model, include the extra metadata dictionary when calling `mlflow.transformers.log_model` as shown below:
# MAGIC
# MAGIC ```
# MAGIC metadata = {"task": "llm/v1/completions"}
# MAGIC ```
# MAGIC This specifies the API signature used for the model serving endpoint.
# MAGIC

# COMMAND ----------

import mlflow

# Register into Unity Catalog. If you are not using Models in UC, comment
# out set_registry_uri and pass a plain model name instead of the
# three-level namespace.
mlflow.set_registry_uri('databricks-uc')
CATALOG = "ml"
SCHEMA = "models"
registered_model_name = f"{CATALOG}.{SCHEMA}.llama7b"

# Log the model and tokenizer together in one MLflow run; the
# llm/v1/completions metadata is what enables optimized serving.
with mlflow.start_run():
    components = {
        "model": model,
        "tokenizer": tokenizer,
    }
    mlflow.transformers.log_model(
        transformers_model=components,
        task="text-generation",
        artifact_path="model",
        registered_model_name=registered_model_name,
        signature=signature,
        input_example=input_example,
        metadata={"task": "llm/v1/completions"},
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Configure and create your model serving GPU endpoint
# MAGIC
# MAGIC Modify the cell below to change the endpoint name. After calling the create endpoint API, the logged Llama2 model will automatically be deployed with optimized LLM Serving!

# COMMAND ----------

# Set the name of the MLflow endpoint
endpoint_name = "llama7b"

# Name of the registered MLflow model
model_name = registered_model_name

# Version of the registered model to deploy. NOTE: hardcoded to 1, not the
# latest — bump this after re-logging the model.
model_version = 1

# Specify the type of compute (CPU, GPU_SMALL, GPU_MEDIUM, etc.)
workload_type = "GPU_MEDIUM"

# Specify the compute scale-out size (Small, Medium, Large, etc.)
workload_size = "Small"

# Specify scale-to-zero (only supported for CPU endpoints)
scale_to_zero = False

# Get the API endpoint and token for the current notebook context
API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
API_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# COMMAND ----------

import requests
import json

# Endpoint-creation payload: one served model using the configuration
# chosen in the previous cell.
data = {
    "name": endpoint_name,
    "config": {
        "served_models": [
            {
                "model_name": model_name,
                "model_version": model_version,
                "workload_size": workload_size,
                "scale_to_zero_enabled": scale_to_zero,
                "workload_type": workload_type,
            }
        ]
    },
}

# Fixed header typo: "Context-Type: text/json" -> "Content-Type: application/json".
# (requests sets Content-Type itself when json= is used, but the explicit
# header should at least be spelled correctly.)
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_TOKEN}"}

response = requests.post(url=f"{API_ROOT}/api/2.0/serving-endpoints", json=data, headers=headers)

print(json.dumps(response.json(), indent=4))

# COMMAND ----------

# MAGIC %md
# MAGIC ## View your endpoint
# MAGIC To see more information about your endpoint, go to the "Serving" section on the left navigation bar and search for your endpoint name.

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Query your endpoint!
# MAGIC
# MAGIC Once your endpoint is ready, you can query it by making an API request. Depending on the model size and complexity, it can take up to 30 minutes or more for the endpoint to get ready.

# COMMAND ----------

# Completions request: the prompt goes under "inputs" and sampling
# parameters under "params", per the llm/v1/completions API signature.
data = {
    "inputs": {
        "prompt": [
            "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is Apache Spark?\n\n### Response:\n"
        ]
    },
    "params": {
        "max_tokens": 100,
        "temperature": 0.0
    }
}

# Fixed header typo: "Context-Type: text/json" -> "Content-Type: application/json".
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_TOKEN}"
}

response = requests.post(
    url=f"{API_ROOT}/serving-endpoints/{endpoint_name}/invocations",
    json=data,
    headers=headers
)

# indent=4 for readability, consistent with the endpoint-creation cell.
print(json.dumps(response.json(), indent=4))
Loading