Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions examples/Optimized-LLM-Serving-Example.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,22 @@
# MAGIC %md
# MAGIC ## Prerequisites
# MAGIC * Attach a cluster to the notebook with sufficient memory to load MPT-7B. We recommend a cluster with at least 32 GB of memory.
# MAGIC * (Optional) Install the latest transformers. MPT-7B native support in transformers was added on July 25, 2023. At the time of this notebook release, MPT-7B native support in transformers has not been officially released. For full compatibility of MPT-7B with mlflow, install the latest version from github. Optimized serving will work with older versions of transformers for MPT-7B, but there may be issues with loading the model locally.
# MAGIC
# MAGIC To install the latest version of transformers off github, run:
# MAGIC ```
# MAGIC %pip install git+https://github.com/huggingface/transformers@main
# MAGIC ```
# MAGIC
# MAGIC

# COMMAND ----------

!pip install -U transformers
!pip install -U accelerate
!pip install -U mlflow
dbutils.library.restartPython()

# COMMAND ----------

from transformers import AutoModelForCausalLM, AutoTokenizer

# If you are using the latest version of transformers that has native MPT support, replace the following line with:
# model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', low_cpu_mem_usage=True)

model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', low_cpu_mem_usage=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b', low_cpu_mem_usage=True)

# COMMAND ----------

Expand All @@ -56,8 +51,32 @@

# COMMAND ----------

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema


# Input schema for the llm/v1/completions API: a required prompt string
# plus optional sampling parameters.
input_schema = Schema([
    ColSpec("string", "prompt"),
    ColSpec("double", "temperature", optional=True),
    ColSpec("integer", "max_tokens", optional=True),
    ColSpec("string", "stop", optional=True),  # assumes the inner arrays only contain strings
    ColSpec("integer", "candidate_count", optional=True)
])

# Fixed typo: "ouput_schema" -> "output_schema".
output_schema = Schema([
    ColSpec('string', 'predictions')
])

# Create a model signature from both the input and output schemas.
signature = ModelSignature(inputs=input_schema, outputs=output_schema)
signature

# COMMAND ----------

import mlflow
import numpy as np
mlflow.set_registry_uri('databricks-uc')

with mlflow.start_run():
components = {
Expand All @@ -67,7 +86,8 @@
mlflow.transformers.log_model(
transformers_model=components,
artifact_path="mpt",
registered_model_name="optimized-mpt-7b-example",
#signature=signature,
registered_model_name="mpt-7b",
input_example={"prompt": np.array(["Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is Apache Spark?\n\n### Response:\n"]), "max_tokens": np.array([75]), "temperature": np.array([0.0])},
metadata = {"task": "llm/v1/completions"}
)
Expand All @@ -81,10 +101,10 @@

# COMMAND ----------

endpoint_name = "optimized-mpt-7b-example"
model_name = "optimized-mpt-7b-example"
endpoint_name = "mpt-7b"
model_name = "mpt-7b"
model_version = "1"
served_model_workload_size = "Medium"
served_model_workload_size = "Small"
served_model_scale_to_zero = False

API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
Expand Down
222 changes: 222 additions & 0 deletions examples/Optimized-Llama-Serving-Example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Optimized Llama2 Serving Example
# MAGIC
# MAGIC Optimized LLM Serving enables you to take state of the art OSS LLMs and deploy them on Databricks Model Serving with automatic optimizations for improved latency and throughput on GPUs. Currently, we support optimizations for Llama2 and Mosaic MPT class of models and will continue introducing more models with optimization support.
# MAGIC
# MAGIC This example walks through:
# MAGIC
# MAGIC 1. Downloading the model from huggingface transformers
# MAGIC 2. Logging the model in an optimized serving supported format into the Databricks Unity Catalog or Workspace Registry
# MAGIC 3. Enabling optimized serving on the model

# COMMAND ----------

# MAGIC %md
# MAGIC ## Prerequisites
# MAGIC - Attach a cluster with sufficient memory to the notebook
# MAGIC - Make sure to have MLflow version 2.7.0 or later installed
# MAGIC - Make sure to enable "Models in UC", especially when working with models larger than 7B in size
# MAGIC

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Log the model for Optimized LLM Serving

# COMMAND ----------

# Update/Install required dependencies
# mlflow >= 2.7.0 is required for optimized LLM serving (see Prerequisites).
!pip install -U mlflow
!pip install -U transformers
!pip install -U accelerate
# Restart the Python process so the freshly installed packages are picked up.
dbutils.library.restartPython()

# COMMAND ----------

import huggingface_hub
# Meta's Llama 2 checkpoints are gated on the Hugging Face Hub, so the
# download below presumably requires an authenticated session — confirm
# your account has access to meta-llama/Llama-2-7b-chat-hf.
# Skip this if you are already logged in to Hugging Face.
#huggingface_hub.login()

# COMMAND ----------

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Download the Llama-2-7B chat checkpoint and its tokenizer from the
# Hugging Face Hub. bfloat16 halves the in-memory footprint vs. float32.
_MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# COMMAND ----------

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema
import numpy as np

# Model signature for the llm/v1/completions API: a required prompt string
# plus optional sampling parameters.
input_schema = Schema(
    [
        ColSpec("string", "prompt"),
        ColSpec("double", "temperature", optional=True),
        ColSpec("integer", "max_tokens", optional=True),
        ColSpec("string", "stop", optional=True),
        ColSpec("integer", "candidate_count", optional=True),
    ]
)

output_schema = Schema([ColSpec("string", "predictions")])

signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Example request; MLflow records this alongside the model for validation
# and documentation.
_example_prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n"
    "What is Apache Spark?\n\n"
    "### Response:\n"
)
input_example = {
    "prompt": np.array([_example_prompt]),
    "max_tokens": np.array([75]),
    "temperature": np.array([0.0]),
}

# COMMAND ----------

# MAGIC %md
# MAGIC To enable optimized serving, when logging the model, include the extra metadata dictionary when calling `mlflow.transformers.log_model` as shown below:
# MAGIC
# MAGIC ```
# MAGIC metadata = {"task": "llm/v1/completions"}
# MAGIC ```
# MAGIC This specifies the API signature used for the model serving endpoint.
# MAGIC

# COMMAND ----------

import mlflow

# Register into Unity Catalog. If you are not using Models in UC, comment
# out set_registry_uri and pass a plain model name instead of the
# three-level namespace.
mlflow.set_registry_uri('databricks-uc')
CATALOG = "ml"
SCHEMA = "models"
registered_model_name = f"{CATALOG}.{SCHEMA}.llama7b"

# Log the model and tokenizer together in one MLflow run; the
# llm/v1/completions metadata is what enables optimized serving.
with mlflow.start_run():
    components = {
        "model": model,
        "tokenizer": tokenizer,
    }
    mlflow.transformers.log_model(
        transformers_model=components,
        task="text-generation",
        artifact_path="model",
        registered_model_name=registered_model_name,
        signature=signature,
        input_example=input_example,
        metadata={"task": "llm/v1/completions"},
    )

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Configure and create your model serving GPU endpoint
# MAGIC
# MAGIC Modify the cell below to change the endpoint name. After calling the create endpoint API, the logged Llama2 model will automatically be deployed with optimized LLM Serving!

# COMMAND ----------

# Set the name of the MLflow endpoint
endpoint_name = "llama7b"

# Name of the registered MLflow model
model_name = registered_model_name

# Version of the registered model to deploy. NOTE: hardcoded to 1, not the
# latest — bump this after re-logging the model.
model_version = 1

# Specify the type of compute (CPU, GPU_SMALL, GPU_MEDIUM, etc.)
workload_type = "GPU_MEDIUM"

# Specify the compute scale-out size (Small, Medium, Large, etc.)
workload_size = "Small"

# Specify scale-to-zero (only supported for CPU endpoints)
scale_to_zero = False

# Get the API endpoint and token for the current notebook context
API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
API_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# COMMAND ----------

import requests
import json

# Endpoint-creation payload: one served model using the configuration
# chosen in the previous cell.
data = {
    "name": endpoint_name,
    "config": {
        "served_models": [
            {
                "model_name": model_name,
                "model_version": model_version,
                "workload_size": workload_size,
                "scale_to_zero_enabled": scale_to_zero,
                "workload_type": workload_type,
            }
        ]
    },
}

# Fixed header typo: "Context-Type: text/json" -> "Content-Type: application/json".
# (requests sets Content-Type itself when json= is used, but the explicit
# header should at least be spelled correctly.)
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_TOKEN}"}

response = requests.post(url=f"{API_ROOT}/api/2.0/serving-endpoints", json=data, headers=headers)

print(json.dumps(response.json(), indent=4))

# COMMAND ----------

# MAGIC %md
# MAGIC ## View your endpoint
# MAGIC To see more information about your endpoint, go to the "Serving" section on the left navigation bar and search for your endpoint name.

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Query your endpoint!
# MAGIC
# MAGIC Once your endpoint is ready, you can query it by making an API request. Depending on the model size and complexity, it can take up to 30 minutes or more for the endpoint to get ready.

# COMMAND ----------

# Completions request: the prompt goes under "inputs" and sampling
# parameters under "params", per the llm/v1/completions API signature.
data = {
    "inputs": {
        "prompt": [
            "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is Apache Spark?\n\n### Response:\n"
        ]
    },
    "params": {
        "max_tokens": 100,
        "temperature": 0.0
    }
}

# Fixed header typo: "Context-Type: text/json" -> "Content-Type: application/json".
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_TOKEN}"
}

response = requests.post(
    url=f"{API_ROOT}/serving-endpoints/{endpoint_name}/invocations",
    json=data,
    headers=headers
)

# indent=4 for readability, consistent with the endpoint-creation cell.
print(json.dumps(response.json(), indent=4))
Loading