diff --git a/README.md b/README.md index ff67e5252..0e0f2835f 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ See Trusses for popular models including: -* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-70b-chat)) +* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-70b-chat)) * 🎨 [Stable Diffusion XL](https://github.com/basetenlabs/truss-examples/tree/main/stable-diffusion-xl-1.0) * 🗣 [Whisper](https://github.com/basetenlabs/truss-examples/tree/main/whisper-truss) diff --git a/bin/generate_truss_examples.py b/bin/generate_truss_examples.py new file mode 100644 index 000000000..583e61bf8 --- /dev/null +++ b/bin/generate_truss_examples.py @@ -0,0 +1,299 @@ +""" +Script to take the Truss examples in https://github.com/basetenlabs/truss-examples, +and generate documentation. + +Usage: +``` +$ poetry run python bin/generate_truss_examples.py +``` +""" +import enum +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path +from typing import List, Optional, Tuple + +import yaml + +DOC_CONFIGURATION_FILE = "doc.yaml" +TRUSS_EXAMPLES_REPO = "https://github.com/basetenlabs/truss-examples" +DESTINATION_DIR = "truss-examples" +MINT_CONFIG_PATH = "docs/mint.json" + + +class FileType(enum.Enum): + YAML = "yaml" + PYTHON = "python" + + +def clone_repo(): + """ + If the destination directory exists, remove it. + Then, clone the given repo into the specified directory. + """ + if Path(DESTINATION_DIR).exists(): + shutil.rmtree(DESTINATION_DIR) + + try: + subprocess.run( + ["git", "clone", TRUSS_EXAMPLES_REPO, DESTINATION_DIR], check=True + ) + print(f"Successfully cloned {TRUSS_EXAMPLES_REPO} to {DESTINATION_DIR}") + except subprocess.CalledProcessError as e: + print(f"Error cloning the repo: {e}") + sys.exit(1) + + +def fetch_file_contents(path: str): + with open(path, "r") as f: + return f.read() + + +def _fetch_example_dirs(root_dir: str) -> List[str]: + """ + Walk through the directory structure from the root directory and + find all directories that have the specified file in it. + """ + dirs_with_file = [] + + for dirpath, _, filenames in os.walk(root_dir): + if DOC_CONFIGURATION_FILE in filenames: + dirs_with_file.append(dirpath) + + return dirs_with_file + + +def _get_example_destination(truss_directory: str) -> Path: + """ + Get the destination directory for the example. 
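+
+    For instance, the Truss at "truss-examples/3_LLMs/llm" maps to the
+    documentation page "docs/examples/3_LLMs/llm.mdx".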
+ """ + original_path = Path(truss_directory) + folder, example = original_path.parts[1:] + example_file = f"{example}.mdx" + return Path("docs/examples") / folder / example_file + + +def _get_file_type(file_path: str) -> FileType: + extension = Path(file_path).suffix + if extension == ".yaml": + return FileType.YAML + + if extension == ".py": + return FileType.PYTHON + + raise ValueError(f"Unknown file type: {extension}") + + +class ContentBlock: + def formatted_content(self) -> str: + raise NotImplementedError + + +class CodeBlock(ContentBlock): + def __init__(self, file_type: FileType, file_path: str): + self.file_type = file_type + self.file_path = file_path + self.content = "" + + def formatted_content(self) -> str: + """ + Outputs code blocks in the format: + + ```python main.py + def main(): + ... + ``` + """ + return f"\n```{self.file_type.value} {self.file_path}\n{self.content}```" + + +class MarkdownBlock(ContentBlock): + def __init__(self, content: str): + self.content = content + + def formatted_content(self) -> str: + # Remove the first comment and space character, such that + # "# Hello" becomes "Hello + return self.content.strip()[2:] + + +class MarkdownExtractor: + """ + Class that supports ingesting a code file line-by-line, and produces a formatted + mdx file. + """ + + def __init__(self, file_type: FileType, file_path: str): + self.file_type = file_type + self.file_path = file_path + + self.blocks: List[ContentBlock] = [] + self.current_code_block: Optional[CodeBlock] = None + + def ingest(self, line: str): + """ + For each line, check that it is a comment by the presence of "#". + If it is a comment, append it to the blocks. + + If it is not a comment, either append to the current code block, or + create a new code block if this isn't one. + + When this is finished, we can then very easily produce the mdx file. + """ + stripped_line = line.strip() + + # Case of Markdown line + if stripped_line.startswith("#"): + self.current_code_block = None + self.blocks.append(MarkdownBlock(line)) + else: + if self.current_code_block is None: + self.current_code_block = CodeBlock(self.file_type, self.file_path) + self.blocks.append(self.current_code_block) + self.current_code_block.content += line + "\n" + + def _formatted_request_example(self) -> str: + """ + A key part of the mdx file is that each has a block at the + bottom the file. This generates that for the given file by appending all the + CodeBlocks together. 
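+
+        For example, a model.py that is split into several CodeBlocks by markdown
+        comments is rendered as one fenced "python model/model.py" block containing
+        the concatenated code, with the markdown comment lines stripped out.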
+ """ + code_blocks = [block for block in self.blocks if isinstance(block, CodeBlock)] + code_content = "".join([code_block.content for code_block in code_blocks]) + + return f"""```{self.file_type.value} {self.file_path}\n{code_content}```""" + + def mdx_content(self) -> Tuple[str, str]: + full_content = "\n".join([block.formatted_content() for block in self.blocks]) + + return ( + full_content + "\n", + self._formatted_request_example(), + ) + + +def _extract_mdx_content_and_code(full_file_path: str, path: str) -> Tuple[str, str]: + file_content = fetch_file_contents(full_file_path) + file_type = _get_file_type(path) + extractor = MarkdownExtractor(file_type, path) + for line in file_content.splitlines(): + extractor.ingest(line) + + return extractor.mdx_content() + + +def _generate_request_example_block(code: str): + return f""" + +{code} + +""" + + +def _generate_truss_example(truss_directory: str): + print("Generating example for: ", truss_directory) + doc_information = yaml.safe_load( + fetch_file_contents(f"{truss_directory}/{DOC_CONFIGURATION_FILE}") + ) + + example_destination = _get_example_destination(truss_directory) + + header = f"""--- +title: "{doc_information["title"]}" +description: "{doc_information["description"]}" +--- +""" + + path_in_examples_repo = "/".join(Path(truss_directory).parts[1:]) + link_to_github = f""" + + + """ + files_to_scrape = doc_information["files"] + + full_content, code_blocks = zip( + *[ + _extract_mdx_content_and_code(Path(truss_directory) / file, file) + for file in files_to_scrape + ] + ) + + full_code_block = "\n".join(code_blocks) + file_content = "\n".join(full_content) + _generate_request_example_block( + full_code_block + ) + example_content = f"""{header}\n{link_to_github}\n{file_content}""" + path_to_example = Path(example_destination) + path_to_example.parent.mkdir(parents=True, exist_ok=True) + + path_to_example.write_text(example_content) + + +def _format_group_name(group_name: str) -> str: + """ + This function takes the parent directory name in, and converts it + into a more human readable format for the table of contents. + + Note that parent directory names are assumed to be in the format: + * 1_introduction/... (becomes "Introduction") + * 2_image_classification/... (becomes "Image classification") + * 3_llms/... (becomes "LLMs") + """ + lowercase_name = " ".join(group_name.split("_")[1:]) + # Capitalize the first letter. We do this rather than + # use .capitalize() or .title() because we want to preserve + # the case of subsequent letters + return lowercase_name[0].upper() + lowercase_name[1:] + + +def update_toc(example_dirs: List[str]): + """ + Update the table of contents in the README.md file. + + Parameters: + example_dirs: List of directories as strings in the form "truss-examples-2/..." 
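+
+    For example, "truss-examples/3_LLMs/llm" is added to the "Examples" navigation
+    group in docs/mint.json as the page "examples/3_LLMs/llm".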
+ """ + + # Exclude the root directory ("truss_examples") from the path + transformed_example_paths = [Path(example).parts[1:] for example in example_dirs] + + mint_config = json.loads(fetch_file_contents(MINT_CONFIG_PATH)) + navigation = mint_config["navigation"] + + examples_section = [item for item in navigation if item["group"] == "Examples"][0] + + # Sort examples by the group name + examples_section["pages"] = [ + f"examples/{example_path[0]}/{example_path[1]}" + for example_path in sorted( + transformed_example_paths, key=lambda example: example[0] + ) + ] + + serialized_mint_config = json.dumps(mint_config, indent=2) + Path(MINT_CONFIG_PATH).write_text(serialized_mint_config) + + +def generate_truss_examples(): + """ + Walk through the Truss examples repo, and for each + of the examples in the repo, generate documentation. + + Finish the process by updating the table of contents. + """ + clone_repo() + + example_dirs = _fetch_example_dirs(DESTINATION_DIR) + for truss_directory in example_dirs: + _generate_truss_example(truss_directory) + + update_toc(example_dirs) + + +if __name__ == "__main__": + generate_truss_examples() diff --git a/docs/examples/1_introduction/getting-started-bert.mdx b/docs/examples/1_introduction/getting-started-bert.mdx new file mode 100644 index 000000000..9273b2538 --- /dev/null +++ b/docs/examples/1_introduction/getting-started-bert.mdx @@ -0,0 +1,166 @@ +--- +title: "Getting Started" +description: "Building your first Truss" +--- + + + + + +In this example, we go through building your first Truss model. We'll be using the HuggingFace transformers +library to build a text classification model that can detect sentiment of text. + +# Step 1: Implementing the model + +Set up imports for this model. In this example, we simply use the HuggingFace transformers library. + +```python model/model.py +from transformers import pipeline + +``` +Every Truss model must implement a `Model` class. This class must have: + * an `__init__` function + * a `load` function + * a `predict` function + +In the `__init__` function, set up any variables that will be used in the `load` and `predict` functions. + +```python model/model.py +class Model: + def __init__(self, **kwargs): + self._model = None + +``` +In the `load` function of the Truss, we implement logic +involved in downloading the model and loading it into memory. +For this Truss example, we define a HuggingFace pipeline, and choose +the `text-classification` task, which uses BERT for text classification under the hood. + +Note that the the load function runs when the + +```python model/model.py + def load(self): + self._model = pipeline("text-classification") + +``` +In the `predict` function of the Truss, we implement logic related +to actual inference. For this example, we just call the HuggingFace pipeline +that we set up in the `load` function. + +```python model/model.py + def predict(self, model_input): + return self._model(model_input) +``` + +# Step 2: Writing the config.yaml + +Each Truss has a config.yaml file where we can configure +options related to the deployment. It's in this file where +we can define requirements, resources, and runtime options like +secrets and environment variables + +### Basic Options + +In this section, we can define basic metadata about the model, +such as the name, and the Python version to build with. 
+ +```yaml config.yaml +model_name: bert +python_version: py310 +model_metadata: + example_model_input: { "text": "Hello my name is {MASK}" } + + +``` +### Set up python requirements + +In this section, we define any pip requirements that +we need to run the model. To run this, we need PyTorch +and Tranformers. + +```yaml config.yaml +requirements: + - torch==2.0.1 + - transformers==4.33.2 + +``` +### Configure the resources needed + +In this section, we can configure resources +needed to deploy this model. Here, we have no need for a GPU +so we leave the accelerator section blank. + +```yaml config.yaml +resources: + accelerator: null + cpu: '1' + memory: 2Gi + use_gpu: false + +``` +### Other config options + +Truss also has provisions for adding other runtime options +packages. In this example, we don't need these, so we leave +this empty for now. + +```yaml config.yaml +secrets: {} +system_packages: [] +environment_variables: {} +external_package_dirs: [] + +``` +# Step 3: Deploying & running inference + +Deploy the model with the following command: + +```bash +$ truss push +``` + +And then you can performance inference with: +``` +$ truss predict -d '"Truss is awesome!"' +``` + + +```python model/model.py +from transformers import pipeline + +class Model: + def __init__(self, **kwargs): + self._model = None + + def load(self): + self._model = pipeline("text-classification") + + def predict(self, model_input): + return self._model(model_input) +``` +```yaml config.yaml +model_name: bert +python_version: py310 +model_metadata: + example_model_input: { "text": "Hello my name is {MASK}" } + + +requirements: + - torch==2.0.1 + - transformers==4.33.2 + +resources: + accelerator: null + cpu: '1' + memory: 2Gi + use_gpu: false + +secrets: {} +system_packages: [] +environment_variables: {} +external_package_dirs: [] + +``` + diff --git a/docs/examples/2_image_classification/clip.mdx b/docs/examples/2_image_classification/clip.mdx new file mode 100644 index 000000000..9f7d104f9 --- /dev/null +++ b/docs/examples/2_image_classification/clip.mdx @@ -0,0 +1,181 @@ +--- +title: "Image Classification" +description: "Deploy a CLIP model to classify images" +--- + + + + + +In this example, we create a Truss that uses [CLIP](https://openai.com/research/clip) to classify images, +using some pre-defined labels. The input to this Truss will be an image, the output will be a classification. + +One of the major things to note about this example is that since the inputs are images, we need to have +some mechanism for downloading the image. To accomplish this, we have the user pass a downloadable URL to +the Truss, and in the Truss code, download the image. To do this efficiently, we will make use of the +`preprocess` method in Truss. + +# Set up imports and constants + +For our CLIP Truss, we will be using the Hugging Face transformers library, as well as +`pillow` for image processing. + +```python model/model.py +import requests +from typing import Dict +from PIL import Image +from transformers import CLIPProcessor, CLIPModel + +``` +This is the CLIP model from Hugging Face that we will use for this example. + +```python model/model.py +CHECKPOINT = "openai/clip-vit-base-patch32" + +``` +# Define the Truss + +In the `load` method, we load in the pretrained CLIP model from the +Hugging Face checkpoint specified above. 
+ +```python model/model.py +class Model: + def __init__(self, **kwargs) -> None: + self._processor = None + self._model = None + + def load(self): + """ + Loads the CLIP model and processor checkpoints. + """ + self._model = CLIPModel.from_pretrained(CHECKPOINT) + self._processor = CLIPProcessor.from_pretrained(CHECKPOINT) + +``` +In the `preprocess` method, we download the image from the url and preprocess it. +This method is a part of the Truss class, and is designed to be used for any logic +involving IO, like in this case, downloading an image. + +It is called before the predict method in a separate thread, and is not subject to the same +concurrency limits as the predict method, so can be called many times in parallel. +This makes it such that the predict method is not unnecessarily blocked on IO-bound +tasks, and helps improve the throughput of the Truss. See our [guide to concurrency](../guides/concurrency) +for more info. + +```python model/model.py + def preprocess(self, request: Dict) -> Dict: + + image = Image.open(requests.get(request.pop("url"), stream=True).raw) + request["inputs"] = self._processor( + text=["a photo of a cat", "a photo of a dog"], # Define preset labels to use + images=image, + return_tensors="pt", + padding=True + ) + return request + +``` +The `predict` method performs the actual inference, and outputs a probability associated +with each of the labels defined earlier. + +```python model/model.py + def predict(self, request: Dict) -> Dict: + """ + This performs the actual classification. The predict method is subject to + the predict concurrency constraints. + """ + outputs = self._model(**request["inputs"]) + logits_per_image = outputs.logits_per_image + return logits_per_image.softmax(dim=1).tolist() +``` + +# Set up the config.yaml + +The main section that needs to be filled out +to run CLIP is the `requirements` section, where we need +to include `transformers`, for the model pipeline, and `pillow`, +for image processing. + +```yaml config.yaml +model_name: clip-example +requirements: +- transformers==4.32.0 +- pillow==10.0.0 +- torch==2.0.1 +model_metadata: + example_model_input: {"url": "https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?auto=compress&cs=tinysrgb&w=1600"} +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` +# Deploy the model + +Deploy the CLIP model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"image_url": "https://source.unsplash.com/gKXKBY-C-Dk/300x300""]}' --published +``` + + +```python model/model.py +import requests +from typing import Dict +from PIL import Image +from transformers import CLIPProcessor, CLIPModel + +CHECKPOINT = "openai/clip-vit-base-patch32" + +class Model: + def __init__(self, **kwargs) -> None: + self._processor = None + self._model = None + + def load(self): + """ + Loads the CLIP model and processor checkpoints. + """ + self._model = CLIPModel.from_pretrained(CHECKPOINT) + self._processor = CLIPProcessor.from_pretrained(CHECKPOINT) + + def preprocess(self, request: Dict) -> Dict: + + image = Image.open(requests.get(request.pop("url"), stream=True).raw) + request["inputs"] = self._processor( + text=["a photo of a cat", "a photo of a dog"], # Define preset labels to use + images=image, + return_tensors="pt", + padding=True + ) + return request + + def predict(self, request: Dict) -> Dict: + """ + This performs the actual classification. 
The predict method is subject to + the predict concurrency constraints. + """ + outputs = self._model(**request["inputs"]) + logits_per_image = outputs.logits_per_image + return logits_per_image.softmax(dim=1).tolist() +``` +```yaml config.yaml +model_name: clip-example +requirements: +- transformers==4.32.0 +- pillow==10.0.0 +- torch==2.0.1 +model_metadata: + example_model_input: {"url": "https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?auto=compress&cs=tinysrgb&w=1600"} +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` + diff --git a/docs/examples/3_LLMs/llm-with-streaming.mdx b/docs/examples/3_LLMs/llm-with-streaming.mdx new file mode 100644 index 000000000..23a72e7fe --- /dev/null +++ b/docs/examples/3_LLMs/llm-with-streaming.mdx @@ -0,0 +1,263 @@ +--- +title: "LLM with Streaming" +description: "Building an LLM with streaming output" +--- + + + + + +In this example, we go through a Truss that serves an LLM, and streams the output to the client. + +# Why Streaming? + +For certain ML models, generations can take a long time. Especially with LLMs, a long output could take +10 - 20 seconds to generate. However, because LLMs generate tokens in sequence, useful output can be +made available to users sooner. To support this, in Truss, we support streaming output. In this example, +we build a Truss that streams the output of the Falcon-7B model. + +# Set up the imports and key constants + +In this example, we use the HuggingFace transformers library to build a text generation model. + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer +from typing import Dict +from threading import Thread + +``` +We use the instruct version of the Falcon-7B model, and have some defaults +for inference parameters. + +```python model/model.py +CHECKPOINT = "tiiuae/falcon-7b-instruct" +DEFAULT_MAX_NEW_TOKENS = 150 +DEFAULT_TOP_P = 0.95 + + +``` +# Define the load function + +In the `load` function of the Truss, we implement logic +involved in downloading the model and loading it into memory. + +```python model/model.py +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT) +``` + + +```python model/model.py + self.tokenizer.pad_token = self.tokenizer.eos_token_id + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) +``` +# Define the predict function + +In the `predict` function of the Truss, we implement the actual +inference logic. The two main steps are: +* Tokenize the input +* Call the model's `generate` function, ensuring that we pass a +`TextIteratorStreamer`. This is what gives us streaming output, and +and also do this in a Thread, so that it does not block the main +invocation. +* Return a generator that iterates over the `TextIteratorStreamer` object + +```python model/model.py + def predict(self, request: Dict) -> Dict: + prompt = request.pop("prompt") + inputs = self.tokenizer( + prompt, + return_tensors="pt", + max_length=512, + truncation=True, + padding=True + ) + input_ids = inputs["input_ids"].to("cuda") + +``` +Instantiate the Streamer object, which we'll later use for +returning the output to users. 
+ +```python model/model.py + streamer = TextIteratorStreamer(self.tokenizer) + generation_config = GenerationConfig( + temperature=1, + top_p=DEFAULT_TOP_P, + top_k=40, + ) + +``` +When creating the generation parameters, ensure to pass the `streamer` object +that we created previously. + +```python model/model.py + with torch.no_grad(): + generation_kwargs = { + "input_ids": input_ids, + "generation_config": generation_config, + "return_dict_in_generate": True, + "output_scores": True, + "pad_token_id": self.tokenizer.eos_token_id, + "max_new_tokens": DEFAULT_MAX_NEW_TOKENS, + "streamer": streamer + } + +``` +Spawn a thread to run the generation, so that it does not block the main +thread. + +```python model/model.py + thread = Thread( + target=self.model.generate, + kwargs=generation_kwargs + ) + thread.start() + +``` +In Truss, the way to achieve streaming output is to return a generator +that yields content. In this example, we yield the output of the `streamer`, +which produces output and yields it until the generation is complete. + +We define this `inner` function to create our generator. + +```python model/model.py + def inner(): + for text in streamer: + yield text + thread.join() + + return inner() +``` + +# Setting up the config.yaml + +Running Falcon 7B requires torch, transformers, +and a few other related libraries. + +```yaml config.yaml +model_name: "LLM with Streaming" +model_metadata: + example_model_input: {"prompt": "what is the meaning of life"} +requirements: +- torch==2.0.1 +- peft==0.4.0 +- scipy==1.11.1 +- sentencepiece==0.1.99 +- accelerate==0.21.0 +- bitsandbytes==0.41.1 +- einops==0.6.1 +- transformers==4.31.0 +``` +## Configure resources for Falcon + +Note that we need an A10G to run this model. + +```yaml config.yaml +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` + + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer +from typing import Dict +from threading import Thread + +CHECKPOINT = "tiiuae/falcon-7b-instruct" +DEFAULT_MAX_NEW_TOKENS = 150 +DEFAULT_TOP_P = 0.95 + + +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT) + self.tokenizer.pad_token = self.tokenizer.eos_token_id + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) + def predict(self, request: Dict) -> Dict: + prompt = request.pop("prompt") + inputs = self.tokenizer( + prompt, + return_tensors="pt", + max_length=512, + truncation=True, + padding=True + ) + input_ids = inputs["input_ids"].to("cuda") + + streamer = TextIteratorStreamer(self.tokenizer) + generation_config = GenerationConfig( + temperature=1, + top_p=DEFAULT_TOP_P, + top_k=40, + ) + + with torch.no_grad(): + generation_kwargs = { + "input_ids": input_ids, + "generation_config": generation_config, + "return_dict_in_generate": True, + "output_scores": True, + "pad_token_id": self.tokenizer.eos_token_id, + "max_new_tokens": DEFAULT_MAX_NEW_TOKENS, + "streamer": streamer + } + + thread = Thread( + target=self.model.generate, + kwargs=generation_kwargs + ) + thread.start() + + def inner(): + for text in streamer: + yield text + thread.join() + + return inner() +``` +```yaml config.yaml +model_name: "LLM with Streaming" +model_metadata: + example_model_input: {"prompt": "what is the meaning of life"} 
+requirements: +- torch==2.0.1 +- peft==0.4.0 +- scipy==1.11.1 +- sentencepiece==0.1.99 +- accelerate==0.21.0 +- bitsandbytes==0.41.1 +- einops==0.6.1 +- transformers==4.31.0 +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` + diff --git a/docs/examples/3_LLMs/llm.mdx b/docs/examples/3_LLMs/llm.mdx new file mode 100644 index 000000000..927017b85 --- /dev/null +++ b/docs/examples/3_LLMs/llm.mdx @@ -0,0 +1,205 @@ +--- +title: "LLM" +description: "Building an LLM" +--- + + + + + +In this example, we go through a Truss that serves an LLM. We +use the model Mistral-7B, which is a general-purpose LLM that +can used for a variety of tasks, like summarization, question-answering, +translation, and others. + +# Set up the imports and key constants + +In this example, we use the Huggingface transformers library to build a text generation model. + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + + +``` +We use the 7B version of the Mistral model. + +```python model/model.py +CHECKPOINT = "mistralai/Mistral-7B-v0.1" + +``` +# Define the `Model` class and load function + +In the `load` function of the Truss, we implement logic involved in +downloading and setting up the model. For this LLM, we use the `Auto` +classes in `transformers` to instantiate our Mistral model. + +```python model/model.py +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto") + + self.tokenizer = AutoTokenizer.from_pretrained( + CHECKPOINT, + ) + +``` +# Define the `predict` function + +In the predict function, we implement the actual inference logic. The steps +here are: + * Set up the generation params. We have defaults for both of these, but +adjusting the values will have an impact on the model output + * Tokenize the input + * Generate the output + * Use tokenizer to decode the output + +```python model/model.py + def predict(self, request: dict): + prompt = request.pop("prompt") + generate_args = { + "max_new_tokens": request.get("max_new_tokens", 128), + "temperature": request.get("temperature", 1.0), + "top_p": request.get("top_p", 0.95), + "top_k": request.get("top_p", 50), + "repetition_penalty": 1.0, + "no_repeat_ngram_size": 0, + "use_cache": True, + "do_sample": True, + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + } + + input_ids = self.tokenizer( + prompt, + return_tensors="pt" + ).input_ids.cuda() + + with torch.no_grad(): + output = self.model.generate( + inputs=input_ids, + **generate_args + ) + return self.tokenizer.decode(output[0]) +``` + +# Setting up the config.yaml + +Running Mistral 7B requires a few libraries, such as +`torch`, `transformers` and a couple others. + +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Mistral 7B +python_version: py311 +requirements: +- transformers==4.34.0 +- sentencepiece==0.1.99 +- accelerate==0.23.0 +- torch==2.0.1 +``` +## Configure resources for Mistral + +Note that we need an A10G to run this model. 
+ +```yaml config.yaml +resources: + accelerator: A10G + use_gpu: true +secrets: {} +system_packages: [] +``` +# Deploy the model + +Deploy the model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"inputs": "What is a large language model?"}' +``` + + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + + +CHECKPOINT = "mistralai/Mistral-7B-v0.1" + +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto") + + self.tokenizer = AutoTokenizer.from_pretrained( + CHECKPOINT, + ) + + def predict(self, request: dict): + prompt = request.pop("prompt") + generate_args = { + "max_new_tokens": request.get("max_new_tokens", 128), + "temperature": request.get("temperature", 1.0), + "top_p": request.get("top_p", 0.95), + "top_k": request.get("top_p", 50), + "repetition_penalty": 1.0, + "no_repeat_ngram_size": 0, + "use_cache": True, + "do_sample": True, + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + } + + input_ids = self.tokenizer( + prompt, + return_tensors="pt" + ).input_ids.cuda() + + with torch.no_grad(): + output = self.model.generate( + inputs=input_ids, + **generate_args + ) + return self.tokenizer.decode(output[0]) +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Mistral 7B +python_version: py311 +requirements: +- transformers==4.34.0 +- sentencepiece==0.1.99 +- accelerate==0.23.0 +- torch==2.0.1 +resources: + accelerator: A10G + use_gpu: true +secrets: {} +system_packages: [] +``` + diff --git a/docs/examples/4_image_generation/sdxl.mdx b/docs/examples/4_image_generation/sdxl.mdx new file mode 100644 index 000000000..e7b7565c5 --- /dev/null +++ b/docs/examples/4_image_generation/sdxl.mdx @@ -0,0 +1,374 @@ +--- +title: "Text-to-image" +description: "Building a text-to-image model with SDXL" +--- + + + + + +In this example, we go through a Truss that serves a text-to-image model. We +use SDXL 1.0, which is one of the highest performing text-to-image models out +there today. + +# Set up imports and torch settings + +In this example, we use the Huggingface diffusers library to build our text-to-image model. + +```python model/model.py +from diffusers import DiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler +import torch +import base64 +from PIL import Image +from io import BytesIO +from typing import Any +import time + +``` +The following line is needed to enable TF32 on NVIDIA GPUs + +```python model/model.py +torch.backends.cuda.matmul.allow_tf32 = True + +``` +# Define the `Model` class and load function + +In the `load` function of the Truss, we implement logic involved in +downloading and setting up the model. For this model, we use the +`DiffusionPipeline` class in `diffusers` to instantiate our SDXL pipeline, +and configure a number of relevant parameters. + +See the [diffusers docs](https://huggingface.co/docs/diffusers/index) for details +on all of these parameters. 
+ +```python model/model.py +class Model: + def __init__(self, **kwargs): + self._model = None + + def load(self): + vae = AutoencoderKL.from_pretrained( + "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 + ) + self.pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + vae=vae, + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, + ) + + self.pipe.unet.to(memory_format=torch.channels_last) + self.pipe.to('cuda') + self.pipe.enable_xformers_memory_efficient_attention() + + self.refiner = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-1.0", + text_encoder_2=self.pipe.text_encoder_2, + vae=self.pipe.vae, + torch_dtype=torch.float16, + use_safetensors=True, + variant="fp16", + ) + self.refiner.to("cuda") + self.refiner.enable_xformers_memory_efficient_attention() + +``` +This is a utility function for converting PIL image to base64. + +```python model/model.py + def convert_to_b64(self, image: Image) -> str: + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + return img_b64 + +``` +# Define the predict function + +The `predict` function contains the actual inference logic. The steps here are: + * Setting up the generation params. We have defaults for these, and some, such +as the `scheduler`, are somewhat complicated + * Running the Diffusion Pipeline + * If `use_refiner` is set to `True`, we run the refiner model on the output + * Convert the resulting image to base64 and return it + +```python model/model.py + def predict(self, model_input: Any) -> Any: + prompt = model_input.pop("prompt") + negative_prompt = model_input.pop("negative_prompt", None) + use_refiner = model_input.pop("use_refiner", True) + num_inference_steps = model_input.pop("num_inference_steps", 30) + denoising_frac = model_input.pop("denoising_frac", 0.8) + end_cfg_frac = model_input.pop("end_cfg_frac", 0.4) + guidance_scale = model_input.pop("guidance_scale", 7.5) + seed = model_input.pop("seed", None) + + scheduler = model_input.pop("scheduler", None) # Default: EulerDiscreteScheduler (works pretty well) + +``` +Set the scheduler based on the user's input. +See possible schedulers: https://huggingface.co/docs/diffusers/api/schedulers/overview for +what the tradeoffs are. 
+ +```python model/model.py + if scheduler == "DPM++ 2M": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config) + elif scheduler == "DPM++ 2M Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, use_karras_sigmas=True) + elif scheduler == "DPM++ 2M SDE Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True) + + generator = None + if seed is not None: + torch.manual_seed(seed) + generator = [torch.Generator(device="cuda").manual_seed(seed)] + + if not use_refiner: + denoising_frac = 1.0 + + start_time = time.time() + image = self.pipe(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_end=denoising_frac, + guidance_scale=guidance_scale, + output_type="latent" if use_refiner else "pil").images[0] + scheduler = self.pipe.scheduler + if use_refiner: + self.refiner.scheduler = scheduler + image = self.refiner(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_start=denoising_frac, + guidance_scale=guidance_scale, + image=image[None, :]).images[0] + +``` +Convert the results to base64, and return them. + +```python model/model.py + b64_results = self.convert_to_b64(image) + end_time = time.time() - start_time + + print(f"Time: {end_time:.2f} seconds") + + return {"status": "success", "data": b64_results, "time": end_time} +``` + +# Setting up the config yaml + +Running SDXL requires a handful of Python libraries, including +diffusers, transformers, and others. + +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "A tree in a field under the night sky", "use_refiner": true} +model_name: Stable Diffusion XL +python_version: py39 +requirements: +- transformers==4.34.0 +- accelerate==0.23.0 +- safetensors==0.4.0 +- git+https://github.com/basetenlabs/diffusers.git@9a353290b1497023d4745a719ec02c50f680499a +- invisible-watermark>=0.2.0 +- xformers==0.0.22 +``` +## Configuring resources for SDXL 1.0 + +Note that we need an A10G to run this model. + +```yaml config.yaml +resources: + accelerator: A10G + cpu: 3500m + memory: 20Gi + use_gpu: true +secrets: {} +``` +## System Packages + +Running diffusers requires `ffmpeg` and a couple other system +packages. + +```yaml config.yaml +system_packages: +- ffmpeg +- libsm6 +- libxext6 +``` +## Enabling Caching + +SDXL is a very large model, and downloading it could take up to 10 minutes. This means +that the cold start time for this model is long. We can solve that by using our build +caching feature. This moves the model download to the build stage of your model-- +caching the model will take about 10 minutes initially but you will get ~9s cold starts +subsequently. 
+ +To enable caching, add the following to the config: +```yaml +hf_cache: + - repo_id: madebyollin/sdxl-vae-fp16-fix + allow_patterns: + - config.json + - diffusion_pytorch_model.safetensors + - repo_id: stabilityai/stable-diffusion-xl-base-1.0 + allow_patterns: + - "*.json" + - "*.fp16.safetensors" + - sd_xl_base_1.0.safetensors + - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0 + allow_patterns: + - "*.json" + - "*.fp16.safetensors" + - sd_xl_refiner_1.0.safetensors +``` +# Deploy the model + +Deploy the model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"prompt": "A tree in a field under the night sky", "use_refiner": true}' +``` + + +```python model/model.py +from diffusers import DiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler +import torch +import base64 +from PIL import Image +from io import BytesIO +from typing import Any +import time + +torch.backends.cuda.matmul.allow_tf32 = True + +class Model: + def __init__(self, **kwargs): + self._model = None + + def load(self): + vae = AutoencoderKL.from_pretrained( + "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 + ) + self.pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + vae=vae, + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, + ) + + self.pipe.unet.to(memory_format=torch.channels_last) + self.pipe.to('cuda') + self.pipe.enable_xformers_memory_efficient_attention() + + self.refiner = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-1.0", + text_encoder_2=self.pipe.text_encoder_2, + vae=self.pipe.vae, + torch_dtype=torch.float16, + use_safetensors=True, + variant="fp16", + ) + self.refiner.to("cuda") + self.refiner.enable_xformers_memory_efficient_attention() + + def convert_to_b64(self, image: Image) -> str: + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + return img_b64 + + def predict(self, model_input: Any) -> Any: + prompt = model_input.pop("prompt") + negative_prompt = model_input.pop("negative_prompt", None) + use_refiner = model_input.pop("use_refiner", True) + num_inference_steps = model_input.pop("num_inference_steps", 30) + denoising_frac = model_input.pop("denoising_frac", 0.8) + end_cfg_frac = model_input.pop("end_cfg_frac", 0.4) + guidance_scale = model_input.pop("guidance_scale", 7.5) + seed = model_input.pop("seed", None) + + scheduler = model_input.pop("scheduler", None) # Default: EulerDiscreteScheduler (works pretty well) + + if scheduler == "DPM++ 2M": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config) + elif scheduler == "DPM++ 2M Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, use_karras_sigmas=True) + elif scheduler == "DPM++ 2M SDE Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True) + + generator = None + if seed is not None: + torch.manual_seed(seed) + generator = [torch.Generator(device="cuda").manual_seed(seed)] + + if not use_refiner: + denoising_frac = 1.0 + + start_time = time.time() + image = self.pipe(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_end=denoising_frac, + guidance_scale=guidance_scale, + 
output_type="latent" if use_refiner else "pil").images[0] + scheduler = self.pipe.scheduler + if use_refiner: + self.refiner.scheduler = scheduler + image = self.refiner(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_start=denoising_frac, + guidance_scale=guidance_scale, + image=image[None, :]).images[0] + + b64_results = self.convert_to_b64(image) + end_time = time.time() - start_time + + print(f"Time: {end_time:.2f} seconds") + + return {"status": "success", "data": b64_results, "time": end_time} +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "A tree in a field under the night sky", "use_refiner": true} +model_name: Stable Diffusion XL +python_version: py39 +requirements: +- transformers==4.34.0 +- accelerate==0.23.0 +- safetensors==0.4.0 +- git+https://github.com/basetenlabs/diffusers.git@9a353290b1497023d4745a719ec02c50f680499a +- invisible-watermark>=0.2.0 +- xformers==0.0.22 +resources: + accelerator: A10G + cpu: 3500m + memory: 20Gi + use_gpu: true +secrets: {} +system_packages: +- ffmpeg +- libsm6 +- libxext6 +``` + diff --git a/docs/examples/6_high_performance/cached-weights.mdx b/docs/examples/6_high_performance/cached-weights.mdx new file mode 100644 index 000000000..83c39a912 --- /dev/null +++ b/docs/examples/6_high_performance/cached-weights.mdx @@ -0,0 +1,212 @@ +--- +title: "Fast Cold Starts with Cached Weights" +description: "Deploy a language model, with the model weights cached at build time" +--- + + + + + +In this example, we go through a Truss that serves an LLM, and _caches_ the weights +at build time. Loading model weights for any model can often be the most time-consuming +part of starting a model. Caching the weights at build time means that the weights +will be baked into the Truss image, and will be available _immediately_ when your model +replica starts. This means that **cold starts** will be _significantly faster_ with this approach. + +# Implementing the `Model` class + +With weight caching, you don't have to change anything about how the `Model` class +is implemented to take advantage of the weight caching. + +```python model/model.py +from typing import Dict, List + +import torch +from transformers import LlamaForCausalLM, LlamaTokenizer + +DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant." 
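+
+# Llama 2 chat checkpoints expect prompts wrapped in [INST] ... [/INST] markers,
+# with the system prompt in its own delimited section; `format_prompt` below
+# assembles that structure around the user's prompt.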
+ +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" +CHECKPOINT = "NousResearch/Llama-2-7b-chat-hf" + + +def format_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str: + return f"{B_INST} {B_SYS} {system_prompt} {E_SYS} {prompt} {E_INST}" + + +class Model: + def __init__(self, **kwargs) -> None: + self.model = None + self.tokenizer = None + + def load(self): + self.model = LlamaForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto" + ) + self.tokenizer = LlamaTokenizer.from_pretrained( + CHECKPOINT + ) + + def predict(self, request: Dict) -> Dict[str, List]: + prompt = request.pop("prompt") + input_ids = self.tokenizer(format_prompt(prompt), return_tensors="pt").input_ids.cuda() + + outputs = self.model.generate( + inputs=input_ids, + do_sample=True, + num_beams=1, + max_new_tokens=100 + ) + response = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] + + return {"response": response} + + +``` + +# Setting up the config.yaml + +The `config.yaml` file is where you need to include the changes to +actually cache the weights at build time. + +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Llama with Cached Weights +python_version: py39 +requirements: +- accelerate==0.21.0 +- safetensors==0.3.2 +- torch==2.0.1 +- transformers==4.34.0 +- sentencepiece==0.1.99 +- protobuf==4.24.4 +``` +# Configuring the hf_cache + +To cache model weights, set the `hf_cache` key. +The `repo_id` field allows you to specify a Huggingface +repo to pull down and cache at build-time, and the `ignore_patterns` +field allows you to specify files to ignore. If this is specified, then +this repo won't have to be pulled during runtime. + +Check out the [guide](https://truss.baseten.co/guides/model-cache) for more info. + +```yaml config.yaml +hf_cache: +- repo_id: "NousResearch/Llama-2-7b-chat-hf" + ignore_patterns: + - "*.bin" + +``` +The remaining config options are again, similar to what you would +configure for the model without the weight caching. + +```yaml config.yaml +resources: + cpu: "4" + memory: 30Gi + use_gpu: True + accelerator: A10G +secrets: {} +``` +# Deploy the model + +Deploy the model like you would other Trusses, with: +```bash +$ truss push +``` + + The build step will take longer than with the normal + Llama Truss, since bundling the model weights is now happening during the build. + The deploy step & scale-ups will happen much faster with this approach. + + +You can then invoke the model with: +```bash +$ truss predict -d '{"inputs": "What is a large language model?"}' +``` + + +```python model/model.py +from typing import Dict, List + +import torch +from transformers import LlamaForCausalLM, LlamaTokenizer + +DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant." 
+ +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" +CHECKPOINT = "NousResearch/Llama-2-7b-chat-hf" + + +def format_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str: + return f"{B_INST} {B_SYS} {system_prompt} {E_SYS} {prompt} {E_INST}" + + +class Model: + def __init__(self, **kwargs) -> None: + self.model = None + self.tokenizer = None + + def load(self): + self.model = LlamaForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto" + ) + self.tokenizer = LlamaTokenizer.from_pretrained( + CHECKPOINT + ) + + def predict(self, request: Dict) -> Dict[str, List]: + prompt = request.pop("prompt") + input_ids = self.tokenizer(format_prompt(prompt), return_tensors="pt").input_ids.cuda() + + outputs = self.model.generate( + inputs=input_ids, + do_sample=True, + num_beams=1, + max_new_tokens=100 + ) + response = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] + + return {"response": response} + + +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Llama with Cached Weights +python_version: py39 +requirements: +- accelerate==0.21.0 +- safetensors==0.3.2 +- torch==2.0.1 +- transformers==4.34.0 +- sentencepiece==0.1.99 +- protobuf==4.24.4 +hf_cache: +- repo_id: "NousResearch/Llama-2-7b-chat-hf" + ignore_patterns: + - "*.bin" + +resources: + cpu: "4" + memory: 30Gi + use_gpu: True + accelerator: A10G +secrets: {} +``` + diff --git a/docs/examples/6_high_performance/tgi.mdx b/docs/examples/6_high_performance/tgi.mdx new file mode 100644 index 000000000..96bb35f9f --- /dev/null +++ b/docs/examples/6_high_performance/tgi.mdx @@ -0,0 +1,102 @@ +--- +title: "High Performance LLM with TGI" +description: "Deploy a language model with TGI" +--- + + + + + +[TGI](https://github.com/huggingface/text-generation-inference/tree/main) is a model server optimized for +language models. In this example, we put together a Truss that serves the model Falcon 7B using TGI. + +For Trusses that use TGI, there is no user code to define, so there is only a config.yaml file. +You can run any model that supports TGI. + +```yaml config.yaml +build: + arguments: +``` +The endpoint argument has two options: + * **generate**: This returns the response as JSON when the full response is generated + * **generate_stream**: If you choose this option, results will be streamed as they are ready, using + server-sent events + +```yaml config.yaml + endpoint: generate_stream +``` +Select the model that you'd like to use with TGI + +```yaml config.yaml + model_id: tiiuae/falcon-7b +``` +The `model_server` parameter allows you to specify a supported backend (in this example, TGI) + +```yaml config.yaml + model_server: TGI +``` +Another important parameter to configure if you are choosing TGI is the `predict_concurrency`. +One of the main benefits of TGI is continuous batching -- in which multiple requests can be +processed at the same time. Without `predict_concurrency` set to a high enough number, you cannot take advantage of this +feature. + +```yaml config.yaml +runtime: + predict_concurrency: 128 +``` +The remaining config options listed are standard Truss Config options. 
+ +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"inputs": "what is the meaning of life"} +model_name: Falcon-TGI +python_version: py39 +requirements: [] +resources: + accelerator: A10G + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` +# Deploy the model + +Deploy the TGI model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"inputs": "What is a large language model?", "parameters": {"max_new_tokens": 128, "sample": true}}' --published +``` + + +```yaml config.yaml +build: + arguments: + endpoint: generate_stream + model_id: tiiuae/falcon-7b + model_server: TGI +runtime: + predict_concurrency: 128 +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"inputs": "what is the meaning of life"} +model_name: Falcon-TGI +python_version: py39 +requirements: [] +resources: + accelerator: A10G + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` + diff --git a/docs/examples/6_high_performance/vllm.mdx b/docs/examples/6_high_performance/vllm.mdx new file mode 100644 index 000000000..94be1b90a --- /dev/null +++ b/docs/examples/6_high_performance/vllm.mdx @@ -0,0 +1,100 @@ +--- +title: "High Performance LLM with vLLM" +description: "Deploy a language model with vLLM" +--- + + + + + +[vLLM](https://github.com/vllm-project/vllm) is a Python-based package that optimizes the Attention +layer in Transformer models. By better allocating memory used during the attention computation, +vLLM can reduce the memory footprint of a model and significantly improve inference speed. Truss +supports vLLM out of the box, so you can deploy vLLM-optimized models with ease. + + + +```yaml config.yaml +build: + arguments: +``` +vLLM supports multiple types of endpoints: + * Completions -- Follows the same API as the [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions) + * ChatCommpletions -- Follows the same API as the [OpenAI ChatCompletions API](https://platform.openai.com/docs/api-reference/chat) + +```yaml config.yaml + endpoint: Completions +``` +Select which vLLM-compatible model you'd like to use + +```yaml config.yaml + model: facebook/opt-125M +``` +The `model_server` parameter allows you to specify TGI + +```yaml config.yaml + model_server: VLLM +``` +Another important parameter to configure if you are choosing vLLM is the `predict_concurrency`. +One of the main benefits of vLLM is continuous batching -- in which multiple requests can be +processed at the same time. Without predict_concurrency, you cannot take advantage of this +feature. + +```yaml config.yaml +runtime: + predict_concurrency: 128 +``` +The remaining config options listed are standard Truss Config options. 
+ +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: {} +model_name: OPT-125M +python_version: py39 +requirements: [] +resources: + accelerator: T4 + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` +# Deploy the model + +Deploy the vLLM model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"prompt": "What is a large language model?", "model": "facebook/opt-125M"}' --published +``` + + +```yaml config.yaml +build: + arguments: + endpoint: Completions + model: facebook/opt-125M + model_server: VLLM +runtime: + predict_concurrency: 128 +environment_variables: {} +external_package_dirs: [] +model_metadata: {} +model_name: OPT-125M +python_version: py39 +requirements: [] +resources: + accelerator: T4 + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` + diff --git a/docs/examples/7_misc/private-huggingface-model.mdx b/docs/examples/7_misc/private-huggingface-model.mdx new file mode 100644 index 000000000..e84e459b6 --- /dev/null +++ b/docs/examples/7_misc/private-huggingface-model.mdx @@ -0,0 +1,137 @@ +--- +title: "Private Hugging Face Model" +description: "Load a model that requires authentication with Hugging Face" +--- + + + + + +In this example, we build a Truss that uses a model that +requires Hugging Face authentication. The steps for loading a model +from Hugging Face are: + +1. Create an [access token](https://huggingface.co/settings/tokens) on your Hugging Face account. +2. Add the `hf_access_token`` key to your config.yaml secrets and value to your [Baseten account](https://app.baseten.co/settings/secrets). +3. Add `use_auth_token` when creating the actual model. + +# Setting up the model + +In this example, we use a private version of the [BERT base model](https://huggingface.co/bert-base-uncased). +The model is publicly available, but for the purposes of our example, we copied it into a private +model repository, with the path "baseten/docs-example-gated-model". + +First, like with other Hugging Face models, start by importing the `pipeline` function from the +transformers library, and defining the `Model` class. + +```python model/model.py +from transformers import pipeline + +class Model: +``` +An important step in loading a model that requires authentication is to +have access to the secrets defined for this model. We pull these out of +the keyword args in the `__init__` function. + +```python model/model.py + def __init__(self, **kwargs) -> None: + self._secrets = kwargs["secrets"] + self._model = None + + def load(self): +``` +Ensure that when you define the `pipeline`, we use the `use_auth_token` parameter, +pass the `hf_access_token` secret that is on our Baseten account. + +```python model/model.py + self._model = pipeline( + "fill-mask", + model="baseten/docs-example-gated-model", + use_auth_token=self._secrets["hf_access_token"] + ) + + def predict(self, model_input): + return self._model(model_input) +``` + +# Setting up the config.yaml + +The main things that need to be set up in the config are +`requirements`, which need to include Hugging Face transformers, +and the secrets. 
+ +```yaml config.yaml +environment_variables: {} +model_name: private-model +python_version: py39 +requirements: +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "1" + memory: 2Gi + use_gpu: false + accelerator: null +``` +To make the `hf_access_token` available in the Truss, we need to include +it in the config. Setting the value to `null` here means that the value +will be set by the Baseten secrets manager. + +```yaml config.yaml +secrets: + hf_access_token: null +system_packages: [] +``` +# Deploying the model + +An important note for deploying models with secrets is that +you must use the `--trusted` flag to give the model access to +secrets stored on the remote secrets manager. + +```bash +$ truss push --trusted +``` + +After the model finishes deploying, you can invoke it with: +```bash +$ truss predict -d '"It is a [MASK] world"' +``` + + +```python model/model.py +from transformers import pipeline + +class Model: + def __init__(self, **kwargs) -> None: + self._secrets = kwargs["secrets"] + self._model = None + + def load(self): + self._model = pipeline( + "fill-mask", + model="baseten/docs-example-gated-model", + use_auth_token=self._secrets["hf_access_token"] + ) + + def predict(self, model_input): + return self._model(model_input) +``` +```yaml config.yaml +environment_variables: {} +model_name: private-model +python_version: py39 +requirements: +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "1" + memory: 2Gi + use_gpu: false + accelerator: null +secrets: + hf_access_token: null +system_packages: [] +``` + diff --git a/docs/examples/7_misc/system-packages.mdx b/docs/examples/7_misc/system-packages.mdx new file mode 100644 index 000000000..029f5f6ee --- /dev/null +++ b/docs/examples/7_misc/system-packages.mdx @@ -0,0 +1,140 @@ +--- +title: "Model with system packages" +description: "Deploy a model with both Python and system dependencies" +--- + + + + + +In this example, we build a Truss with a model that requires specific system packages. + +To add system packages to your Truss, you can add a `system_packages` key to your config.yaml file, +for instance: +To add system packages to your model serving environment, open config.yaml and +update the system_packages key with a list of apt-installable Debian packages: + +```yaml config.yaml +system_packages: + - tesseract-ocr +``` + +For this example, we use the [LayoutLM Document QA](https://huggingface.co/impira/layoutlm-document-qa) model, +a multimodal model that answers questions about provided invoice documents. This model requires a system +package, tesseract-ocr, which needs to be included in the model serving environment. + +# Setting up the model.py + +For this model, we use the HuggingFace transformers library, and the the document-question-answering task. + +```python model/model.py +from transformers import pipeline + + +class Model: + def __init__(self, **kwargs) -> None: + self._model = None + + def load(self): + self._model = pipeline( + "document-question-answering", + model="impira/layoutlm-document-qa", + ) + + def predict(self, model_input): + return self._model( + model_input["url"], + model_input["prompt"] + ) +``` + +# Setting up the config.yaml file + +The main items that need to be configured in the config.yaml file are requirements +and `system_packages` sections. 
+ +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"url": "https://templates.invoicehome.com/invoice-template-us-neat-750px.png", "prompt": "What is the invoice number?"} +model_name: LayoutLM Document QA +python_version: py39 +``` +Specify the versions of the Python requirements that are needed. +Always pin exact versions for your Python dependencies. The ML/AI space moves fast, so you want to have an up-to-date version of each package while also being protected from breaking changes. + +```yaml config.yaml +requirements: +- Pillow==10.0.0 +- pytesseract==0.3.10 +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "4" + memory: 16Gi + use_gpu: false + accelerator: null +secrets: {} +``` +The system_packages section is the other important bit here, you can +add any package that's available via `apt` on Debian. + +```yaml config.yaml +system_packages: +- tesseract-ocr +``` +# Deploy the model +```bash +$ truss push +``` +You can then invoke the model with: +``` +$ truss predict -d '{"url": "https://templates.invoicehome.com/invoice-template-us-neat-750px.png", "prompt": "What is the invoice number?"}' +``` + + +```python model/model.py +from transformers import pipeline + + +class Model: + def __init__(self, **kwargs) -> None: + self._model = None + + def load(self): + self._model = pipeline( + "document-question-answering", + model="impira/layoutlm-document-qa", + ) + + def predict(self, model_input): + return self._model( + model_input["url"], + model_input["prompt"] + ) +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"url": "https://templates.invoicehome.com/invoice-template-us-neat-750px.png", "prompt": "What is the invoice number?"} +model_name: LayoutLM Document QA +python_version: py39 +requirements: +- Pillow==10.0.0 +- pytesseract==0.3.10 +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "4" + memory: 16Gi + use_gpu: false + accelerator: null +secrets: {} +system_packages: +- tesseract-ocr +``` + diff --git a/docs/mint.json b/docs/mint.json index d291aade6..9e0149f8b 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -65,13 +65,16 @@ { "group": "Examples", "pages": [ - "examples/private-model", - "examples/system-packages", - "examples/streaming", - "examples/pre-process", - "examples/performance/cached-weights", - "examples/performance/tgi-server", - "examples/performance/vllm-server" + "examples/1_introduction/getting-started-bert", + "examples/2_image_classification/clip", + "examples/3_LLMs/llm", + "examples/3_LLMs/llm-with-streaming", + "examples/4_image_generation/sdxl", + "examples/6_high_performance/vllm", + "examples/6_high_performance/tgi", + "examples/6_high_performance/cached-weights", + "examples/7_misc/private-huggingface-model", + "examples/7_misc/system-packages" ] }, { @@ -169,7 +172,7 @@ ], "analytics": { "gtm": { - "tagId": "GTM-WXD4NQTW" + "tagId": "GTM-WXD4NQTW" } } } diff --git a/docs/welcome.mdx b/docs/welcome.mdx index 8c9b1ddf0..e6340350a 100644 --- a/docs/welcome.mdx +++ b/docs/welcome.mdx @@ -11,9 +11,9 @@ description: "The simplest way to serve AI/ML models in production" See Trusses for popular models including: -* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-70b-chat)) -* 🎨 [Stable Diffusion 
XL](https://github.com/basetenlabs/truss-examples/tree/main/stable-diffusion-xl-1.0)
-* 🗣 [Whisper](https://github.com/basetenlabs/truss-examples/tree/main/whisper-truss)
+* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-70b-chat))
+* 🎨 [Stable Diffusion XL](https://github.com/basetenlabs/truss-examples/tree/main/model_library/stable-diffusion-xl-1.0)
+* 🗣 [Whisper](https://github.com/basetenlabs/truss-examples/tree/main/model_library/whisper-truss)

 and [dozens more examples on GitHub](https://github.com/basetenlabs/truss-examples/).

diff --git a/pyproject.toml b/pyproject.toml
index 2be5f8db8..b6d27d31f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "truss"
-version = "0.7.12rc1"
+version = "0.7.13"
 description = "A seamless bridge from model development to model delivery"
 license = "MIT"
 readme = "README.md"