diff --git a/README.md b/README.md index ff67e5252..0e0f2835f 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ See Trusses for popular models including: -* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-70b-chat)) +* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-70b-chat)) * 🎨 [Stable Diffusion XL](https://github.com/basetenlabs/truss-examples/tree/main/stable-diffusion-xl-1.0) * 🗣 [Whisper](https://github.com/basetenlabs/truss-examples/tree/main/whisper-truss) diff --git a/bin/generate_truss_examples.py b/bin/generate_truss_examples.py new file mode 100644 index 000000000..583e61bf8 --- /dev/null +++ b/bin/generate_truss_examples.py @@ -0,0 +1,299 @@ +""" +Script to take the Truss examples in https://github.com/basetenlabs/truss-examples, +and generate documentation. + +Usage: +``` +$ poetry run python bin/generate_truss_examples.py +``` +""" +import enum +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path +from typing import List, Optional, Tuple + +import yaml + +DOC_CONFIGURATION_FILE = "doc.yaml" +TRUSS_EXAMPLES_REPO = "https://github.com/basetenlabs/truss-examples" +DESTINATION_DIR = "truss-examples" +MINT_CONFIG_PATH = "docs/mint.json" + + +class FileType(enum.Enum): + YAML = "yaml" + PYTHON = "python" + + +def clone_repo(): + """ + If the destination directory exists, remove it. + Then, clone the given repo into the specified directory. + """ + if Path(DESTINATION_DIR).exists(): + shutil.rmtree(DESTINATION_DIR) + + try: + subprocess.run( + ["git", "clone", TRUSS_EXAMPLES_REPO, DESTINATION_DIR], check=True + ) + print(f"Successfully cloned {TRUSS_EXAMPLES_REPO} to {DESTINATION_DIR}") + except subprocess.CalledProcessError as e: + print(f"Error cloning the repo: {e}") + sys.exit(1) + + +def fetch_file_contents(path: str): + with open(path, "r") as f: + return f.read() + + +def _fetch_example_dirs(root_dir: str) -> List[str]: + """ + Walk through the directory structure from the root directory and + find all directories that have the specified file in it. + """ + dirs_with_file = [] + + for dirpath, _, filenames in os.walk(root_dir): + if DOC_CONFIGURATION_FILE in filenames: + dirs_with_file.append(dirpath) + + return dirs_with_file + + +def _get_example_destination(truss_directory: str) -> Path: + """ + Get the destination directory for the example. 
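+
+    For instance, the Truss at "truss-examples/3_LLMs/llm" maps to the
+    documentation page "docs/examples/3_LLMs/llm.mdx".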
+ """ + original_path = Path(truss_directory) + folder, example = original_path.parts[1:] + example_file = f"{example}.mdx" + return Path("docs/examples") / folder / example_file + + +def _get_file_type(file_path: str) -> FileType: + extension = Path(file_path).suffix + if extension == ".yaml": + return FileType.YAML + + if extension == ".py": + return FileType.PYTHON + + raise ValueError(f"Unknown file type: {extension}") + + +class ContentBlock: + def formatted_content(self) -> str: + raise NotImplementedError + + +class CodeBlock(ContentBlock): + def __init__(self, file_type: FileType, file_path: str): + self.file_type = file_type + self.file_path = file_path + self.content = "" + + def formatted_content(self) -> str: + """ + Outputs code blocks in the format: + + ```python main.py + def main(): + ... + ``` + """ + return f"\n```{self.file_type.value} {self.file_path}\n{self.content}```" + + +class MarkdownBlock(ContentBlock): + def __init__(self, content: str): + self.content = content + + def formatted_content(self) -> str: + # Remove the first comment and space character, such that + # "# Hello" becomes "Hello + return self.content.strip()[2:] + + +class MarkdownExtractor: + """ + Class that supports ingesting a code file line-by-line, and produces a formatted + mdx file. + """ + + def __init__(self, file_type: FileType, file_path: str): + self.file_type = file_type + self.file_path = file_path + + self.blocks: List[ContentBlock] = [] + self.current_code_block: Optional[CodeBlock] = None + + def ingest(self, line: str): + """ + For each line, check that it is a comment by the presence of "#". + If it is a comment, append it to the blocks. + + If it is not a comment, either append to the current code block, or + create a new code block if this isn't one. + + When this is finished, we can then very easily produce the mdx file. + """ + stripped_line = line.strip() + + # Case of Markdown line + if stripped_line.startswith("#"): + self.current_code_block = None + self.blocks.append(MarkdownBlock(line)) + else: + if self.current_code_block is None: + self.current_code_block = CodeBlock(self.file_type, self.file_path) + self.blocks.append(self.current_code_block) + self.current_code_block.content += line + "\n" + + def _formatted_request_example(self) -> str: + """ + A key part of the mdx file is that each has a block at the + bottom the file. This generates that for the given file by appending all the + CodeBlocks together. 
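+
+        For example, a model.py that is split into several CodeBlocks by markdown
+        comments is rendered as one fenced "python model/model.py" block containing
+        the concatenated code, with the markdown comment lines stripped out.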
+ """ + code_blocks = [block for block in self.blocks if isinstance(block, CodeBlock)] + code_content = "".join([code_block.content for code_block in code_blocks]) + + return f"""```{self.file_type.value} {self.file_path}\n{code_content}```""" + + def mdx_content(self) -> Tuple[str, str]: + full_content = "\n".join([block.formatted_content() for block in self.blocks]) + + return ( + full_content + "\n", + self._formatted_request_example(), + ) + + +def _extract_mdx_content_and_code(full_file_path: str, path: str) -> Tuple[str, str]: + file_content = fetch_file_contents(full_file_path) + file_type = _get_file_type(path) + extractor = MarkdownExtractor(file_type, path) + for line in file_content.splitlines(): + extractor.ingest(line) + + return extractor.mdx_content() + + +def _generate_request_example_block(code: str): + return f""" + +{code} + +""" + + +def _generate_truss_example(truss_directory: str): + print("Generating example for: ", truss_directory) + doc_information = yaml.safe_load( + fetch_file_contents(f"{truss_directory}/{DOC_CONFIGURATION_FILE}") + ) + + example_destination = _get_example_destination(truss_directory) + + header = f"""--- +title: "{doc_information["title"]}" +description: "{doc_information["description"]}" +--- +""" + + path_in_examples_repo = "/".join(Path(truss_directory).parts[1:]) + link_to_github = f""" + + + """ + files_to_scrape = doc_information["files"] + + full_content, code_blocks = zip( + *[ + _extract_mdx_content_and_code(Path(truss_directory) / file, file) + for file in files_to_scrape + ] + ) + + full_code_block = "\n".join(code_blocks) + file_content = "\n".join(full_content) + _generate_request_example_block( + full_code_block + ) + example_content = f"""{header}\n{link_to_github}\n{file_content}""" + path_to_example = Path(example_destination) + path_to_example.parent.mkdir(parents=True, exist_ok=True) + + path_to_example.write_text(example_content) + + +def _format_group_name(group_name: str) -> str: + """ + This function takes the parent directory name in, and converts it + into a more human readable format for the table of contents. + + Note that parent directory names are assumed to be in the format: + * 1_introduction/... (becomes "Introduction") + * 2_image_classification/... (becomes "Image classification") + * 3_llms/... (becomes "LLMs") + """ + lowercase_name = " ".join(group_name.split("_")[1:]) + # Capitalize the first letter. We do this rather than + # use .capitalize() or .title() because we want to preserve + # the case of subsequent letters + return lowercase_name[0].upper() + lowercase_name[1:] + + +def update_toc(example_dirs: List[str]): + """ + Update the table of contents in the README.md file. + + Parameters: + example_dirs: List of directories as strings in the form "truss-examples-2/..." 
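+
+    For example, "truss-examples/3_LLMs/llm" is added to the "Examples" navigation
+    group in docs/mint.json as the page "examples/3_LLMs/llm".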
+ """ + + # Exclude the root directory ("truss_examples") from the path + transformed_example_paths = [Path(example).parts[1:] for example in example_dirs] + + mint_config = json.loads(fetch_file_contents(MINT_CONFIG_PATH)) + navigation = mint_config["navigation"] + + examples_section = [item for item in navigation if item["group"] == "Examples"][0] + + # Sort examples by the group name + examples_section["pages"] = [ + f"examples/{example_path[0]}/{example_path[1]}" + for example_path in sorted( + transformed_example_paths, key=lambda example: example[0] + ) + ] + + serialized_mint_config = json.dumps(mint_config, indent=2) + Path(MINT_CONFIG_PATH).write_text(serialized_mint_config) + + +def generate_truss_examples(): + """ + Walk through the Truss examples repo, and for each + of the examples in the repo, generate documentation. + + Finish the process by updating the table of contents. + """ + clone_repo() + + example_dirs = _fetch_example_dirs(DESTINATION_DIR) + for truss_directory in example_dirs: + _generate_truss_example(truss_directory) + + update_toc(example_dirs) + + +if __name__ == "__main__": + generate_truss_examples() diff --git a/docs/examples/1_introduction/getting-started-bert.mdx b/docs/examples/1_introduction/getting-started-bert.mdx new file mode 100644 index 000000000..9273b2538 --- /dev/null +++ b/docs/examples/1_introduction/getting-started-bert.mdx @@ -0,0 +1,166 @@ +--- +title: "Getting Started" +description: "Building your first Truss" +--- + + + + + +In this example, we go through building your first Truss model. We'll be using the HuggingFace transformers +library to build a text classification model that can detect sentiment of text. + +# Step 1: Implementing the model + +Set up imports for this model. In this example, we simply use the HuggingFace transformers library. + +```python model/model.py +from transformers import pipeline + +``` +Every Truss model must implement a `Model` class. This class must have: + * an `__init__` function + * a `load` function + * a `predict` function + +In the `__init__` function, set up any variables that will be used in the `load` and `predict` functions. + +```python model/model.py +class Model: + def __init__(self, **kwargs): + self._model = None + +``` +In the `load` function of the Truss, we implement logic +involved in downloading the model and loading it into memory. +For this Truss example, we define a HuggingFace pipeline, and choose +the `text-classification` task, which uses BERT for text classification under the hood. + +Note that the the load function runs when the + +```python model/model.py + def load(self): + self._model = pipeline("text-classification") + +``` +In the `predict` function of the Truss, we implement logic related +to actual inference. For this example, we just call the HuggingFace pipeline +that we set up in the `load` function. + +```python model/model.py + def predict(self, model_input): + return self._model(model_input) +``` + +# Step 2: Writing the config.yaml + +Each Truss has a config.yaml file where we can configure +options related to the deployment. It's in this file where +we can define requirements, resources, and runtime options like +secrets and environment variables + +### Basic Options + +In this section, we can define basic metadata about the model, +such as the name, and the Python version to build with. 
+ +```yaml config.yaml +model_name: bert +python_version: py310 +model_metadata: + example_model_input: { "text": "Hello my name is {MASK}" } + + +``` +### Set up python requirements + +In this section, we define any pip requirements that +we need to run the model. To run this, we need PyTorch +and Tranformers. + +```yaml config.yaml +requirements: + - torch==2.0.1 + - transformers==4.33.2 + +``` +### Configure the resources needed + +In this section, we can configure resources +needed to deploy this model. Here, we have no need for a GPU +so we leave the accelerator section blank. + +```yaml config.yaml +resources: + accelerator: null + cpu: '1' + memory: 2Gi + use_gpu: false + +``` +### Other config options + +Truss also has provisions for adding other runtime options +packages. In this example, we don't need these, so we leave +this empty for now. + +```yaml config.yaml +secrets: {} +system_packages: [] +environment_variables: {} +external_package_dirs: [] + +``` +# Step 3: Deploying & running inference + +Deploy the model with the following command: + +```bash +$ truss push +``` + +And then you can performance inference with: +``` +$ truss predict -d '"Truss is awesome!"' +``` + + +```python model/model.py +from transformers import pipeline + +class Model: + def __init__(self, **kwargs): + self._model = None + + def load(self): + self._model = pipeline("text-classification") + + def predict(self, model_input): + return self._model(model_input) +``` +```yaml config.yaml +model_name: bert +python_version: py310 +model_metadata: + example_model_input: { "text": "Hello my name is {MASK}" } + + +requirements: + - torch==2.0.1 + - transformers==4.33.2 + +resources: + accelerator: null + cpu: '1' + memory: 2Gi + use_gpu: false + +secrets: {} +system_packages: [] +environment_variables: {} +external_package_dirs: [] + +``` + diff --git a/docs/examples/2_image_classification/clip.mdx b/docs/examples/2_image_classification/clip.mdx new file mode 100644 index 000000000..9f7d104f9 --- /dev/null +++ b/docs/examples/2_image_classification/clip.mdx @@ -0,0 +1,181 @@ +--- +title: "Image Classification" +description: "Deploy a CLIP model to classify images" +--- + + + + + +In this example, we create a Truss that uses [CLIP](https://openai.com/research/clip) to classify images, +using some pre-defined labels. The input to this Truss will be an image, the output will be a classification. + +One of the major things to note about this example is that since the inputs are images, we need to have +some mechanism for downloading the image. To accomplish this, we have the user pass a downloadable URL to +the Truss, and in the Truss code, download the image. To do this efficiently, we will make use of the +`preprocess` method in Truss. + +# Set up imports and constants + +For our CLIP Truss, we will be using the Hugging Face transformers library, as well as +`pillow` for image processing. + +```python model/model.py +import requests +from typing import Dict +from PIL import Image +from transformers import CLIPProcessor, CLIPModel + +``` +This is the CLIP model from Hugging Face that we will use for this example. + +```python model/model.py +CHECKPOINT = "openai/clip-vit-base-patch32" + +``` +# Define the Truss + +In the `load` method, we load in the pretrained CLIP model from the +Hugging Face checkpoint specified above. 
+ +```python model/model.py +class Model: + def __init__(self, **kwargs) -> None: + self._processor = None + self._model = None + + def load(self): + """ + Loads the CLIP model and processor checkpoints. + """ + self._model = CLIPModel.from_pretrained(CHECKPOINT) + self._processor = CLIPProcessor.from_pretrained(CHECKPOINT) + +``` +In the `preprocess` method, we download the image from the url and preprocess it. +This method is a part of the Truss class, and is designed to be used for any logic +involving IO, like in this case, downloading an image. + +It is called before the predict method in a separate thread, and is not subject to the same +concurrency limits as the predict method, so can be called many times in parallel. +This makes it such that the predict method is not unnecessarily blocked on IO-bound +tasks, and helps improve the throughput of the Truss. See our [guide to concurrency](../guides/concurrency) +for more info. + +```python model/model.py + def preprocess(self, request: Dict) -> Dict: + + image = Image.open(requests.get(request.pop("url"), stream=True).raw) + request["inputs"] = self._processor( + text=["a photo of a cat", "a photo of a dog"], # Define preset labels to use + images=image, + return_tensors="pt", + padding=True + ) + return request + +``` +The `predict` method performs the actual inference, and outputs a probability associated +with each of the labels defined earlier. + +```python model/model.py + def predict(self, request: Dict) -> Dict: + """ + This performs the actual classification. The predict method is subject to + the predict concurrency constraints. + """ + outputs = self._model(**request["inputs"]) + logits_per_image = outputs.logits_per_image + return logits_per_image.softmax(dim=1).tolist() +``` + +# Set up the config.yaml + +The main section that needs to be filled out +to run CLIP is the `requirements` section, where we need +to include `transformers`, for the model pipeline, and `pillow`, +for image processing. + +```yaml config.yaml +model_name: clip-example +requirements: +- transformers==4.32.0 +- pillow==10.0.0 +- torch==2.0.1 +model_metadata: + example_model_input: {"url": "https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?auto=compress&cs=tinysrgb&w=1600"} +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` +# Deploy the model + +Deploy the CLIP model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"image_url": "https://source.unsplash.com/gKXKBY-C-Dk/300x300""]}' --published +``` + + +```python model/model.py +import requests +from typing import Dict +from PIL import Image +from transformers import CLIPProcessor, CLIPModel + +CHECKPOINT = "openai/clip-vit-base-patch32" + +class Model: + def __init__(self, **kwargs) -> None: + self._processor = None + self._model = None + + def load(self): + """ + Loads the CLIP model and processor checkpoints. + """ + self._model = CLIPModel.from_pretrained(CHECKPOINT) + self._processor = CLIPProcessor.from_pretrained(CHECKPOINT) + + def preprocess(self, request: Dict) -> Dict: + + image = Image.open(requests.get(request.pop("url"), stream=True).raw) + request["inputs"] = self._processor( + text=["a photo of a cat", "a photo of a dog"], # Define preset labels to use + images=image, + return_tensors="pt", + padding=True + ) + return request + + def predict(self, request: Dict) -> Dict: + """ + This performs the actual classification. 
The predict method is subject to + the predict concurrency constraints. + """ + outputs = self._model(**request["inputs"]) + logits_per_image = outputs.logits_per_image + return logits_per_image.softmax(dim=1).tolist() +``` +```yaml config.yaml +model_name: clip-example +requirements: +- transformers==4.32.0 +- pillow==10.0.0 +- torch==2.0.1 +model_metadata: + example_model_input: {"url": "https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?auto=compress&cs=tinysrgb&w=1600"} +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` + diff --git a/docs/examples/3_LLMs/llm-with-streaming.mdx b/docs/examples/3_LLMs/llm-with-streaming.mdx new file mode 100644 index 000000000..23a72e7fe --- /dev/null +++ b/docs/examples/3_LLMs/llm-with-streaming.mdx @@ -0,0 +1,263 @@ +--- +title: "LLM with Streaming" +description: "Building an LLM with streaming output" +--- + + + + + +In this example, we go through a Truss that serves an LLM, and streams the output to the client. + +# Why Streaming? + +For certain ML models, generations can take a long time. Especially with LLMs, a long output could take +10 - 20 seconds to generate. However, because LLMs generate tokens in sequence, useful output can be +made available to users sooner. To support this, in Truss, we support streaming output. In this example, +we build a Truss that streams the output of the Falcon-7B model. + +# Set up the imports and key constants + +In this example, we use the HuggingFace transformers library to build a text generation model. + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer +from typing import Dict +from threading import Thread + +``` +We use the instruct version of the Falcon-7B model, and have some defaults +for inference parameters. + +```python model/model.py +CHECKPOINT = "tiiuae/falcon-7b-instruct" +DEFAULT_MAX_NEW_TOKENS = 150 +DEFAULT_TOP_P = 0.95 + + +``` +# Define the load function + +In the `load` function of the Truss, we implement logic +involved in downloading the model and loading it into memory. + +```python model/model.py +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT) +``` + + +```python model/model.py + self.tokenizer.pad_token = self.tokenizer.eos_token_id + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) +``` +# Define the predict function + +In the `predict` function of the Truss, we implement the actual +inference logic. The two main steps are: +* Tokenize the input +* Call the model's `generate` function, ensuring that we pass a +`TextIteratorStreamer`. This is what gives us streaming output, and +and also do this in a Thread, so that it does not block the main +invocation. +* Return a generator that iterates over the `TextIteratorStreamer` object + +```python model/model.py + def predict(self, request: Dict) -> Dict: + prompt = request.pop("prompt") + inputs = self.tokenizer( + prompt, + return_tensors="pt", + max_length=512, + truncation=True, + padding=True + ) + input_ids = inputs["input_ids"].to("cuda") + +``` +Instantiate the Streamer object, which we'll later use for +returning the output to users. 
+ +```python model/model.py + streamer = TextIteratorStreamer(self.tokenizer) + generation_config = GenerationConfig( + temperature=1, + top_p=DEFAULT_TOP_P, + top_k=40, + ) + +``` +When creating the generation parameters, ensure to pass the `streamer` object +that we created previously. + +```python model/model.py + with torch.no_grad(): + generation_kwargs = { + "input_ids": input_ids, + "generation_config": generation_config, + "return_dict_in_generate": True, + "output_scores": True, + "pad_token_id": self.tokenizer.eos_token_id, + "max_new_tokens": DEFAULT_MAX_NEW_TOKENS, + "streamer": streamer + } + +``` +Spawn a thread to run the generation, so that it does not block the main +thread. + +```python model/model.py + thread = Thread( + target=self.model.generate, + kwargs=generation_kwargs + ) + thread.start() + +``` +In Truss, the way to achieve streaming output is to return a generator +that yields content. In this example, we yield the output of the `streamer`, +which produces output and yields it until the generation is complete. + +We define this `inner` function to create our generator. + +```python model/model.py + def inner(): + for text in streamer: + yield text + thread.join() + + return inner() +``` + +# Setting up the config.yaml + +Running Falcon 7B requires torch, transformers, +and a few other related libraries. + +```yaml config.yaml +model_name: "LLM with Streaming" +model_metadata: + example_model_input: {"prompt": "what is the meaning of life"} +requirements: +- torch==2.0.1 +- peft==0.4.0 +- scipy==1.11.1 +- sentencepiece==0.1.99 +- accelerate==0.21.0 +- bitsandbytes==0.41.1 +- einops==0.6.1 +- transformers==4.31.0 +``` +## Configure resources for Falcon + +Note that we need an A10G to run this model. + +```yaml config.yaml +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` + + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer +from typing import Dict +from threading import Thread + +CHECKPOINT = "tiiuae/falcon-7b-instruct" +DEFAULT_MAX_NEW_TOKENS = 150 +DEFAULT_TOP_P = 0.95 + + +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT) + self.tokenizer.pad_token = self.tokenizer.eos_token_id + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + device_map="auto", + ) + def predict(self, request: Dict) -> Dict: + prompt = request.pop("prompt") + inputs = self.tokenizer( + prompt, + return_tensors="pt", + max_length=512, + truncation=True, + padding=True + ) + input_ids = inputs["input_ids"].to("cuda") + + streamer = TextIteratorStreamer(self.tokenizer) + generation_config = GenerationConfig( + temperature=1, + top_p=DEFAULT_TOP_P, + top_k=40, + ) + + with torch.no_grad(): + generation_kwargs = { + "input_ids": input_ids, + "generation_config": generation_config, + "return_dict_in_generate": True, + "output_scores": True, + "pad_token_id": self.tokenizer.eos_token_id, + "max_new_tokens": DEFAULT_MAX_NEW_TOKENS, + "streamer": streamer + } + + thread = Thread( + target=self.model.generate, + kwargs=generation_kwargs + ) + thread.start() + + def inner(): + for text in streamer: + yield text + thread.join() + + return inner() +``` +```yaml config.yaml +model_name: "LLM with Streaming" +model_metadata: + example_model_input: {"prompt": "what is the meaning of life"} 
+requirements: +- torch==2.0.1 +- peft==0.4.0 +- scipy==1.11.1 +- sentencepiece==0.1.99 +- accelerate==0.21.0 +- bitsandbytes==0.41.1 +- einops==0.6.1 +- transformers==4.31.0 +resources: + cpu: "3" + memory: 14Gi + use_gpu: true + accelerator: A10G +``` + diff --git a/docs/examples/3_LLMs/llm.mdx b/docs/examples/3_LLMs/llm.mdx new file mode 100644 index 000000000..927017b85 --- /dev/null +++ b/docs/examples/3_LLMs/llm.mdx @@ -0,0 +1,205 @@ +--- +title: "LLM" +description: "Building an LLM" +--- + + + + + +In this example, we go through a Truss that serves an LLM. We +use the model Mistral-7B, which is a general-purpose LLM that +can used for a variety of tasks, like summarization, question-answering, +translation, and others. + +# Set up the imports and key constants + +In this example, we use the Huggingface transformers library to build a text generation model. + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + + +``` +We use the 7B version of the Mistral model. + +```python model/model.py +CHECKPOINT = "mistralai/Mistral-7B-v0.1" + +``` +# Define the `Model` class and load function + +In the `load` function of the Truss, we implement logic involved in +downloading and setting up the model. For this LLM, we use the `Auto` +classes in `transformers` to instantiate our Mistral model. + +```python model/model.py +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto") + + self.tokenizer = AutoTokenizer.from_pretrained( + CHECKPOINT, + ) + +``` +# Define the `predict` function + +In the predict function, we implement the actual inference logic. The steps +here are: + * Set up the generation params. We have defaults for both of these, but +adjusting the values will have an impact on the model output + * Tokenize the input + * Generate the output + * Use tokenizer to decode the output + +```python model/model.py + def predict(self, request: dict): + prompt = request.pop("prompt") + generate_args = { + "max_new_tokens": request.get("max_new_tokens", 128), + "temperature": request.get("temperature", 1.0), + "top_p": request.get("top_p", 0.95), + "top_k": request.get("top_p", 50), + "repetition_penalty": 1.0, + "no_repeat_ngram_size": 0, + "use_cache": True, + "do_sample": True, + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + } + + input_ids = self.tokenizer( + prompt, + return_tensors="pt" + ).input_ids.cuda() + + with torch.no_grad(): + output = self.model.generate( + inputs=input_ids, + **generate_args + ) + return self.tokenizer.decode(output[0]) +``` + +# Setting up the config.yaml + +Running Mistral 7B requires a few libraries, such as +`torch`, `transformers` and a couple others. + +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Mistral 7B +python_version: py311 +requirements: +- transformers==4.34.0 +- sentencepiece==0.1.99 +- accelerate==0.23.0 +- torch==2.0.1 +``` +## Configure resources for Mistral + +Note that we need an A10G to run this model. 
+ +```yaml config.yaml +resources: + accelerator: A10G + use_gpu: true +secrets: {} +system_packages: [] +``` +# Deploy the model + +Deploy the model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"inputs": "What is a large language model?"}' +``` + + +```python model/model.py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + + +CHECKPOINT = "mistralai/Mistral-7B-v0.1" + +class Model: + def __init__(self, **kwargs) -> None: + self.tokenizer = None + self.model = None + + def load(self): + self.model = AutoModelForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto") + + self.tokenizer = AutoTokenizer.from_pretrained( + CHECKPOINT, + ) + + def predict(self, request: dict): + prompt = request.pop("prompt") + generate_args = { + "max_new_tokens": request.get("max_new_tokens", 128), + "temperature": request.get("temperature", 1.0), + "top_p": request.get("top_p", 0.95), + "top_k": request.get("top_p", 50), + "repetition_penalty": 1.0, + "no_repeat_ngram_size": 0, + "use_cache": True, + "do_sample": True, + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + } + + input_ids = self.tokenizer( + prompt, + return_tensors="pt" + ).input_ids.cuda() + + with torch.no_grad(): + output = self.model.generate( + inputs=input_ids, + **generate_args + ) + return self.tokenizer.decode(output[0]) +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Mistral 7B +python_version: py311 +requirements: +- transformers==4.34.0 +- sentencepiece==0.1.99 +- accelerate==0.23.0 +- torch==2.0.1 +resources: + accelerator: A10G + use_gpu: true +secrets: {} +system_packages: [] +``` + diff --git a/docs/examples/4_image_generation/sdxl.mdx b/docs/examples/4_image_generation/sdxl.mdx new file mode 100644 index 000000000..e7b7565c5 --- /dev/null +++ b/docs/examples/4_image_generation/sdxl.mdx @@ -0,0 +1,374 @@ +--- +title: "Text-to-image" +description: "Building a text-to-image model with SDXL" +--- + + + + + +In this example, we go through a Truss that serves a text-to-image model. We +use SDXL 1.0, which is one of the highest performing text-to-image models out +there today. + +# Set up imports and torch settings + +In this example, we use the Huggingface diffusers library to build our text-to-image model. + +```python model/model.py +from diffusers import DiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler +import torch +import base64 +from PIL import Image +from io import BytesIO +from typing import Any +import time + +``` +The following line is needed to enable TF32 on NVIDIA GPUs + +```python model/model.py +torch.backends.cuda.matmul.allow_tf32 = True + +``` +# Define the `Model` class and load function + +In the `load` function of the Truss, we implement logic involved in +downloading and setting up the model. For this model, we use the +`DiffusionPipeline` class in `diffusers` to instantiate our SDXL pipeline, +and configure a number of relevant parameters. + +See the [diffusers docs](https://huggingface.co/docs/diffusers/index) for details +on all of these parameters. 
+ +```python model/model.py +class Model: + def __init__(self, **kwargs): + self._model = None + + def load(self): + vae = AutoencoderKL.from_pretrained( + "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 + ) + self.pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + vae=vae, + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, + ) + + self.pipe.unet.to(memory_format=torch.channels_last) + self.pipe.to('cuda') + self.pipe.enable_xformers_memory_efficient_attention() + + self.refiner = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-1.0", + text_encoder_2=self.pipe.text_encoder_2, + vae=self.pipe.vae, + torch_dtype=torch.float16, + use_safetensors=True, + variant="fp16", + ) + self.refiner.to("cuda") + self.refiner.enable_xformers_memory_efficient_attention() + +``` +This is a utility function for converting PIL image to base64. + +```python model/model.py + def convert_to_b64(self, image: Image) -> str: + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + return img_b64 + +``` +# Define the predict function + +The `predict` function contains the actual inference logic. The steps here are: + * Setting up the generation params. We have defaults for these, and some, such +as the `scheduler`, are somewhat complicated + * Running the Diffusion Pipeline + * If `use_refiner` is set to `True`, we run the refiner model on the output + * Convert the resulting image to base64 and return it + +```python model/model.py + def predict(self, model_input: Any) -> Any: + prompt = model_input.pop("prompt") + negative_prompt = model_input.pop("negative_prompt", None) + use_refiner = model_input.pop("use_refiner", True) + num_inference_steps = model_input.pop("num_inference_steps", 30) + denoising_frac = model_input.pop("denoising_frac", 0.8) + end_cfg_frac = model_input.pop("end_cfg_frac", 0.4) + guidance_scale = model_input.pop("guidance_scale", 7.5) + seed = model_input.pop("seed", None) + + scheduler = model_input.pop("scheduler", None) # Default: EulerDiscreteScheduler (works pretty well) + +``` +Set the scheduler based on the user's input. +See possible schedulers: https://huggingface.co/docs/diffusers/api/schedulers/overview for +what the tradeoffs are. 
+ +```python model/model.py + if scheduler == "DPM++ 2M": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config) + elif scheduler == "DPM++ 2M Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, use_karras_sigmas=True) + elif scheduler == "DPM++ 2M SDE Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True) + + generator = None + if seed is not None: + torch.manual_seed(seed) + generator = [torch.Generator(device="cuda").manual_seed(seed)] + + if not use_refiner: + denoising_frac = 1.0 + + start_time = time.time() + image = self.pipe(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_end=denoising_frac, + guidance_scale=guidance_scale, + output_type="latent" if use_refiner else "pil").images[0] + scheduler = self.pipe.scheduler + if use_refiner: + self.refiner.scheduler = scheduler + image = self.refiner(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_start=denoising_frac, + guidance_scale=guidance_scale, + image=image[None, :]).images[0] + +``` +Convert the results to base64, and return them. + +```python model/model.py + b64_results = self.convert_to_b64(image) + end_time = time.time() - start_time + + print(f"Time: {end_time:.2f} seconds") + + return {"status": "success", "data": b64_results, "time": end_time} +``` + +# Setting up the config yaml + +Running SDXL requires a handful of Python libraries, including +diffusers, transformers, and others. + +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "A tree in a field under the night sky", "use_refiner": true} +model_name: Stable Diffusion XL +python_version: py39 +requirements: +- transformers==4.34.0 +- accelerate==0.23.0 +- safetensors==0.4.0 +- git+https://github.com/basetenlabs/diffusers.git@9a353290b1497023d4745a719ec02c50f680499a +- invisible-watermark>=0.2.0 +- xformers==0.0.22 +``` +## Configuring resources for SDXL 1.0 + +Note that we need an A10G to run this model. + +```yaml config.yaml +resources: + accelerator: A10G + cpu: 3500m + memory: 20Gi + use_gpu: true +secrets: {} +``` +## System Packages + +Running diffusers requires `ffmpeg` and a couple other system +packages. + +```yaml config.yaml +system_packages: +- ffmpeg +- libsm6 +- libxext6 +``` +## Enabling Caching + +SDXL is a very large model, and downloading it could take up to 10 minutes. This means +that the cold start time for this model is long. We can solve that by using our build +caching feature. This moves the model download to the build stage of your model-- +caching the model will take about 10 minutes initially but you will get ~9s cold starts +subsequently. 
+ +To enable caching, add the following to the config: +```yaml +hf_cache: + - repo_id: madebyollin/sdxl-vae-fp16-fix + allow_patterns: + - config.json + - diffusion_pytorch_model.safetensors + - repo_id: stabilityai/stable-diffusion-xl-base-1.0 + allow_patterns: + - "*.json" + - "*.fp16.safetensors" + - sd_xl_base_1.0.safetensors + - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0 + allow_patterns: + - "*.json" + - "*.fp16.safetensors" + - sd_xl_refiner_1.0.safetensors +``` +# Deploy the model + +Deploy the model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"prompt": "A tree in a field under the night sky", "use_refiner": true}' +``` + + +```python model/model.py +from diffusers import DiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler +import torch +import base64 +from PIL import Image +from io import BytesIO +from typing import Any +import time + +torch.backends.cuda.matmul.allow_tf32 = True + +class Model: + def __init__(self, **kwargs): + self._model = None + + def load(self): + vae = AutoencoderKL.from_pretrained( + "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 + ) + self.pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + vae=vae, + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, + ) + + self.pipe.unet.to(memory_format=torch.channels_last) + self.pipe.to('cuda') + self.pipe.enable_xformers_memory_efficient_attention() + + self.refiner = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-1.0", + text_encoder_2=self.pipe.text_encoder_2, + vae=self.pipe.vae, + torch_dtype=torch.float16, + use_safetensors=True, + variant="fp16", + ) + self.refiner.to("cuda") + self.refiner.enable_xformers_memory_efficient_attention() + + def convert_to_b64(self, image: Image) -> str: + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + return img_b64 + + def predict(self, model_input: Any) -> Any: + prompt = model_input.pop("prompt") + negative_prompt = model_input.pop("negative_prompt", None) + use_refiner = model_input.pop("use_refiner", True) + num_inference_steps = model_input.pop("num_inference_steps", 30) + denoising_frac = model_input.pop("denoising_frac", 0.8) + end_cfg_frac = model_input.pop("end_cfg_frac", 0.4) + guidance_scale = model_input.pop("guidance_scale", 7.5) + seed = model_input.pop("seed", None) + + scheduler = model_input.pop("scheduler", None) # Default: EulerDiscreteScheduler (works pretty well) + + if scheduler == "DPM++ 2M": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config) + elif scheduler == "DPM++ 2M Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, use_karras_sigmas=True) + elif scheduler == "DPM++ 2M SDE Karras": + self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True) + + generator = None + if seed is not None: + torch.manual_seed(seed) + generator = [torch.Generator(device="cuda").manual_seed(seed)] + + if not use_refiner: + denoising_frac = 1.0 + + start_time = time.time() + image = self.pipe(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_end=denoising_frac, + guidance_scale=guidance_scale, + 
output_type="latent" if use_refiner else "pil").images[0] + scheduler = self.pipe.scheduler + if use_refiner: + self.refiner.scheduler = scheduler + image = self.refiner(prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + end_cfg = end_cfg_frac, + num_inference_steps=num_inference_steps, + denoising_start=denoising_frac, + guidance_scale=guidance_scale, + image=image[None, :]).images[0] + + b64_results = self.convert_to_b64(image) + end_time = time.time() - start_time + + print(f"Time: {end_time:.2f} seconds") + + return {"status": "success", "data": b64_results, "time": end_time} +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "A tree in a field under the night sky", "use_refiner": true} +model_name: Stable Diffusion XL +python_version: py39 +requirements: +- transformers==4.34.0 +- accelerate==0.23.0 +- safetensors==0.4.0 +- git+https://github.com/basetenlabs/diffusers.git@9a353290b1497023d4745a719ec02c50f680499a +- invisible-watermark>=0.2.0 +- xformers==0.0.22 +resources: + accelerator: A10G + cpu: 3500m + memory: 20Gi + use_gpu: true +secrets: {} +system_packages: +- ffmpeg +- libsm6 +- libxext6 +``` + diff --git a/docs/examples/6_high_performance/cached-weights.mdx b/docs/examples/6_high_performance/cached-weights.mdx new file mode 100644 index 000000000..83c39a912 --- /dev/null +++ b/docs/examples/6_high_performance/cached-weights.mdx @@ -0,0 +1,212 @@ +--- +title: "Fast Cold Starts with Cached Weights" +description: "Deploy a language model, with the model weights cached at build time" +--- + + + + + +In this example, we go through a Truss that serves an LLM, and _caches_ the weights +at build time. Loading model weights for any model can often be the most time-consuming +part of starting a model. Caching the weights at build time means that the weights +will be baked into the Truss image, and will be available _immediately_ when your model +replica starts. This means that **cold starts** will be _significantly faster_ with this approach. + +# Implementing the `Model` class + +With weight caching, you don't have to change anything about how the `Model` class +is implemented to take advantage of the weight caching. + +```python model/model.py +from typing import Dict, List + +import torch +from transformers import LlamaForCausalLM, LlamaTokenizer + +DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant." 
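+
+# Llama 2 chat checkpoints expect prompts wrapped in [INST] ... [/INST] markers,
+# with the system prompt in its own delimited section; `format_prompt` below
+# assembles that structure around the user's prompt.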
+ +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" +CHECKPOINT = "NousResearch/Llama-2-7b-chat-hf" + + +def format_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str: + return f"{B_INST} {B_SYS} {system_prompt} {E_SYS} {prompt} {E_INST}" + + +class Model: + def __init__(self, **kwargs) -> None: + self.model = None + self.tokenizer = None + + def load(self): + self.model = LlamaForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto" + ) + self.tokenizer = LlamaTokenizer.from_pretrained( + CHECKPOINT + ) + + def predict(self, request: Dict) -> Dict[str, List]: + prompt = request.pop("prompt") + input_ids = self.tokenizer(format_prompt(prompt), return_tensors="pt").input_ids.cuda() + + outputs = self.model.generate( + inputs=input_ids, + do_sample=True, + num_beams=1, + max_new_tokens=100 + ) + response = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] + + return {"response": response} + + +``` + +# Setting up the config.yaml + +The `config.yaml` file is where you need to include the changes to +actually cache the weights at build time. + +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Llama with Cached Weights +python_version: py39 +requirements: +- accelerate==0.21.0 +- safetensors==0.3.2 +- torch==2.0.1 +- transformers==4.34.0 +- sentencepiece==0.1.99 +- protobuf==4.24.4 +``` +# Configuring the hf_cache + +To cache model weights, set the `hf_cache` key. +The `repo_id` field allows you to specify a Huggingface +repo to pull down and cache at build-time, and the `ignore_patterns` +field allows you to specify files to ignore. If this is specified, then +this repo won't have to be pulled during runtime. + +Check out the [guide](https://truss.baseten.co/guides/model-cache) for more info. + +```yaml config.yaml +hf_cache: +- repo_id: "NousResearch/Llama-2-7b-chat-hf" + ignore_patterns: + - "*.bin" + +``` +The remaining config options are again, similar to what you would +configure for the model without the weight caching. + +```yaml config.yaml +resources: + cpu: "4" + memory: 30Gi + use_gpu: True + accelerator: A10G +secrets: {} +``` +# Deploy the model + +Deploy the model like you would other Trusses, with: +```bash +$ truss push +``` + + The build step will take longer than with the normal + Llama Truss, since bundling the model weights is now happening during the build. + The deploy step & scale-ups will happen much faster with this approach. + + +You can then invoke the model with: +```bash +$ truss predict -d '{"inputs": "What is a large language model?"}' +``` + + +```python model/model.py +from typing import Dict, List + +import torch +from transformers import LlamaForCausalLM, LlamaTokenizer + +DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant." 
+ +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" +CHECKPOINT = "NousResearch/Llama-2-7b-chat-hf" + + +def format_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str: + return f"{B_INST} {B_SYS} {system_prompt} {E_SYS} {prompt} {E_INST}" + + +class Model: + def __init__(self, **kwargs) -> None: + self.model = None + self.tokenizer = None + + def load(self): + self.model = LlamaForCausalLM.from_pretrained( + CHECKPOINT, + torch_dtype=torch.float16, + device_map="auto" + ) + self.tokenizer = LlamaTokenizer.from_pretrained( + CHECKPOINT + ) + + def predict(self, request: Dict) -> Dict[str, List]: + prompt = request.pop("prompt") + input_ids = self.tokenizer(format_prompt(prompt), return_tensors="pt").input_ids.cuda() + + outputs = self.model.generate( + inputs=input_ids, + do_sample=True, + num_beams=1, + max_new_tokens=100 + ) + response = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] + + return {"response": response} + + +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "What is the meaning of life?"} +model_name: Llama with Cached Weights +python_version: py39 +requirements: +- accelerate==0.21.0 +- safetensors==0.3.2 +- torch==2.0.1 +- transformers==4.34.0 +- sentencepiece==0.1.99 +- protobuf==4.24.4 +hf_cache: +- repo_id: "NousResearch/Llama-2-7b-chat-hf" + ignore_patterns: + - "*.bin" + +resources: + cpu: "4" + memory: 30Gi + use_gpu: True + accelerator: A10G +secrets: {} +``` + diff --git a/docs/examples/6_high_performance/tgi.mdx b/docs/examples/6_high_performance/tgi.mdx new file mode 100644 index 000000000..96bb35f9f --- /dev/null +++ b/docs/examples/6_high_performance/tgi.mdx @@ -0,0 +1,102 @@ +--- +title: "High Performance LLM with TGI" +description: "Deploy a language model with TGI" +--- + + + + + +[TGI](https://github.com/huggingface/text-generation-inference/tree/main) is a model server optimized for +language models. In this example, we put together a Truss that serves the model Falcon 7B using TGI. + +For Trusses that use TGI, there is no user code to define, so there is only a config.yaml file. +You can run any model that supports TGI. + +```yaml config.yaml +build: + arguments: +``` +The endpoint argument has two options: + * **generate**: This returns the response as JSON when the full response is generated + * **generate_stream**: If you choose this option, results will be streamed as they are ready, using + server-sent events + +```yaml config.yaml + endpoint: generate_stream +``` +Select the model that you'd like to use with TGI + +```yaml config.yaml + model_id: tiiuae/falcon-7b +``` +The `model_server` parameter allows you to specify a supported backend (in this example, TGI) + +```yaml config.yaml + model_server: TGI +``` +Another important parameter to configure if you are choosing TGI is the `predict_concurrency`. +One of the main benefits of TGI is continuous batching -- in which multiple requests can be +processed at the same time. Without `predict_concurrency` set to a high enough number, you cannot take advantage of this +feature. + +```yaml config.yaml +runtime: + predict_concurrency: 128 +``` +The remaining config options listed are standard Truss Config options. 
+ +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"inputs": "what is the meaning of life"} +model_name: Falcon-TGI +python_version: py39 +requirements: [] +resources: + accelerator: A10G + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` +# Deploy the model + +Deploy the TGI model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"inputs": "What is a large language model?", "parameters": {"max_new_tokens": 128, "sample": true}}' --published +``` + + +```yaml config.yaml +build: + arguments: + endpoint: generate_stream + model_id: tiiuae/falcon-7b + model_server: TGI +runtime: + predict_concurrency: 128 +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"inputs": "what is the meaning of life"} +model_name: Falcon-TGI +python_version: py39 +requirements: [] +resources: + accelerator: A10G + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` + diff --git a/docs/examples/6_high_performance/vllm.mdx b/docs/examples/6_high_performance/vllm.mdx new file mode 100644 index 000000000..94be1b90a --- /dev/null +++ b/docs/examples/6_high_performance/vllm.mdx @@ -0,0 +1,100 @@ +--- +title: "High Performance LLM with vLLM" +description: "Deploy a language model with vLLM" +--- + + + + + +[vLLM](https://github.com/vllm-project/vllm) is a Python-based package that optimizes the Attention +layer in Transformer models. By better allocating memory used during the attention computation, +vLLM can reduce the memory footprint of a model and significantly improve inference speed. Truss +supports vLLM out of the box, so you can deploy vLLM-optimized models with ease. + + + +```yaml config.yaml +build: + arguments: +``` +vLLM supports multiple types of endpoints: + * Completions -- Follows the same API as the [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions) + * ChatCommpletions -- Follows the same API as the [OpenAI ChatCompletions API](https://platform.openai.com/docs/api-reference/chat) + +```yaml config.yaml + endpoint: Completions +``` +Select which vLLM-compatible model you'd like to use + +```yaml config.yaml + model: facebook/opt-125M +``` +The `model_server` parameter allows you to specify TGI + +```yaml config.yaml + model_server: VLLM +``` +Another important parameter to configure if you are choosing vLLM is the `predict_concurrency`. +One of the main benefits of vLLM is continuous batching -- in which multiple requests can be +processed at the same time. Without predict_concurrency, you cannot take advantage of this +feature. + +```yaml config.yaml +runtime: + predict_concurrency: 128 +``` +The remaining config options listed are standard Truss Config options. 
+ +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: {} +model_name: OPT-125M +python_version: py39 +requirements: [] +resources: + accelerator: T4 + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` +# Deploy the model + +Deploy the vLLM model like you would other Trusses, with: +```bash +$ truss push +``` +You can then invoke the model with: +```bash +$ truss predict -d '{"prompt": "What is a large language model?", "model": "facebook/opt-125M"}' --published +``` + + +```yaml config.yaml +build: + arguments: + endpoint: Completions + model: facebook/opt-125M + model_server: VLLM +runtime: + predict_concurrency: 128 +environment_variables: {} +external_package_dirs: [] +model_metadata: {} +model_name: OPT-125M +python_version: py39 +requirements: [] +resources: + accelerator: T4 + cpu: "4" + memory: 16Gi + use_gpu: true +secrets: {} +system_packages: [] +``` + diff --git a/docs/examples/7_misc/private-huggingface-model.mdx b/docs/examples/7_misc/private-huggingface-model.mdx new file mode 100644 index 000000000..e84e459b6 --- /dev/null +++ b/docs/examples/7_misc/private-huggingface-model.mdx @@ -0,0 +1,137 @@ +--- +title: "Private Hugging Face Model" +description: "Load a model that requires authentication with Hugging Face" +--- + + + + + +In this example, we build a Truss that uses a model that +requires Hugging Face authentication. The steps for loading a model +from Hugging Face are: + +1. Create an [access token](https://huggingface.co/settings/tokens) on your Hugging Face account. +2. Add the `hf_access_token`` key to your config.yaml secrets and value to your [Baseten account](https://app.baseten.co/settings/secrets). +3. Add `use_auth_token` when creating the actual model. + +# Setting up the model + +In this example, we use a private version of the [BERT base model](https://huggingface.co/bert-base-uncased). +The model is publicly available, but for the purposes of our example, we copied it into a private +model repository, with the path "baseten/docs-example-gated-model". + +First, like with other Hugging Face models, start by importing the `pipeline` function from the +transformers library, and defining the `Model` class. + +```python model/model.py +from transformers import pipeline + +class Model: +``` +An important step in loading a model that requires authentication is to +have access to the secrets defined for this model. We pull these out of +the keyword args in the `__init__` function. + +```python model/model.py + def __init__(self, **kwargs) -> None: + self._secrets = kwargs["secrets"] + self._model = None + + def load(self): +``` +Ensure that when you define the `pipeline`, we use the `use_auth_token` parameter, +pass the `hf_access_token` secret that is on our Baseten account. + +```python model/model.py + self._model = pipeline( + "fill-mask", + model="baseten/docs-example-gated-model", + use_auth_token=self._secrets["hf_access_token"] + ) + + def predict(self, model_input): + return self._model(model_input) +``` + +# Setting up the config.yaml + +The main things that need to be set up in the config are +`requirements`, which need to include Hugging Face transformers, +and the secrets. 
+ +```yaml config.yaml +environment_variables: {} +model_name: private-model +python_version: py39 +requirements: +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "1" + memory: 2Gi + use_gpu: false + accelerator: null +``` +To make the `hf_access_token` available in the Truss, we need to include +it in the config. Setting the value to `null` here means that the value +will be set by the Baseten secrets manager. + +```yaml config.yaml +secrets: + hf_access_token: null +system_packages: [] +``` +# Deploying the model + +An important note for deploying models with secrets is that +you must use the `--trusted` flag to give the model access to +secrets stored on the remote secrets manager. + +```bash +$ truss push --trusted +``` + +After the model finishes deploying, you can invoke it with: +```bash +$ truss predict -d '"It is a [MASK] world"' +``` + + +```python model/model.py +from transformers import pipeline + +class Model: + def __init__(self, **kwargs) -> None: + self._secrets = kwargs["secrets"] + self._model = None + + def load(self): + self._model = pipeline( + "fill-mask", + model="baseten/docs-example-gated-model", + use_auth_token=self._secrets["hf_access_token"] + ) + + def predict(self, model_input): + return self._model(model_input) +``` +```yaml config.yaml +environment_variables: {} +model_name: private-model +python_version: py39 +requirements: +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "1" + memory: 2Gi + use_gpu: false + accelerator: null +secrets: + hf_access_token: null +system_packages: [] +``` + diff --git a/docs/examples/7_misc/system-packages.mdx b/docs/examples/7_misc/system-packages.mdx new file mode 100644 index 000000000..029f5f6ee --- /dev/null +++ b/docs/examples/7_misc/system-packages.mdx @@ -0,0 +1,140 @@ +--- +title: "Model with system packages" +description: "Deploy a model with both Python and system dependencies" +--- + + + + + +In this example, we build a Truss with a model that requires specific system packages. + +To add system packages to your Truss, you can add a `system_packages` key to your config.yaml file, +for instance: +To add system packages to your model serving environment, open config.yaml and +update the system_packages key with a list of apt-installable Debian packages: + +```yaml config.yaml +system_packages: + - tesseract-ocr +``` + +For this example, we use the [LayoutLM Document QA](https://huggingface.co/impira/layoutlm-document-qa) model, +a multimodal model that answers questions about provided invoice documents. This model requires a system +package, tesseract-ocr, which needs to be included in the model serving environment. + +# Setting up the model.py + +For this model, we use the HuggingFace transformers library, and the the document-question-answering task. + +```python model/model.py +from transformers import pipeline + + +class Model: + def __init__(self, **kwargs) -> None: + self._model = None + + def load(self): + self._model = pipeline( + "document-question-answering", + model="impira/layoutlm-document-qa", + ) + + def predict(self, model_input): + return self._model( + model_input["url"], + model_input["prompt"] + ) +``` + +# Setting up the config.yaml file + +The main items that need to be configured in the config.yaml file are requirements +and `system_packages` sections. 
+ +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"url": "https://templates.invoicehome.com/invoice-template-us-neat-750px.png", "prompt": "What is the invoice number?"} +model_name: LayoutLM Document QA +python_version: py39 +``` +Specify the versions of the Python requirements that are needed. +Always pin exact versions for your Python dependencies. The ML/AI space moves fast, so you want to have an up-to-date version of each package while also being protected from breaking changes. + +```yaml config.yaml +requirements: +- Pillow==10.0.0 +- pytesseract==0.3.10 +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "4" + memory: 16Gi + use_gpu: false + accelerator: null +secrets: {} +``` +The system_packages section is the other important bit here, you can +add any package that's available via `apt` on Debian. + +```yaml config.yaml +system_packages: +- tesseract-ocr +``` +# Deploy the model +```bash +$ truss push +``` +You can then invoke the model with: +``` +$ truss predict -d '{"url": "https://templates.invoicehome.com/invoice-template-us-neat-750px.png", "prompt": "What is the invoice number?"}' +``` + + +```python model/model.py +from transformers import pipeline + + +class Model: + def __init__(self, **kwargs) -> None: + self._model = None + + def load(self): + self._model = pipeline( + "document-question-answering", + model="impira/layoutlm-document-qa", + ) + + def predict(self, model_input): + return self._model( + model_input["url"], + model_input["prompt"] + ) +``` +```yaml config.yaml +environment_variables: {} +external_package_dirs: [] +model_metadata: + example_model_input: {"url": "https://templates.invoicehome.com/invoice-template-us-neat-750px.png", "prompt": "What is the invoice number?"} +model_name: LayoutLM Document QA +python_version: py39 +requirements: +- Pillow==10.0.0 +- pytesseract==0.3.10 +- torch==2.0.1 +- transformers==4.30.2 +resources: + cpu: "4" + memory: 16Gi + use_gpu: false + accelerator: null +secrets: {} +system_packages: +- tesseract-ocr +``` + diff --git a/docs/mint.json b/docs/mint.json index d291aade6..9e0149f8b 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -65,13 +65,16 @@ { "group": "Examples", "pages": [ - "examples/private-model", - "examples/system-packages", - "examples/streaming", - "examples/pre-process", - "examples/performance/cached-weights", - "examples/performance/tgi-server", - "examples/performance/vllm-server" + "examples/1_introduction/getting-started-bert", + "examples/2_image_classification/clip", + "examples/3_LLMs/llm", + "examples/3_LLMs/llm-with-streaming", + "examples/4_image_generation/sdxl", + "examples/6_high_performance/vllm", + "examples/6_high_performance/tgi", + "examples/6_high_performance/cached-weights", + "examples/7_misc/private-huggingface-model", + "examples/7_misc/system-packages" ] }, { @@ -169,7 +172,7 @@ ], "analytics": { "gtm": { - "tagId": "GTM-WXD4NQTW" + "tagId": "GTM-WXD4NQTW" } } } diff --git a/docs/welcome.mdx b/docs/welcome.mdx index 8c9b1ddf0..e6340350a 100644 --- a/docs/welcome.mdx +++ b/docs/welcome.mdx @@ -11,9 +11,9 @@ description: "The simplest way to serve AI/ML models in production" See Trusses for popular models including: -* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/llama-2-70b-chat)) -* 🎨 [Stable Diffusion 
XL](https://github.com/basetenlabs/truss-examples/tree/main/stable-diffusion-xl-1.0)
-* 🗣 [Whisper](https://github.com/basetenlabs/truss-examples/tree/main/whisper-truss)
+* 🦙 [Llama 2 7B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-7b-chat) ([13B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-13b-chat)) ([70B](https://github.com/basetenlabs/truss-examples/tree/main/model_library/llama-2-70b-chat))
+* 🎨 [Stable Diffusion XL](https://github.com/basetenlabs/truss-examples/tree/main/model_library/stable-diffusion-xl-1.0)
+* 🗣 [Whisper](https://github.com/basetenlabs/truss-examples/tree/main/model_library/whisper-truss)

 and [dozens more examples on GitHub](https://github.com/basetenlabs/truss-examples/).

diff --git a/pyproject.toml b/pyproject.toml
index 2be5f8db8..b6d27d31f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "truss"
-version = "0.7.12rc1"
+version = "0.7.13"
 description = "A seamless bridge from model development to model delivery"
 license = "MIT"
 readme = "README.md"