update: docker image & readme
c0sogi committed Aug 16, 2023
1 parent 7da7b60 commit 1f111ba
Showing 8 changed files with 67 additions and 66 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -9,4 +9,5 @@ repositories/
.vscode/
.test-venv/
.temp/
PRIVATE_*
PRIVATE_*
private/*
19 changes: 18 additions & 1 deletion build_shared_lib.py
@@ -1,5 +1,6 @@
# flake8: noqa

from argparse import ArgumentParser
from llama_api.utils.llama_cpp import (
build_shared_lib,
CPU_ARGS, # Only use CPU
@@ -8,8 +9,24 @@
)
from os import environ

ARGS = {
"CPU": CPU_ARGS,
"METAL": METAL_ARGS,
"CUBLAS": CUBLAS_ARGS,
"CUDA": CUBLAS_ARGS,
}

if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
"-b",
"--build_type",
type=lambda s: str(s).upper(),
default="CPU",
choices=["CPU", "METAL", "CUBLAS", "CUDA"],
help="Build type",
)

environ["FORCE_CMAKE"] = "1"
environ["CMAKE_ARGS"] = CPU_ARGS # EDIT THIS LINE TO CHANGE BUILD TYPE !!!
environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
build_shared_lib()
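
For context, a minimal sketch of what the new `--build_type` flag resolves to at runtime, using only names that appear in this diff (invoking the script as `python build_shared_lib.py -b cuda` is an assumption about the intended entry point):

```python
# Programmatic equivalent of the assumed invocation `python build_shared_lib.py -b cuda`:
# "CUDA" and "CUBLAS" both map to CUBLAS_ARGS in the new ARGS dict.
# CUBLAS_ARGS is assumed to come from the same module as CPU_ARGS in the import block above.
from os import environ

from llama_api.utils.llama_cpp import CUBLAS_ARGS, build_shared_lib

environ["FORCE_CMAKE"] = "1"
environ["CMAKE_ARGS"] = CUBLAS_ARGS  # swap in CPU_ARGS or METAL_ARGS for other backends
build_shared_lib()
```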
2 changes: 1 addition & 1 deletion docker-compose.persistent.yml
@@ -5,7 +5,7 @@ volumes:

services:
llama-api:
image: cosogi/llama-api:230814
image: cosogi/llama-api:230816
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -2,7 +2,7 @@ version: '3'

services:
llama-api:
image: cosogi/llama-api:230814
image: cosogi/llama-api:230816
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
4 changes: 2 additions & 2 deletions llama_api/server/routers/v1.py
@@ -2,7 +2,7 @@
Use same format as OpenAI API"""


from asyncio import CancelledError, Task, create_task
from asyncio import Task, create_task
from contextlib import asynccontextmanager, contextmanager
from dataclasses import dataclass, field
from functools import partial
@@ -138,7 +138,7 @@ async def get_wix_with_semaphore(
wix_meta = wix_metas[choice(candidates)]
async with wix_meta.semaphore:
if await request.is_disconnected():
raise CancelledError("Request is disconnected")
return
wix_meta.processed_key = request_key
yield wix_meta.wix

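For readers of this hunk: `Request.is_disconnected()` is Starlette's check for whether the client has already gone away. A minimal, hypothetical sketch of the early-return pattern the change adopts (the endpoint and handler below are illustrative, not the project's actual code):

```python
from fastapi import FastAPI, Request

app = FastAPI()

@app.post("/v1/completions")
async def completions(request: Request) -> dict:
    # If the client has already disconnected, skip the work and return an
    # empty response instead of raising a cancellation error.
    if await request.is_disconnected():
        return {}
    return {"detail": "completion would be generated here"}
```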
4 changes: 2 additions & 2 deletions main.py
@@ -62,7 +62,7 @@
help="Apply xformers' memory-efficient optimizations",
)
parser.add_argument(
"--disable-embeddings",
"--no-embed",
action="store_true",
help="Disable embeddings endpoint",
)
@@ -80,6 +80,6 @@
"LLAMA_API_XFORMERS": "1" if args.xformers else "",
"LLAMA_API_API_KEY": args.api_key or "",
"FORCE_CUDA": "1" if args.force_cuda else "",
"LLAMA_API_EMBEDDINGS": "1" if not args.disable_embeddings else "",
"LLAMA_API_EMBEDDINGS": "1" if not args.no_embed else "",
},
)
73 changes: 41 additions & 32 deletions readme.md
@@ -3,11 +3,51 @@ This project aims to provide a simple way to run **LLama.cpp** and **Exllama** m

You can use this server to run the models in your own application, or use it as a standalone API server!

## Before you start

1. **Python 3.8 / 3.9 / 3.10 / 3.11** is required to run the server. You can download it from https://www.python.org/downloads/

2. **llama.cpp**: To use llama.cpp, **Windows** users need to download [CMake](https://cmake.org/download/) to compile the library.

3. **ExLlama**: To use ExLlama, install the prerequisites of this [repository](https://github.com/turboderp/exllama). **Windows** users may also need to install both [MSVC 2022](https://visualstudio.microsoft.com/downloads/) and [CUDA Toolkit 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive).



## How to run server

All required packages will be installed automatically with this command.

```bash
python -m main --install-pkgs
```

If you already have all required packages installed, you can skip the installation with this command.
```bash
python -m main
```
Options:
```
usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-i] [-c] [--skip-torch-install] [--skip-tf-install] [--skip-compile] [-k API_KEY] [-x] [--no-embed]
options:
-h, --help show this help message and exit
-p PORT, --port PORT Port to run the server on; default is 8000
-w MAX_WORKERS, --max-workers MAX_WORKERS
Maximum number of process workers to run; default is 1
-i, --install-pkgs Install all required packages before running the server
-c, --force-cuda Force CUDA version of pytorch to be used when installing pytorch, e.g. torch==2.0.1+cu118
--skip-torch-install Skip installing pytorch, if `install-pkgs` is set
--skip-tf-install Skip installing tensorflow, if `install-pkgs` is set
--skip-compile Skip compiling the shared library of LLaMA C++ code
-k API_KEY, --api-key API_KEY
API key to use for the server
-x, --xformers Apply xformers' memory-efficient optimizations
--no-embed Disable embeddings endpoint
```

### Unique features

1. **On-Demand Model Loading**
> **Caution:** There is a bug where VRAM is not freed when a model is unloaded if **cuBLAS** is used in **llama.cpp**. This issue has been reported for a while and remains unresolved.
- The project tries to load the model defined in `model_definitions.py` into the worker process when its name is sent in the request JSON body. The worker keeps using the cached model, and when a request for a different model comes in, it unloads the existing model and loads the new one (see the request sketch just after this list).

2. **Parallelism and Concurrency Enabled**
@@ -16,13 +56,6 @@ You can use this server to run the models in your own application, or use it as
3. **Auto Dependency Installation**
- The project automatically performs git clones and installs the required dependencies, including **pytorch** and **tensorflow**, when the server is started. This is done by checking the `pyproject.toml` or `requirements.txt` file in the root directory of this project or of the other repositories. `pyproject.toml` is parsed into `requirements.txt` with `poetry`. If you want to add more dependencies, simply add them to the file.
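
As a concrete illustration of feature 1, here is a hedged client sketch; port 8000 comes from the run instructions above, while the endpoint path and the model name `orca_mini_7b` are assumptions for illustration only:

```python
# Hypothetical request sketch: the "model" field in the JSON body is what
# selects (and, if needed, loads) a model defined in model_definitions.py.
import requests

response = requests.post(
    "http://localhost:8000/v1/completions",  # path assumed to follow the OpenAI format
    json={
        "model": "orca_mini_7b",       # must match a definition in model_definitions.py
        "prompt": "Hello, my name is",
        "max_tokens": 30,
    },
    timeout=600,
)
print(response.json())
```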

## Before you start

1. **Python 3.8 / 3.9 / 3.10 / 3.11** is required to run the server. You can download it from https://www.python.org/downloads/

2. **llama.cpp**: To use llama.cpp, **Windows** users need to download [CMake](https://cmake.org/download/) to compile the library.

3. **ExLlama**: To use ExLlama, install the prerequisites of this [repository](https://github.com/turboderp/exllama). **Windows** users may also need to install both [MSVC 2022](https://visualstudio.microsoft.com/downloads/) and [CUDA Toolkit 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive).

## How to download the models

@@ -62,31 +95,7 @@ The path of the model has to be the folder name. Let's say, **orca_mini_7b**, wh
## Where to define the models
Define llama.cpp & exllama models in `model_definitions.py`. You can define all necessary parameters to load the models there. Refer to the example in the file.
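
Below is a purely illustrative sketch of what a definition might look like. The import path, class names, and fields are assumptions rather than the project's actual schema, so defer to the example already shipped in `model_definitions.py` for the real names:

```python
# model_definitions.py -- hypothetical sketch only; the class names, fields, and
# import path are assumed for illustration.
from llama_api.schemas.models import ExllamaModel, LlamaCppModel  # assumed import path

# Assumption: the variable name is the value clients send as "model" in requests.
orca_mini_7b = LlamaCppModel(
    model_path="orca_mini_7b",        # folder name of the downloaded model, as described above
    max_total_tokens=4096,
)
orca_mini_7b_gptq = ExllamaModel(
    model_path="orca_mini_7b_gptq",
    max_total_tokens=4096,
)
```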

## How to run server

All required packages will be installed automatically with this command.

```bash
python -m main --install-pkgs
```

If you already have all required packages installed, you can skip the installation with this command.
```bash
python -m main
```
Options:
```
-h, --help show this help message and exit
-p PORT, --port PORT Port to run the server on; default is 8000
-w MAX_WORKERS, --max-workers MAX_WORKERS
Maximum number of process workers to run; default is 1
--install-pkgs Install all required packages before running the server
--force-cuda Force CUDA version of pytorch to be used when installing pytorch, e.g. torch==2.0.1+cu118
--skip-torch-install Skip installing pytorch, if `install-pkgs` is set
--skip-tf-install Skip installing tensorflow, if `install-pkgs` is set
-k API_KEY, --api-key API_KEY
API key to use for the server
```

## Usage: Text Completion
Now, you can send a request to the server.
26 changes: 0 additions & 26 deletions requirements-all.txt

This file was deleted.
