From 729f154501833e7bc23f1cde66a3f45c494fa8ce Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 21:41:03 +0900 Subject: [PATCH 01/18] Refactored code --- llama_api/modules/exllama.py | 215 +++++++++++++++++++++----------- llama_api/modules/llama_cpp.py | 20 ++- llama_api/server/pools/llama.py | 13 +- llama_api/server/routers/v1.py | 2 +- llama_api/utils/completions.py | 22 +--- 5 files changed, 175 insertions(+), 97 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index bd868c3..2c7e81f 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -2,6 +2,7 @@ # flake8: noqa from gc import collect from os import environ +from time import time from ..utils.logger import ApiLogger @@ -21,6 +22,7 @@ Iterable, Iterator, List, + Literal, Optional, Tuple, Union, @@ -32,13 +34,13 @@ from torch.nn.functional import log_softmax from ..logits.base import BaseLogitProcessor -from ..schemas.models import ExllamaModel -from ..utils.completions import ( - make_chat_completion, - make_chat_completion_chunk, - make_completion, - make_completion_chunk, +from ..schemas.api import ( + ChatCompletion, + ChatCompletionChunk, + Completion, + CompletionChunk, ) +from ..schemas.models import ExllamaModel from ..utils.dependency import import_repository from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator @@ -440,32 +442,41 @@ def generate_completion_with_streaming( ) -> Iterator["CompletionChunk"]: completion_id = settings.completion_id model = self.model_name - last_token: Optional[str] = None - generated_text: str = "" + generated_text = "" # type: str for token in _generate_text_with_streaming( self, prompt=prompt, settings=settings ): generated_text += token - if last_token is not None: - yield make_completion_chunk( - id=completion_id, - model=model, - text=last_token, - finish_reason=None, - ) - last_token = token - yield make_completion_chunk( - id=completion_id, - model=model, - text=last_token if last_token is not None else "", - finish_reason="length" - if self._completion_status.get( - completion_id, - _encode(self.tokenizer, generated_text).shape[1], - ) - >= settings.max_tokens - else "stop", - ) + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": token, + "index": 0, + "logprobs": None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + } def generate_completion( self, prompt: str, settings: "TextGenerationSettings" @@ -476,20 +487,31 @@ def generate_completion( self, prompt=prompt, settings=settings ) ) - n_prompt_tokens = _encode(self.tokenizer, prompt).shape[1] - n_completion_tokens = self._completion_status.get( + prompt_tokens = _encode(self.tokenizer, prompt).shape[1] + completion_tokens = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) - return make_completion( - id=completion_id, - model=self.model_name, - text=generated_text, - prompt_tokens=n_prompt_tokens, - completion_tokens=n_completion_tokens, - finish_reason="length" - if n_completion_tokens >= settings.max_tokens - else "stop", - ) + return { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": 
self.model_name, + "choices": [ + { + "text": generated_text, + "index": 0, + "logprobs": None, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } def generate_chat_completion_with_streaming( self, @@ -499,31 +521,52 @@ def generate_chat_completion_with_streaming( completion_id = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) model = self.model_name - last_token: Optional[str] = None - generated_text: str = "" + generated_text = "" # type: str + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None, + } + ], + } for token in _generate_text_with_streaming( self, prompt=prompt, settings=settings ): generated_text += token - if last_token is not None: - yield make_chat_completion_chunk( - id=completion_id, - model=model, - content=last_token, - finish_reason=None, - ) - last_token = token - yield make_chat_completion_chunk( - id=completion_id, - model=model, - content=last_token if last_token is not None else "", - finish_reason="length" - if self._completion_status.get( - completion_id, - _encode(self.tokenizer, generated_text).shape[1], - ) - else "stop", - ) + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"content": token}, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {}, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + } def generate_chat_completion( self, @@ -541,16 +584,29 @@ def generate_chat_completion( completion_tokens = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) - return make_chat_completion( - id=completion_id, - model=self.model_name, - content=generated_text, - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - finish_reason="length" - if completion_tokens >= settings.max_tokens - else "stop", - ) + return { + "id": completion_id, + "object": "chat.completion", + "created": int(time()), + "model": self.model_name, + "choices": [ + { + "message": { + "role": "assistant", + "content": generated_text, + }, + "index": 0, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } def encode(self, text: str) -> List[int]: assert self._tokenizer is not None, "Tokenizer is not initialized" @@ -618,3 +674,20 @@ def _encode( ids = result[0] if isinstance(result, tuple) else result assert isinstance(ids, Tensor) return ids + + +def _get_finish_reason( + cg: ExllamaCompletionGenerator, + settings: TextGenerationSettings, + completion_id: str, + generated_text: str, +) -> Literal["length", "stop"]: + return ( + "length" + if cg._completion_status.get( + completion_id, + _encode(cg.tokenizer, generated_text).shape[1], + ) + >= settings.max_tokens + else "stop" + ) diff --git a/llama_api/modules/llama_cpp.py 
b/llama_api/modules/llama_cpp.py index d88e3e4..4652d63 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,6 +1,7 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature from typing import ( + Callable, Iterator, List, Optional, @@ -27,22 +28,33 @@ logger = ApiLogger(__name__) logger.info("🦙 llama-cpp-python repository found!") -build_shared_lib(logger=logger) with import_repository( git_path="https://github.com/abetlen/llama-cpp-python", disk_path="repositories/llama_cpp", options=["--recurse-submodules"], ): + build_shared_lib(logger=logger) from repositories.llama_cpp import llama_cpp +class LogitsProcessorList( + List[Callable[[List[int], List[float]], List[float]]] +): + def __call__( + self, input_ids: List[int], scores: List[float] + ) -> List[float]: + for processor in self: + scores = processor(input_ids, scores) + return scores + + def _create_completion( client: llama_cpp.Llama, prompt: str, stream: bool, settings: TextGenerationSettings, ) -> Union[Completion, Iterator[CompletionChunk]]: - logit_processors = llama_cpp.LogitsProcessorList( + logit_processors = LogitsProcessorList( [ processor.without_torch for processor in BaseCompletionGenerator.get_logit_processors( @@ -53,7 +65,7 @@ def _create_completion( ) ] ) - return client.create_completion( # type: ignore + return client.create_completion( stream=stream, prompt=prompt, max_tokens=settings.max_tokens, @@ -69,7 +81,7 @@ def _create_completion( mirostat_mode=settings.mirostat_mode, mirostat_tau=settings.mirostat_tau, mirostat_eta=settings.mirostat_eta, - logits_processor=logit_processors if logit_processors else None, + logits_processor=logit_processors if logit_processors else None, # type: ignore # noqa: E501 stop=settings.stop, ) diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index c3f5756..6df7fec 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -70,10 +70,15 @@ def get_model_names() -> List[str]: def get_model(model_name: str) -> "BaseLLMModel": """Get a model from the model_definitions.py file""" - with logger.log_any_error( - f"Error getting model: {model_name}", exc_info=None - ): - return getattr(model_definitions, model_name) + try: + llm_model = getattr(model_definitions, model_name) + assert isinstance( + llm_model, BaseLLMModel + ), f"Not a LLM model: {model_name}" + return llm_model + except Exception as e: + logger.error(e) + raise ValueError(f"Model path does not exist: {model_name}") def get_completion_generator( diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index b2aeb47..a9a4fba 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -178,7 +178,7 @@ async def get_event_publisher( CreateChatCompletionRequest, CreateCompletionRequest, ], - inner_send_chan: MemoryObjectSendStream, + inner_send_chan: MemoryObjectSendStream[bytes], task: "Task[None]", interrupt_signal: Event, iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], diff --git a/llama_api/utils/completions.py b/llama_api/utils/completions.py index d12b8ed..4353b81 100644 --- a/llama_api/utils/completions.py +++ b/llama_api/utils/completions.py @@ -1,5 +1,5 @@ from time import time -from typing import TYPE_CHECKING, Iterator, Literal, Optional +from typing import Iterator, Literal, Optional from uuid import uuid4 from ..schemas.api import ( @@ -70,13 +70,10 @@ def make_chat_completion( def make_chat_completion_from_json( - json_data: dict, 
# type: ignore + json_data: dict, index: int = 0, ) -> ChatCompletion: """Make ChatCompletion from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: ChatCompletion = json_data # type: ignore usage = json_data.get("usage") if usage is None: usage = CompletionUsage( @@ -146,12 +143,9 @@ def make_chat_completion_chunk( def make_chat_completion_chunk_from_json( - json_data: dict, # type: ignore + json_data: dict, ) -> ChatCompletionChunk: """Make ChatCompletionChunk from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: ChatCompletionChunk = json_data # type: ignore delta = json_data["choices"][0]["delta"] function_call = delta.get("function_call") if function_call: @@ -203,12 +197,9 @@ def make_completion_chunk( def make_completion_chunk_from_json( - json_data: dict, # type: ignore + json_data: dict, ) -> CompletionChunk: """Make CompletionChunk from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: CompletionChunk = json_data # type: ignore choice = json_data["choices"][0] return make_completion_chunk( id=json_data["id"], @@ -259,13 +250,10 @@ def make_completion( def make_completion_from_json( - json_data: dict, # type: ignore + json_data: dict, index: int = 0, ) -> Completion: """Make Completion from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: Completion = json_data # type: ignore usage = json_data.get("usage") if usage is None: usage = CompletionUsage( From ac8318e9e3241266ac28df50b2b68838de9976f5 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 21:43:14 +0900 Subject: [PATCH 02/18] Added lock to completion generator --- llama_api/mixins/lock.py | 21 +++++++++++++++++++++ llama_api/modules/base.py | 7 ++++++- llama_api/server/pools/llama.py | 22 ++++++++++++---------- 3 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 llama_api/mixins/lock.py diff --git a/llama_api/mixins/lock.py b/llama_api/mixins/lock.py new file mode 100644 index 0000000..9941e10 --- /dev/null +++ b/llama_api/mixins/lock.py @@ -0,0 +1,21 @@ +from threading import Lock +from typing import Optional + + +class LockMixin: + _lock: Optional[Lock] = None + + @property + def lock(self) -> Lock: + """Get the lock.""" + if self._lock is None: + self._lock = Lock() + return self._lock + + def acquire_lock(self) -> None: + """Acquire the lock.""" + self.lock.acquire() + + def release_lock(self) -> None: + """Release the lock.""" + self.lock.release() diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 61b1e7e..424395f 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -4,6 +4,7 @@ from typing import Any, Iterator, List, TypeVar from ..mixins.interrupt import InterruptMixin +from ..mixins.lock import LockMixin from ..mixins.logits import LogitsMixin from ..mixins.prompt_utils import PromptUtilsMixin from ..schemas.api import ( @@ -35,7 +36,11 @@ def model_path_resolved(self) -> str: class BaseCompletionGenerator( - ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin + ABC, + PromptUtilsMixin, + InterruptMixin, + LogitsMixin, + LockMixin, ): """Base class for all completion generators.""" diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index 6df7fec..c2db637 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -56,7 +56,9 @@ def completion_generator_manager( """Context manager for completion generators.""" completion_generator = 
get_completion_generator(body) completion_generator.interrupt_signal = interrupt_signal + completion_generator.acquire_lock() yield completion_generator + completion_generator.release_lock() completion_generator.interrupt_signal = None @@ -92,19 +94,19 @@ def get_completion_generator( If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" + # Check if the model is an OpenAI model + openai_replacement_models: Dict[str, str] = getattr( + model_definitions, "openai_replacement_models", {} + ) + if body.model in openai_replacement_models: + body.model = openai_replacement_models[body.model] + body.is_openai = True + llm_model = get_model(body.model) + with logger.log_any_error( - f"Error getting a completion generator of {body.model}" + f"Error getting a completion generator of {body.model}", ): - # Check if the model is an OpenAI model - openai_replacement_models: Dict[str, str] = getattr( - model_definitions, "openai_replacement_models", {} - ) - if body.model in openai_replacement_models: - body.model = openai_replacement_models[body.model] - body.is_openai = True - # Check if the model is defined in LLMModels enum - llm_model = get_model(body.model) # Check if the model is cached. If so, return the cached one. for completion_generator in completion_generators: From 38427681ce273f96b2a9e2e8282e8133562b76e8 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 22:34:51 +0900 Subject: [PATCH 03/18] fix typo --- llama_api/modules/exllama.py | 2 +- llama_api/utils/completions.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 2c7e81f..b773261 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -678,7 +678,7 @@ def _encode( def _get_finish_reason( cg: ExllamaCompletionGenerator, - settings: TextGenerationSettings, + settings: "TextGenerationSettings", completion_id: str, generated_text: str, ) -> Literal["length", "stop"]: diff --git a/llama_api/utils/completions.py b/llama_api/utils/completions.py index 4353b81..6b696f3 100644 --- a/llama_api/utils/completions.py +++ b/llama_api/utils/completions.py @@ -305,9 +305,7 @@ def convert_text_completion_chunks_to_chat( choices=[ ChatCompletionChunkChoice( index=0, - delta={ - "role": "assistant", - }, + delta={"role": "assistant"}, finish_reason=None, ) ], From de83eedc16fc707b08206319961a143911ff2ec3 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Fri, 18 Aug 2023 19:20:42 +0900 Subject: [PATCH 04/18] Do git clone repos when server startup --- llama_api/modules/exllama.py | 22 ++++++---------------- llama_api/modules/exllama_lora.py | 16 ++++++++-------- llama_api/modules/llama_cpp.py | 15 +++------------ llama_api/server/app_settings.py | 5 +++++ llama_api/shared/config.py | 21 ++++++++++++++++++++- llama_api/utils/system.py | 29 ++++++++++++++++++++--------- 6 files changed, 62 insertions(+), 46 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index b773261..d3ad093 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,8 +1,6 @@ """Wrapper for exllama to generate text completions.""" # flake8: noqa -from gc import collect from os import environ -from time import time from ..utils.logger import ApiLogger @@ -15,9 +13,10 @@ from ..modules.xformers import hijack_attention_forward hijack_attention_forward() +from gc import collect from pathlib import Path +from time import time from typing import ( - TYPE_CHECKING, 
Dict, Iterable, Iterator, @@ -35,34 +34,25 @@ from ..logits.base import BaseLogitProcessor from ..schemas.api import ( + APIChatMessage, ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk, + TextGenerationSettings, ) from ..schemas.models import ExllamaModel +from ..shared.config import Config from ..utils.dependency import import_repository from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator from .exllama_lora import ExLlamaLora -with import_repository( - git_path="https://github.com/turboderp/exllama", - disk_path="repositories/exllama", -): +with import_repository(**Config.repositories["exllama"]): from repositories.exllama.generator import ExLlamaGenerator from repositories.exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig from repositories.exllama.tokenizer import ExLlamaTokenizer -if TYPE_CHECKING: - from ..schemas.api import ( - APIChatMessage, - ChatCompletion, - ChatCompletionChunk, - Completion, - CompletionChunk, - TextGenerationSettings, - ) assert cuda.is_available(), "CUDA must be available to use ExLlama." diff --git a/llama_api/modules/exllama_lora.py b/llama_api/modules/exllama_lora.py index 7f2c3c9..5e54391 100644 --- a/llama_api/modules/exllama_lora.py +++ b/llama_api/modules/exllama_lora.py @@ -1,20 +1,20 @@ # flake8: noqa -from pathlib import Path -from typing import Dict, Union -from llama_api.utils.dependency import import_repository -with import_repository( - git_path="https://github.com/turboderp/exllama", - disk_path="repositories/exllama", -): - from repositories.exllama.model import ExLlama, Ex4bitLinear, ExLlamaConfig import json +from pathlib import Path +from typing import Dict, Union import torch from safetensors.torch import load_file as safe_load_file from torch import load as load_file +from ..shared.config import Config +from ..utils.dependency import import_repository + +with import_repository(**Config.repositories["exllama"]): + from repositories.exllama.model import Ex4bitLinear, ExLlama, ExLlamaConfig + class ExLlamaLora: lora_config_path: str diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 4652d63..3207bec 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,12 +1,6 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature -from typing import ( - Callable, - Iterator, - List, - Optional, - Union, -) +from typing import Callable, Iterator, List, Optional, Union from ..schemas.api import ( APIChatMessage, @@ -17,6 +11,7 @@ TextGenerationSettings, ) from ..schemas.models import LlamaCppModel +from ..shared.config import Config from ..utils.completions import ( convert_text_completion_chunks_to_chat, convert_text_completion_to_chat, @@ -28,11 +23,7 @@ logger = ApiLogger(__name__) logger.info("🦙 llama-cpp-python repository found!") -with import_repository( - git_path="https://github.com/abetlen/llama-cpp-python", - disk_path="repositories/llama_cpp", - options=["--recurse-submodules"], -): +with import_repository(**Config.repositories["llama_cpp"]): build_shared_lib(logger=logger) from repositories.llama_cpp import llama_cpp diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 27abd75..4bb7891 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -5,9 +5,12 @@ from pathlib import Path from typing import Dict, Literal, Optional +from ..shared.config import Config + from ..utils.dependency import ( get_installed_packages, 
get_poetry_executable, + git_clone, install_all_dependencies, install_package, install_pytorch, @@ -72,6 +75,8 @@ def initialize_before_launch( skip_compile: bool = False, ) -> None: """Initialize the app""" + for git_clone_args in Config.repositories.values(): + git_clone(**git_clone_args) if install_packages: # Install all dependencies if not skip_compile: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 4ecd592..7c4c887 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,5 +1,12 @@ from pathlib import Path -from typing import List, Tuple +from typing import Dict, List, Literal, Optional, Tuple +from typing_extensions import TypedDict + + +class GitCloneArgs(TypedDict): + git_path: str + disk_path: str + options: Optional[List[str]] class Config: @@ -26,3 +33,15 @@ class Config: "q6_K", "q8_0", ] + repositories: Dict[Literal["exllama", "llama_cpp"], GitCloneArgs] = { + "exllama": GitCloneArgs( + git_path="https://github.com/turboderp/exllama", + disk_path="repositories/exllama", + options=["--recurse-submodules"], + ), + "llama_cpp": GitCloneArgs( + git_path="https://github.com/abetlen/llama-cpp-python", + disk_path="repositories/llama_cpp", + options=None, + ), + } diff --git a/llama_api/utils/system.py b/llama_api/utils/system.py index 78ca571..a616a56 100644 --- a/llama_api/utils/system.py +++ b/llama_api/utils/system.py @@ -11,20 +11,31 @@ from queue import Queue ContainerLike = Union["deque", "Queue", "AsyncQueue", list, dict] +cuda_version: Optional[str] = None # Memoization of get_cuda_version() def get_cuda_version() -> Optional[str]: """Returns the current CUDA version as a string. Returns None if nvidia-smi is not available or CUDA is not installed.""" - try: - result = compile(r"CUDA Version: (\d+\.\d+)").search( - check_output(["nvidia-smi"]).decode("utf-8") - ) - if result is None: - return - return result.group(1) - except Exception: - return + global cuda_version + if cuda_version is not None: # If memoized + return cuda_version or None # If cuda_version is "", return None + for cli_args, regex in ( + (["nvcc", "--version"], r"release (\d+\.\d+)"), + (["nvidia-smi"], r"CUDA Version: (\d+\.\d+)"), + ): + try: + # Try to get the CUDA version from the output of the command + cuda_version_match = compile(regex).search( + check_output(cli_args).decode("utf-8") + ) + if cuda_version_match is None: + continue + cuda_version = cuda_version_match.group(1) + return cuda_version + except Exception: + continue + cuda_version = "" def get_vram_usages() -> Optional[List[int]]: From 366c1a4e8ea3aae42b5c193a0e0ebcb49eceed3b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sat, 19 Aug 2023 15:12:06 +0900 Subject: [PATCH 05/18] Optimized performance: llama.cpp & exllama --- llama_api/mixins/completion.py | 44 ++ llama_api/mixins/function_call.py | 371 ++++++++++++++++ llama_api/mixins/prompt_utils.py | 133 +++--- llama_api/modules/base.py | 197 ++++++++- llama_api/modules/exllama.py | 678 ++++++++++-------------------- llama_api/modules/llama_cpp.py | 346 +++++++++------ 6 files changed, 1122 insertions(+), 647 deletions(-) create mode 100644 llama_api/mixins/completion.py create mode 100644 llama_api/mixins/function_call.py diff --git a/llama_api/mixins/completion.py b/llama_api/mixins/completion.py new file mode 100644 index 0000000..312a4b0 --- /dev/null +++ b/llama_api/mixins/completion.py @@ -0,0 +1,44 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Literal, Optional + +from 
..schemas.api import CompletionLogprobs, TextGenerationSettings + + +@dataclass +class CompletionStatus: + # These fields are set by `accept_settings` method. + input_text: str = field(default="", init=False) + input_tokens: int = field(default=0, init=False) + + # These fields are set by `generate_text` method. + generated_text: str = field(default="", init=False) + generated_tokens: int = field(default=0, init=False) + logprobs: Optional[CompletionLogprobs] = field(default=None, init=False) + + +class CompletionMixin: + """A mixin for modules that support completion generation.""" + + _completion_status: Optional[defaultdict[str, CompletionStatus]] = None + + @property + def completion_status(self) -> Dict[str, CompletionStatus]: + """Get the completion status. + key: completion_id + value: CompletionStatus""" + if self._completion_status is None: + self._completion_status = defaultdict(CompletionStatus) + return self._completion_status + + def get_finish_reason( + self, + settings: TextGenerationSettings, + ) -> Literal["length", "stop"]: + """Get the finish reason for the completion.""" + return ( + "length" + if self.completion_status[settings.completion_id].generated_tokens + >= settings.max_tokens + else "stop" + ) diff --git a/llama_api/mixins/function_call.py b/llama_api/mixins/function_call.py new file mode 100644 index 0000000..7b7ae6e --- /dev/null +++ b/llama_api/mixins/function_call.py @@ -0,0 +1,371 @@ +"""Helper classes for wrapping functions in OpenAI's API""" +# flake8: noqa +import json +from inspect import signature +from re import Pattern, compile +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + Union, + overload, +) + +from typing_extensions import Annotated, get_args, get_origin + +from ..schemas.api import ( + ChatCompletion, + ChatCompletionChunk, + CreateChatCompletionRequest, +) +from ..schemas.function_call import ( + FunctionCall, + FunctionCallParameter, + JsonTypes, +) + +# whitespace is constrained to a single space char +# to prevent model "running away" in +# whitespace. Also maybe improves generation quality? +SPACE_RULE: str = '" "?' + +PRIMITIVE_RULES: Dict[str, str] = { + "boolean": '("true" | "false") space', + "number": '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', + "integer": '("-"? ([0-9] | [1-9] [0-9]*)) space', + "string": r""" "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space """, + "null": '"null" space', +} + +INVALID_RULE_CHARS_RE: "Pattern[str]" = compile(r"[^a-zA-Z0-9-]+") +GRAMMAR_LITERAL_ESCAPE_RE: "Pattern[str]" = compile(r'[\r\n"]') +GRAMMAR_LITERAL_ESCAPES: Dict[str, str] = { + "\r": "\\r", + "\n": "\\n", + '"': '\\"', +} + +# Type aliases +SchemaType = Literal[ + "boolean", "number", "integer", "string", "null", "object", "array" +] +SchemaKey = Literal[ + "type", "oneOf", "anyOf", "const", "enum", "properties", "items" +] + + +def _get_type_and_optional(t: Type) -> Tuple[Type, bool]: + """Returns the type and whether it's an Optional type. + This is useful when Type can be Union and you want to know if it's an Optional type. + """ + # Optional[str] is equivalent to Union[str, None], so check if it's a Union type. + if get_origin(t) in (type(Union), Union): + args = get_args(t) # type: Tuple[Type, ...] + # If there's a None type in the Union, it's an Optional type. + optional = type(None) in args + # Return the first argument that isn't None. 
+ first_arg = next(arg for arg in args if arg is not type(None)) + return first_arg, optional + else: + # If it's not a Union type, it's not an Optional type. + return t, False + + +class FunctionCallMixin: + """Contains helper functions converting JSON schemas to BNF grammars + Reference: https://github.com/ggerganov/llama.cpp/pull/1887""" + + _prop_order: Dict[str, int] + _rules: Dict[str, str] + + def invoke_function_call( + self, request: CreateChatCompletionRequest + ) -> ChatCompletion: + """Invoke the function call while chat completion""" + raise NotImplementedError( + "function call is not implemented for this model" + ) + + def invoke_function_call_streaming( + self, request: CreateChatCompletionRequest + ) -> Iterator[ChatCompletionChunk]: + """Invoke the function call while chat completion, streaming the results""" + raise NotImplementedError( + "function call is not implemented for this model" + ) + + @staticmethod + @overload + def from_function_calls( + function_calls: FunctionCall, + prop_order: Optional[Dict[str, int]] = None, + ) -> str: + ... + + @staticmethod + @overload + def from_function_calls( + function_calls: Iterable[FunctionCall], + prop_order: Optional[Dict[str, int]] = None, + ) -> List[str]: + ... + + @staticmethod + def from_function_calls( + function_calls: Union[FunctionCall, Iterable[FunctionCall]], + prop_order: Optional[Dict[str, int]] = None, + ) -> Union[str, List[str]]: + """Parse a FunctionCall object into a BNF grammar""" + if isinstance(function_calls, Iterable): + return_as_list = True + function_calls = list(function_calls) + else: + return_as_list = False + function_calls = [function_calls] + + bnfs = [] # type: List[str] + for function_call in function_calls: + self = FunctionCallMixin() + self._prop_order = prop_order or {} + self._rules = {"space": SPACE_RULE} + parameters = function_call.to_dict().get("parameters") + assert parameters is not None, "function call must have parameters" + self._visit(dict(parameters), "") + bnfs.append(self._format_grammar()) + return bnfs if return_as_list else bnfs[0] + + @staticmethod + @overload + def from_functions( + functions: Callable, + prop_order: Optional[Dict[str, int]] = None, + ) -> str: + ... + + @staticmethod + @overload + def from_functions( + functions: Iterable[Callable], + prop_order: Optional[Dict[str, int]] = None, + ) -> List[str]: + ... 
+ + @staticmethod + def from_functions( + functions: Union[Callable, Iterable[Callable]], + prop_order: Optional[Dict[str, int]] = None, + ) -> Union[str, List[str]]: + """Parse a function into a BNF grammar""" + if isinstance(functions, Iterable): + return_as_list = True + functions = list(functions) + else: + return_as_list = False + functions = [functions] + + function_calls = [] # type: List[FunctionCall] + json_types = get_args(JsonTypes) + line_break_pattern = compile(r"\n\s*") + + for function in functions: + function_call_params = [] # type: List[FunctionCallParameter] + required = [] # type: List[str] + for name, param in signature(function).parameters.items(): + annotation = param.annotation + description = "" # type: str + enum = [] # type: List[Any] + + if get_origin(annotation) is Annotated: + # If the annotation is an Annotated type, + # we need to parse the metadata + _param_args = get_args(param.annotation) + _param_type = _param_args[0] + + for metadata in _param_args[1:]: + if isinstance(metadata, str): + # If the metadata is a string, it's the description + description += metadata + elif isinstance(metadata, Iterable): + # If the metadata is an iterable, it's the enum + enum.extend(metadata) + + else: + _param_type = annotation + param_type, optional = _get_type_and_optional(_param_type) + if not optional: + required.append(name) + if param_type not in json_types: + continue + function_call_params.append( + FunctionCallParameter( + name=name, + type=param_type, + description=description or None, + enum=enum or None, + ) + ) + function_calls.append( + FunctionCall( + name=function.__name__, + description=line_break_pattern.sub( + " ", function.__doc__ + ) + if function.__doc__ + else None, + parameters=function_call_params, + required=required or None, + ) + ) + return FunctionCallMixin.from_function_calls( + function_calls if return_as_list else function_calls[0], + prop_order, + ) + + def _format_literal(self, literal: Any) -> str: + escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or "", + json.dumps(literal), + ) + return f'"{escaped}"' + + def _add_rule(self, name, rule): + esc_name = INVALID_RULE_CHARS_RE.sub("-", name) + if esc_name not in self._rules or self._rules[esc_name] == rule: + key = esc_name + else: + i = 0 + while f"{esc_name}{i}" in self._rules: + i += 1 + key = f"{esc_name}{i}" + self._rules[key] = rule + return key + + def _visit(self, schema: Dict[SchemaKey, Any], name: str) -> str: + schema_type: SchemaType = schema[ + "type" + ] # The "type" key is always present + rule_name: str = name or "root" # root rule is always named "root" + + if "oneOf" in schema or "anyOf" in schema: + # This is a union type + rule: str = " | ".join( + ( + self._visit(alt_schema, f'{name}{"-" if name else ""}{i}') + for i, alt_schema in enumerate( + schema.get("oneOf") or schema["anyOf"] + ) + ) + ) + return self._add_rule(rule_name, rule) + + elif "const" in schema: + # This is a literal + return self._add_rule( + rule_name, self._format_literal(schema["const"]) + ) + + elif "enum" in schema: + # This is a set of literals + rule = " | ".join( + (self._format_literal(v) for v in schema["enum"]) + ) + return self._add_rule(rule_name, rule) + + elif schema_type == "object" and "properties" in schema: + # TODO: `required` keyword + prop_order = self._prop_order + prop_pairs = sorted( + schema["properties"].items(), + # sort by position in prop_order (if specified) then by key + key=lambda kv: (prop_order.get(kv[0], 
len(prop_order)), kv[0]), + ) + + rule = '"{" space' + for i, (prop_name, prop_schema) in enumerate(prop_pairs): + prop_rule_name = self._visit( + prop_schema, f'{name}{"-" if name else ""}{prop_name}' + ) + if i > 0: + rule += ' "," space' + rule += rf' {self._format_literal(prop_name)} space ":" space {prop_rule_name}' + rule += ' "}" space' + + return self._add_rule(rule_name, rule) + + elif schema_type == "array" and "items" in schema: + # TODO `prefixItems` keyword + item_rule_name = self._visit( + schema["items"], f'{name}{"-" if name else ""}item' + ) + rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' + return self._add_rule(rule_name, rule) + + else: + assert ( + schema_type in PRIMITIVE_RULES + ), f"Unrecognized schema: {schema}" + return self._add_rule( + "root" if rule_name == "root" else schema_type, + PRIMITIVE_RULES[schema_type], + ) + + def _format_grammar(self): + return "\n".join( + (f"{name} ::= {rule}" for name, rule in self._rules.items()) + ) + + +if __name__ == "__main__": + # from llama_cpp import LlamaGrammar, Llama + + # Define a python function and parse it into a grammar + def get_current_weather( + location: Annotated[ + str, + "The location to get the current weather for", + ], + unit: Annotated[ + str, + "The unit of temperature to return", + ["fahrenheit", "celsius"], + ], + source: Annotated[ + str, + "The source of the weather information", + ["openweathermap", "weatherapi"], + ] = "openweathermap", + ): + """Get the current weather in a given location""" + + model_path = "C:/Users/sdml/Desktop/orca-mini-3b.ggmlv3.q4_0.bin" + grammar: str = FunctionCallMixin.from_functions(get_current_weather) + print(f"Grammar:\n{grammar}") + # llama_grammar = LlamaGrammar.from_string(grammar, verbose=False) + # llm = Llama(model_path) + # llm.grammar = llama_grammar + # for city in ( + # "London", + # "Paris", + # "New York", + # "Berlin", + # "Tokyo", + # "Sydney", + # "Moscow", + # "Beijing", + # "Cairo", + # "Rome", + # ): + # print(llm(prompt=f"### User: What is the weather in {city} today? ### Assistant:")["choices"][0]["text"]) # type: ignore + + # # Output: + # # { "location": "London", "source": "openweathermap","unit" : "celsius"} diff --git a/llama_api/mixins/prompt_utils.py b/llama_api/mixins/prompt_utils.py index 0b19dec..8573e27 100644 --- a/llama_api/mixins/prompt_utils.py +++ b/llama_api/mixins/prompt_utils.py @@ -1,76 +1,91 @@ -from typing import List +from typing import List, Optional, Set + from ..schemas.api import APIChatMessage, TextGenerationSettings -class PromptUtilsMixin: - user_role: str = "user" - system_role: str = "system" - user_input_role: str = "User" - system_input_role: str = "System" - ai_fallback_input_role: str = "Assistant" +def _get_stop_strings(*roles: str) -> List[str]: + """A helper method to generate stop strings for a given set of roles. + Stop strings are required to stop text completion API from generating + text that does not belong to the current chat turn. + e.g. The common stop string is "### USER:", + which can prevent ai from generating user's message itself.""" - @staticmethod - def get_stop_strings(*roles: str) -> List[str]: - """A helper method to generate stop strings for a given set of roles. - Stop strings are required to stop text completion API from generating - text that does not belong to the current chat turn. - e.g. 
The common stop string is "### USER:", - which can prevent ai from generating user's message itself.""" + prompt_stop = set() + for role in roles: + avoids = ( + f"### {role}:", + f"###{role}:", + ) + prompt_stop.update( + avoids, + map(str.capitalize, avoids), + map(str.upper, avoids), + map(str.lower, avoids), + ) + return list(prompt_stop) - prompt_stop = set() - for role in roles: - avoids = ( - f"{role}:", - f"### {role}:", - f"###{role}:", - ) - prompt_stop.update( - avoids, - map(str.capitalize, avoids), - map(str.upper, avoids), - map(str.lower, avoids), - ) - return list(prompt_stop) - @classmethod +class PromptUtilsMixin: + _stop_set: Optional[Set[str]] = None + _stop_piece_set: Optional[Set[str]] = None + + @staticmethod def convert_messages_into_prompt( - cls, messages: List[APIChatMessage], settings: TextGenerationSettings + messages: List[APIChatMessage], settings: TextGenerationSettings ) -> str: - """A helper method to convert list of messages into one text prompt.""" - - ai_input_role: str = cls.ai_fallback_input_role - chat_history: str = "" - for message in messages: - if message.role.lower() == cls.user_role: - input_role = cls.user_input_role - elif message.role.lower() == cls.system_role: - input_role = cls.system_input_role - else: - input_role = ai_input_role = message.role - chat_history += f"### {input_role}:{message.content}" + """A helper method to convert list of messages into one text prompt. + Save the stop tokens in the settings object for later use.""" - prompt_stop: List[str] = cls.get_stop_strings( - cls.user_input_role, cls.system_input_role, ai_input_role - ) + stops = _get_stop_strings( + *set(message.role for message in messages) + ) # type: List[str] if isinstance(settings.stop, str): - settings.stop = prompt_stop + [settings.stop] + settings.stop = stops + [settings.stop] elif isinstance(settings.stop, list): - settings.stop = prompt_stop + settings.stop + settings.stop = stops + settings.stop else: - settings.stop = prompt_stop - return chat_history + f"### {ai_input_role}:" + settings.stop = stops + return ( + " ".join( + [ + f"### {message.role.upper()}: {message.content}" + for message in messages + ] + ) + + " ### ASSISTANT: " + ) - @staticmethod - def is_possible_to_generate_stops(text: str, stops: List[str]) -> bool: - """A helper method to check if - the decoded text contains any of the stop tokens.""" + def build_stops_from_settings( + self, settings: TextGenerationSettings + ) -> None: + """Pre-calculate sets for stops and the pieces of stops, + to speed up the stop checking process.""" + if isinstance(settings.stop, str): + stops = [settings.stop] # type: List[str] + elif isinstance(settings.stop, list): + stops = settings.stop + else: + stops = [] + self._stop_set = set(stops) + self._stop_piece_set = { + stop[:prefix_idx] + for stop in stops + for prefix_idx in range(1, len(stop)) + } - for stop in stops: - if stop in text or any( - [text.endswith(stop[: i + 1]) for i in range(len(stop))] - ): - return True - return False + def stop_checker(self, text_piece: str) -> Optional[bool]: + """Optimized stop checker for text completion. + Returns False if the text piece ends with any piece of stop. + Returns True if the text piece contains any stop. 
+ Returns None if the text piece does not contain any piece of stop.""" + if any( + text_piece.endswith(stop_piece) + for stop_piece in self._stop_piece_set or () + ): + return False + if any(stop in text_piece for stop in self._stop_set or ()): + return True + return None @staticmethod def raise_for_token_limit(prompt_tokens: int, context_window: int) -> None: diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 424395f..27b41d1 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -1,8 +1,11 @@ from abc import ABC, abstractmethod +from collections import deque from dataclasses import asdict, dataclass from pathlib import Path +from time import time from typing import Any, Iterator, List, TypeVar +from ..mixins.completion import CompletionMixin from ..mixins.interrupt import InterruptMixin from ..mixins.lock import LockMixin from ..mixins.logits import LogitsMixin @@ -41,13 +44,23 @@ class BaseCompletionGenerator( InterruptMixin, LogitsMixin, LockMixin, + CompletionMixin, ): """Base class for all completion generators.""" @abstractmethod def __del__(self): """Clean up resources.""" - ... + + @property + @abstractmethod + def llm_model(self) -> "BaseLLMModel": + """The LLM model used by this generator.""" + + @property + def model_name(self) -> str: + """Identifier for the model used by this generator.""" + return Path(self.llm_model.model_path_resolved).stem @classmethod @abstractmethod @@ -56,31 +69,171 @@ def from_pretrained( ) -> "BaseCompletionGenerator": """Load a pretrained model into RAM.""" - @abstractmethod def generate_completion( self, prompt: str, settings: TextGenerationSettings ) -> Completion: """Generate a completion for a given prompt.""" + completion_id = settings.completion_id + completion_status = self.completion_status[completion_id] + deque( + self.generate_text(prompt=prompt, settings=settings), + maxlen=0, + ) # exhaust the generator + return { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": self.model_name, + "choices": [ + { + "text": completion_status.generated_text, + "index": 0, + "logprobs": completion_status.logprobs + if settings.logprobs + else None, + "finish_reason": self.get_finish_reason(settings), + } + ], + "usage": { + "prompt_tokens": completion_status.input_tokens, + "completion_tokens": completion_status.generated_tokens, + "total_tokens": completion_status.input_tokens + + completion_status.generated_tokens, + }, + } - @abstractmethod def generate_completion_with_streaming( self, prompt: str, settings: TextGenerationSettings ) -> Iterator[CompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" + completion_id = settings.completion_id = ( + "chat" + settings.completion_id + ) + completion_status = self.completion_status[completion_id] + model = self.model_name + for token in self.generate_text(prompt=prompt, settings=settings): + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": token, + "index": 0, + "logprobs": completion_status.logprobs + if settings.logprobs + else None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": completion_status.logprobs + if settings.logprobs + else None, + "finish_reason": self.get_finish_reason(settings), + } + ], + } - @abstractmethod def 
generate_chat_completion( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> ChatCompletion: """Generate a completion for a given prompt.""" + completion_id = settings.completion_id = ( + "chat" + settings.completion_id + ) + completion_status = self.completion_status[completion_id] + deque( + self.generate_text( + prompt=self.convert_messages_into_prompt( + messages, settings=settings + ), + settings=settings, + ), + maxlen=0, + ) # exhaust the generator + return { + "id": completion_id, + "object": "chat.completion", + "created": int(time()), + "model": self.model_name, + "choices": [ + { + "message": { + "role": "assistant", + "content": completion_status.generated_text, + }, + "index": 0, + "finish_reason": self.get_finish_reason(settings), + } + ], + "usage": { + "prompt_tokens": completion_status.input_tokens, + "completion_tokens": completion_status.generated_tokens, + "total_tokens": completion_status.input_tokens + + completion_status.generated_tokens, + }, + } - @abstractmethod def generate_chat_completion_with_streaming( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> Iterator[ChatCompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" + completion_id = settings.completion_id + prompt = self.convert_messages_into_prompt(messages, settings=settings) + model = self.model_name + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None, + } + ], + } + for token in self.generate_text(prompt=prompt, settings=settings): + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"content": token}, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {}, + "finish_reason": self.get_finish_reason(settings), + } + ], + } @abstractmethod def encode(self, text: str, **kwargs: Any) -> List[int]: @@ -90,15 +243,35 @@ def encode(self, text: str, **kwargs: Any) -> List[int]: def decode(self, ids: List[int], **kwargs: Any) -> str: """Decode a list of token IDs into a text string.""" - @property @abstractmethod - def llm_model(self) -> "BaseLLMModel": - """The LLM model used by this generator.""" + def generate_text( + self, prompt: str, settings: TextGenerationSettings + ) -> Iterator[str]: + ... 
- @property - def model_name(self) -> str: - """Identifier for the model used by this generator.""" - return Path(self.llm_model.model_path_resolved).stem + def accept_settings( + self, + prompt: str, + prompt_tokens: int, + settings: TextGenerationSettings, + ) -> None: + """Update the completion status.""" + # Check if the prompt is too long + context_window = self.llm_model.max_total_tokens + self.raise_for_token_limit( + prompt_tokens=prompt_tokens, context_window=context_window + ) + settings.max_tokens = min( + settings.max_tokens, context_window - prompt_tokens + ) + completion_id = settings.completion_id + + # Update completion status + self.completion_status[completion_id].input_text = prompt + self.completion_status[completion_id].input_tokens = prompt_tokens + + # Cache the stops for later use of stop_checker + self.build_stops_from_settings(settings) class BaseEmbeddingGenerator(ABC): diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index d3ad093..fc5cb29 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,5 +1,6 @@ """Wrapper for exllama to generate text completions.""" # flake8: noqa +from array import array from os import environ from ..utils.logger import ApiLogger @@ -15,32 +16,15 @@ hijack_attention_forward() from gc import collect from pathlib import Path -from time import time -from typing import ( - Dict, - Iterable, - Iterator, - List, - Literal, - Optional, - Tuple, - Union, - overload, -) +from re import compile +from typing import Iterable, Iterator, List, Optional, Tuple, Union, overload from torch import IntTensor, Tensor, cuda, version from torch.cuda import empty_cache from torch.nn.functional import log_softmax from ..logits.base import BaseLogitProcessor -from ..schemas.api import ( - APIChatMessage, - ChatCompletion, - ChatCompletionChunk, - Completion, - CompletionChunk, - TextGenerationSettings, -) +from ..schemas.api import TextGenerationSettings from ..schemas.models import ExllamaModel from ..shared.config import Config from ..utils.dependency import import_repository @@ -56,7 +40,228 @@ assert cuda.is_available(), "CUDA must be available to use ExLlama." -_stop_checker = BaseCompletionGenerator.is_possible_to_generate_stops + +class ExllamaCompletionGenerator(BaseCompletionGenerator): + _config: Optional[ExLlamaConfig] = None + _model: Optional[ExLlama] = None + _cache: Optional[ExLlamaCache] = None + _tokenizer: Optional[ExLlamaTokenizer] = None + _generator: Optional[ExLlamaGenerator] = None + _llm_model: Optional["ExllamaModel"] = None + _lora: Optional["ExLlamaLora"] = None + + @property + def llm_model(self) -> "ExllamaModel": + assert self._llm_model is not None + return self._llm_model + + @property + def generator(self) -> ExLlamaGenerator: + assert self._generator is not None, "Generator is not initialized." + return self._generator + + @property + def tokenizer(self) -> ExLlamaTokenizer: + assert self._tokenizer is not None, "Tokenizer is not initialized." + return self._tokenizer + + @property + def cache(self) -> ExLlamaCache: + assert self._cache is not None, "Cache is not initialized." + return self._cache + + @property + def model(self) -> ExLlama: + assert self._model is not None, "Model is not initialized." + return self._model + + @property + def config(self) -> ExLlamaConfig: + assert self._config is not None, "Config is not initialized." 
+ return self._config + + @property + def lora(self) -> Optional[ExLlamaLora]: + return self._lora + + @classmethod + def from_pretrained( + cls, llm_model: "ExllamaModel" + ) -> "ExllamaCompletionGenerator": + model_folder_path = Path(llm_model.model_path_resolved) + lora_path = model_folder_path / "adapter_model.bin" + lora_config_path = model_folder_path / "adapter_config.json" + + result = cls() + result._llm_model = llm_model + result._config = _make_config(model_folder_path, llm_model) + result._tokenizer = ExLlamaTokenizer( + (model_folder_path / "tokenizer.model").as_posix() + ) + result._model = ExLlama(result._config) + if lora_path.exists() and lora_config_path.exists(): + logger.info(f"🦙 LORA model found for {result.model_name}") + with logger.log_any_error( + f"🦙 LORA model loading failed for {result.model_name}" + ): + result._lora = ExLlamaLora( + model=result._model, + lora_config_path=lora_config_path.as_posix(), + lora_path=lora_path.as_posix(), + ) + logger.info(f"🦙 LORA model loaded for {result.model_name}") + result._cache = ExLlamaCache(result._model) + result._generator = ExLlamaGenerator( + result._model, result._tokenizer, result._cache + ) + return result + + def encode(self, text: str) -> List[int]: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return _encode(self._tokenizer, text).flatten().tolist() + + def decode(self, ids: List[int], **kwargs) -> str: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return str(self._tokenizer.decode(IntTensor(ids))) + + def __del__(self) -> None: + if self._tokenizer is not None: + getattr(self._tokenizer, "__del__", lambda: None)() + del self._tokenizer + self._tokenizer = None + logger.info("🗑️ ExllamaCompletionGenerator tokenizer deleted") + if self._cache is not None: + getattr(self._cache, "__del__", lambda: None)() + del self._cache + self._cache = None + logger.info("🗑️ ExllamaCompletionGenerator cache deleted") + if self._generator is not None: + getattr(self._generator, "__del__", lambda: None)() + del self._generator + self._generator = None + logger.info("🗑️ ExllamaCompletionGenerator generator deleted") + if self._model is not None: + self._model.free_unmanaged() + del self._model + self._model = None + logger.info("🗑️ ExllamaCompletionGenerator model deleted") + collect() + empty_cache() + + def generate_text( + self, prompt: str, settings: TextGenerationSettings + ) -> Iterator[str]: + with logger.log_any_error(): + # Encode the prompt + if settings.guidance_scale == 1: + ids = _encode(self.tokenizer, prompt) + mask = None # type: Optional[Tensor] + else: + ids, mask = _encode( + self.tokenizer, + [prompt, settings.negative_prompt or ""], + return_mask=True, + ) + + # Accept and apply the settings + self.accept_settings( + prompt=prompt, + prompt_tokens=ids.shape[-1], + settings=settings, + ) + generator = _apply_settings_to_generator(self, settings=settings) + + # Apply LoRA + if self.lora: + generator.lora = self.lora # type: ignore + + # Inject the prompt + if mask is not None: + generator.gen_begin(ids, mask=mask) + else: + generator.end_beam_search() + generator.gen_begin_reuse(ids) + + # Generate text + yield from self._generate_text(settings, mask) + + def _generate_text( + self, + settings: TextGenerationSettings, + cfg_mask: Optional[Tensor] = None, + ) -> Iterator[str]: + # Set up the variables + IdToPiece = self.tokenizer.tokenizer.IdToPiece + generator = self.generator + initial_len = generator.sequence[0].shape[0] # type: int + eos_token_id = 
generator.tokenizer.eos_token_id # type: int + completion_status = self.completion_status[settings.completion_id] + text_buffer = "" # type: str + byte_array = array("B") # type: array[int] + byte_pattern = compile(r"<0x([0-9a-fA-F]{2})>") + + for _ in range(settings.max_tokens): + # If the generator was interrupted, stop the generation + if self.is_interrupted: + break + + # Predict next token id + token_id = ( + _gen_single_token_with_cfg( + generator=generator, + mask=cfg_mask, + cfg_alpha=settings.guidance_scale, + ) + if cfg_mask is not None + else _gen_single_token_without_cfg( + generator=generator, + input_ids=generator.sequence[0][initial_len:], + logit_processors=( + [ + processor + for processor in self.get_logit_processors( + settings=settings, + encoder=self.encode, + ) + ] + if cfg_mask is None + else None + ) + or None, + ) + ) # type: int + + # Check if the token is a stop token + if self.is_interrupted or token_id == eos_token_id: + break + + # Update the completion status + completion_status.generated_tokens += 1 + + # Try to decode the token + piece = IdToPiece(token_id) # type: str + if piece[0] == "<" and piece[-1] == ">": + byte_match = byte_pattern.match(piece) + if byte_match is None: + continue + try: + byte_array.append(int(byte_match.group(1), 16)) + piece = byte_array.tobytes().decode() + del byte_array[:] + except UnicodeDecodeError: + continue + text_to_yield = text_buffer + piece.replace("▁", " ") + + # Check if the decoded text contains any of the stop tokens. + stop_status = self.stop_checker(text_to_yield) + if stop_status is None: # Good to go + text_buffer = "" # Clear the buffer + completion_status.generated_text += text_to_yield + yield text_to_yield + elif stop_status is True: # Contains any of the stop tokens + break # Stop generating + else: # Contains any piece of the stop tokens + text_buffer = text_to_yield # Save the buffer def _make_config( @@ -129,7 +334,7 @@ def _make_config( def _apply_settings_to_generator( cg: "ExllamaCompletionGenerator", - settings: "TextGenerationSettings", + settings: TextGenerationSettings, ) -> ExLlamaGenerator: """Apply the settings to the generator.""" # Make sure that the batch size is correct @@ -181,7 +386,7 @@ def _gen_single_token_with_cfg( def _gen_single_token_without_cfg( generator: ExLlamaGenerator, - initial_len: int, + input_ids: Tensor, constraints: Optional[Tensor] = None, mask: Optional[Tensor] = None, logit_processors: Optional[Iterable[BaseLogitProcessor]] = None, @@ -200,7 +405,6 @@ def _gen_single_token_without_cfg( logits[:, :, generator.tokenizer.bos_token_id] = -10000.0 if logit_processors is not None: - input_ids = generator.sequence[0][initial_len:] for logit_processor in logit_processors: logits = logit_processor.with_torch(input_ids, logits) @@ -230,421 +434,16 @@ def _gen_single_token_without_cfg( return int(token.item()) -def _generator( - cg: "ExllamaCompletionGenerator", - settings: "TextGenerationSettings", - stops: List[str], - cfg_mask: Optional[Tensor] = None, -) -> Iterator[str]: - IdToPiece = cg.tokenizer.tokenizer.IdToPiece - decoder = cg.tokenizer.decode - generator = cg.generator - - cfg_alpha = settings.guidance_scale # type: float - initial_len = generator.sequence[0].shape[0] # type: int - eos_token_id = generator.tokenizer.eos_token_id # type: int - has_leading_space = False # type: bool - text_cursor = 0 # type: int - n_tokens = 0 # type: int - logit_processors = ( - [ - processor - for processor in cg.get_logit_processors( - settings=settings, - encoder=cg.encode, - ) - ] 
- if cfg_mask is None - else None - ) # type: Optional[Iterable[BaseLogitProcessor]] - for n_tokens in range(1, settings.max_tokens + 1): - if cg.is_interrupted: - break # the generator was interrupted - - # Predict the next token id - if cfg_mask is not None: - token_id = _gen_single_token_with_cfg( - generator, mask=cfg_mask, cfg_alpha=cfg_alpha - ) - else: - token_id = _gen_single_token_without_cfg( - generator, - initial_len=initial_len, - logit_processors=logit_processors or None, - ) - if cg.is_interrupted or token_id == eos_token_id: - break - - # Yield the text piece - if n_tokens == 1: - has_leading_space = IdToPiece(token_id).startswith("▁") - decoded_text = ( - " " + str(decoder(generator.sequence[0][initial_len:])) - if has_leading_space - else str(decoder(generator.sequence[0][initial_len:])) - ) - text_piece = decoded_text[text_cursor:] - if "�" in text_piece: # Decode error when decoding multi-byte char - continue - if _stop_checker(text_piece, stops=stops): # Stop token found maybe - if any(stop in decoded_text for stop in stops): - break # Stop token found - continue - yield text_piece - text_cursor += len(text_piece) - # End of generation - cg._completion_status[settings.completion_id] = n_tokens - - -def _generate_text_with_streaming( - cg: "ExllamaCompletionGenerator", - prompt: str, - settings: "TextGenerationSettings", -) -> Iterator[str]: - with logger.log_any_error(): - # Make sure that the stop token is a list - if isinstance(settings.stop, str): - stops = [settings.stop] # type: List[str] - elif isinstance(settings.stop, list): - stops = settings.stop - else: - stops = [] - - # Apply the settings to the generator - generator = _apply_settings_to_generator(cg, settings=settings) - - # Apply the LORA model - if cg.lora: - generator.lora = cg.lora # type: ignore - - # Start the generator - context_window = cg.llm_model.max_total_tokens - if settings.guidance_scale == 1: - ids = _encode(cg.tokenizer, prompt) - prompt_tokens = ids.shape[-1] - cg.raise_for_token_limit( - prompt_tokens=prompt_tokens, context_window=context_window - ) - mask = None # type: Optional[Tensor] - generator.end_beam_search() - generator.gen_begin_reuse(ids) - else: - ids, mask = _encode( - cg.tokenizer, - [prompt, settings.negative_prompt or ""], - return_mask=True, - ) - prompt_tokens = ids.shape[-1] - cg.raise_for_token_limit( - prompt_tokens=prompt_tokens, context_window=context_window - ) - generator.gen_begin(ids, mask=mask) - - settings.max_tokens = min( - settings.max_tokens, context_window - prompt_tokens - ) - - yield from _generator( - cg, settings=settings, cfg_mask=mask, stops=stops - ) - - -class ExllamaCompletionGenerator(BaseCompletionGenerator): - _config: Optional[ExLlamaConfig] = None - _model: Optional[ExLlama] = None - _cache: Optional[ExLlamaCache] = None - _tokenizer: Optional[ExLlamaTokenizer] = None - _generator: Optional[ExLlamaGenerator] = None - _llm_model: Optional["ExllamaModel"] = None - _lora: Optional["ExLlamaLora"] = None - _completion_status: Dict[ - str, int - ] = {} # key: completion_id, value: number of completion tokens - - @property - def llm_model(self) -> "ExllamaModel": - assert self._llm_model is not None - return self._llm_model - - @property - def generator(self) -> ExLlamaGenerator: - assert self._generator is not None, "Generator is not initialized." - return self._generator - - @property - def tokenizer(self) -> ExLlamaTokenizer: - assert self._tokenizer is not None, "Tokenizer is not initialized." 
- return self._tokenizer - - @property - def cache(self) -> ExLlamaCache: - assert self._cache is not None, "Cache is not initialized." - return self._cache - - @property - def model(self) -> ExLlama: - assert self._model is not None, "Model is not initialized." - return self._model - - @property - def config(self) -> ExLlamaConfig: - assert self._config is not None, "Config is not initialized." - return self._config - - @property - def lora(self) -> Optional[ExLlamaLora]: - return self._lora - - @classmethod - def from_pretrained( - cls, llm_model: "ExllamaModel" - ) -> "ExllamaCompletionGenerator": - model_folder_path = Path(llm_model.model_path_resolved) - lora_path = model_folder_path / "adapter_model.bin" - lora_config_path = model_folder_path / "adapter_config.json" - - result = cls() - result._llm_model = llm_model - result._config = _make_config(model_folder_path, llm_model) - result._tokenizer = ExLlamaTokenizer( - (model_folder_path / "tokenizer.model").as_posix() - ) - result._model = ExLlama(result._config) - if lora_path.exists() and lora_config_path.exists(): - logger.info(f"🦙 LORA model found for {result.model_name}") - with logger.log_any_error( - f"🦙 LORA model loading failed for {result.model_name}" - ): - result._lora = ExLlamaLora( - model=result._model, - lora_config_path=lora_config_path.as_posix(), - lora_path=lora_path.as_posix(), - ) - logger.info(f"🦙 LORA model loaded for {result.model_name}") - result._cache = ExLlamaCache(result._model) - result._generator = ExLlamaGenerator( - result._model, result._tokenizer, result._cache - ) - return result - - def generate_completion_with_streaming( - self, prompt: str, settings: "TextGenerationSettings" - ) -> Iterator["CompletionChunk"]: - completion_id = settings.completion_id - model = self.model_name - generated_text = "" # type: str - for token in _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ): - generated_text += token - yield { - "id": completion_id, - "object": "text_completion", - "created": int(time()), - "model": model, - "choices": [ - { - "text": token, - "index": 0, - "logprobs": None, - "finish_reason": None, - } - ], - } - yield { - "id": completion_id, - "object": "text_completion", - "created": int(time()), - "model": model, - "choices": [ - { - "text": "", - "index": 0, - "logprobs": None, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - } - - def generate_completion( - self, prompt: str, settings: "TextGenerationSettings" - ) -> "Completion": - completion_id = settings.completion_id - generated_text = "".join( - _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ) - ) - prompt_tokens = _encode(self.tokenizer, prompt).shape[1] - completion_tokens = self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] - ) - return { - "id": completion_id, - "object": "text_completion", - "created": int(time()), - "model": self.model_name, - "choices": [ - { - "text": generated_text, - "index": 0, - "logprobs": None, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - }, - } - - def generate_chat_completion_with_streaming( - self, - messages: List["APIChatMessage"], - settings: "TextGenerationSettings", - ) -> Iterator["ChatCompletionChunk"]: - completion_id = settings.completion_id - 
prompt = self.convert_messages_into_prompt(messages, settings=settings) - model = self.model_name - generated_text = "" # type: str - yield { - "id": completion_id, - "object": "chat.completion.chunk", - "created": int(time()), - "model": model, - "choices": [ - { - "index": 0, - "delta": {"role": "assistant"}, - "finish_reason": None, - } - ], - } - for token in _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ): - generated_text += token - yield { - "id": completion_id, - "object": "chat.completion.chunk", - "created": int(time()), - "model": model, - "choices": [ - { - "index": 0, - "delta": {"content": token}, - "finish_reason": None, - } - ], - } - yield { - "id": completion_id, - "object": "chat.completion.chunk", - "created": int(time()), - "model": model, - "choices": [ - { - "index": 0, - "delta": {}, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - } - - def generate_chat_completion( - self, - messages: List["APIChatMessage"], - settings: "TextGenerationSettings", - ) -> "ChatCompletion": - completion_id = settings.completion_id - prompt = self.convert_messages_into_prompt(messages, settings=settings) - generated_text = "".join( - _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ) - ) - prompt_tokens = _encode(self.tokenizer, prompt).shape[1] - completion_tokens = self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] - ) - return { - "id": completion_id, - "object": "chat.completion", - "created": int(time()), - "model": self.model_name, - "choices": [ - { - "message": { - "role": "assistant", - "content": generated_text, - }, - "index": 0, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - }, - } - - def encode(self, text: str) -> List[int]: - assert self._tokenizer is not None, "Tokenizer is not initialized" - return _encode(self._tokenizer, text).flatten().tolist() - - def decode(self, ids: List[int], **kwargs) -> str: - assert self._tokenizer is not None, "Tokenizer is not initialized" - return str(self._tokenizer.decode(IntTensor(ids))) - - def __del__(self) -> None: - if self._tokenizer is not None: - getattr(self._tokenizer, "__del__", lambda: None)() - del self._tokenizer - self._tokenizer = None - logger.info("🗑️ ExllamaCompletionGenerator tokenizer deleted") - if self._cache is not None: - getattr(self._cache, "__del__", lambda: None)() - del self._cache - self._cache = None - logger.info("🗑️ ExllamaCompletionGenerator cache deleted") - if self._generator is not None: - getattr(self._generator, "__del__", lambda: None)() - del self._generator - self._generator = None - logger.info("🗑️ ExllamaCompletionGenerator generator deleted") - if self._model is not None: - self._model.free_unmanaged() - del self._model - self._model = None - logger.info("🗑️ ExllamaCompletionGenerator model deleted") - collect() - empty_cache() - - @overload def _encode( - tokenizer: ExLlamaTokenizer, - text: str, - return_mask: bool = False, + tokenizer: ExLlamaTokenizer, text: str, return_mask: bool = False ) -> Tensor: ... @overload def _encode( - tokenizer: ExLlamaTokenizer, - text: List[str], - return_mask: bool = True, + tokenizer: ExLlamaTokenizer, text: List[str], return_mask: bool = True ) -> Tuple[Tensor, Tensor]: ... 
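For reference, a minimal consumer sketch of the refactored streaming interface. It is hypothetical and not part of this patch series; it only assumes the CompletionChunk dict layout shown in the removed lines above (an OpenAI-style dict whose choices[0] carries "text" and "finish_reason"), which the shared base class now assembles from the raw text pieces yielded by generate_text(). The helper name collect_stream and the import path are illustrative assumptions.

from typing import Iterator, Optional, Tuple

from llama_api.schemas.api import CompletionChunk


def collect_stream(
    chunks: Iterator[CompletionChunk],
) -> Tuple[str, Optional[str]]:
    """Drain a stream of CompletionChunk dicts.

    Returns the concatenated text and the finish_reason reported by the
    last chunk ("length" or "stop" in the code above); stays None only if
    no chunk was received.
    """
    text = ""
    finish_reason = None  # type: Optional[str]
    for chunk in chunks:
        choice = chunk["choices"][0]
        text += choice["text"]
        finish_reason = choice["finish_reason"]
    return text, finish_reason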
@@ -664,20 +463,3 @@ def _encode( ids = result[0] if isinstance(result, tuple) else result assert isinstance(ids, Tensor) return ids - - -def _get_finish_reason( - cg: ExllamaCompletionGenerator, - settings: "TextGenerationSettings", - completion_id: str, - generated_text: str, -) -> Literal["length", "stop"]: - return ( - "length" - if cg._completion_status.get( - completion_id, - _encode(cg.tokenizer, generated_text).shape[1], - ) - >= settings.max_tokens - else "stop" - ) diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 3207bec..c58e743 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,21 +1,18 @@ """Wrapper for llama_cpp to generate text completions.""" +# flake8: noqa +import sys +from array import array from inspect import signature -from typing import Callable, Iterator, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Iterator, List, Optional, Union from ..schemas.api import ( - APIChatMessage, - ChatCompletion, ChatCompletionChunk, - Completion, CompletionChunk, + CompletionLogprobs, TextGenerationSettings, ) from ..schemas.models import LlamaCppModel from ..shared.config import Config -from ..utils.completions import ( - convert_text_completion_chunks_to_chat, - convert_text_completion_to_chat, -) from ..utils.dependency import import_repository from ..utils.llama_cpp import build_shared_lib from ..utils.logger import ApiLogger @@ -28,6 +25,20 @@ from repositories.llama_cpp import llama_cpp +if TYPE_CHECKING: + from llama_api.mixins.completion import CompletionStatus + + +class StoppingCriteriaList(List[Callable[[List[int], List[float]], bool]]): + def __call__(self, input_ids: List[int], logits: List[float]) -> bool: + return any( + [ + stopping_criteria(input_ids, logits) + for stopping_criteria in self + ] + ) + + class LogitsProcessorList( List[Callable[[List[int], List[float]], List[float]]] ): @@ -39,66 +50,6 @@ def __call__( return scores -def _create_completion( - client: llama_cpp.Llama, - prompt: str, - stream: bool, - settings: TextGenerationSettings, -) -> Union[Completion, Iterator[CompletionChunk]]: - logit_processors = LogitsProcessorList( - [ - processor.without_torch - for processor in BaseCompletionGenerator.get_logit_processors( - settings=settings, - encoder=lambda s: client.tokenize( - s.encode("utf-8"), add_bos=False - ), - ) - ] - ) - return client.create_completion( - stream=stream, - prompt=prompt, - max_tokens=settings.max_tokens, - temperature=settings.temperature, - top_p=settings.top_p, - logprobs=settings.logprobs, - echo=settings.echo, - frequency_penalty=settings.frequency_penalty, - presence_penalty=settings.presence_penalty, - repeat_penalty=settings.repeat_penalty, - top_k=settings.top_k, - tfs_z=settings.tfs_z, - mirostat_mode=settings.mirostat_mode, - mirostat_tau=settings.mirostat_tau, - mirostat_eta=settings.mirostat_eta, - logits_processor=logit_processors if logit_processors else None, # type: ignore # noqa: E501 - stop=settings.stop, - ) - - -def _create_chat_completion( - client: llama_cpp.Llama, - messages: List[APIChatMessage], - stream: bool, - settings: TextGenerationSettings, -) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - prompt: str = LlamaCppCompletionGenerator.convert_messages_into_prompt( - messages, settings=settings - ) - completion_or_chunks = _create_completion( - client=client, prompt=prompt, stream=stream, settings=settings - ) - if isinstance(completion_or_chunks, Iterator): - return 
convert_text_completion_chunks_to_chat( - completion_or_chunks, - ) - else: - return convert_text_completion_to_chat( - completion_or_chunks, - ) - - class LlamaCppCompletionGenerator(BaseCompletionGenerator): generator: Optional[ Iterator[Union[CompletionChunk, ChatCompletionChunk]] @@ -164,66 +115,6 @@ def from_pretrained( self._llm_model = llm_model return self - def generate_completion( - self, - prompt: str, - settings: TextGenerationSettings = TextGenerationSettings(), - ) -> Completion: - assert self.client is not None - completion = _create_completion( - client=self.client, prompt=prompt, stream=False, settings=settings - ) - assert not isinstance(completion, Iterator) - return completion - - def generate_completion_with_streaming( - self, - prompt: str, - settings: TextGenerationSettings = TextGenerationSettings(), - ) -> Iterator[CompletionChunk]: - assert self.client is not None - completion_chunk_generator = _create_completion( - client=self.client, prompt=prompt, stream=True, settings=settings - ) - assert isinstance(completion_chunk_generator, Iterator) - self.generator = completion_chunk_generator - for chunk in completion_chunk_generator: - if self.is_interrupted: - yield chunk - return # the generator was interrupted - yield chunk - - def generate_chat_completion( - self, messages: List[APIChatMessage], settings: TextGenerationSettings - ) -> ChatCompletion: - assert self.client is not None - chat_completion = _create_chat_completion( - client=self.client, - messages=messages, - stream=False, - settings=settings, - ) - assert not isinstance(chat_completion, Iterator) - return chat_completion - - def generate_chat_completion_with_streaming( - self, messages: List[APIChatMessage], settings: TextGenerationSettings - ) -> Iterator[ChatCompletionChunk]: - assert self.client is not None - chat_completion_chunk_generator = _create_chat_completion( - client=self.client, - messages=messages, - stream=True, - settings=settings, - ) - assert isinstance(chat_completion_chunk_generator, Iterator) - self.generator = chat_completion_chunk_generator - for chunk in chat_completion_chunk_generator: - if self.is_interrupted: - yield chunk - return # the generator was interrupted - yield chunk - def encode(self, text: str, add_bos: bool = True, **kwargs) -> List[int]: assert self.client is not None, "Client is not initialized" return self.client.tokenize( @@ -233,3 +124,202 @@ def encode(self, text: str, add_bos: bool = True, **kwargs) -> List[int]: def decode(self, ids: List[int], **kwargs) -> str: assert self.client is not None, "Client is not initialized" return self.client.detokenize(ids).decode("utf-8", errors="ignore") + + def generate_text( + self, prompt: str, settings: TextGenerationSettings + ) -> Iterator[str]: + client = self.client + assert client is not None, "Llama is not initialized" + self.llm_model.max_total_tokens = client.n_ctx() + assert client.ctx is not None, "Llama context is not initialized" + n_ctx = client.n_ctx() + tokens = (llama_cpp.llama_token * n_ctx)() + n_tokens = llama_cpp.llama_tokenize( + client.ctx, + b" " + prompt.encode("utf-8"), + tokens, + llama_cpp.c_int(n_ctx), + llama_cpp.c_bool(True), + ) + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens = llama_cpp.llama_tokenize( + client.ctx, + b" " + prompt.encode("utf-8"), + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(True), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{prompt}" n_tokens={n_tokens}' + ) + 
input_ids = array("i", tokens[:n_tokens]) # type: array[int] + self.accept_settings( + prompt=prompt, prompt_tokens=len(input_ids), settings=settings + ) + yield from self._generate_text(client, input_ids, settings) + + def _generate_text( + self, + client: llama_cpp.Llama, + input_ids: "array[int]", + settings: TextGenerationSettings, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[llama_cpp.LlamaGrammar] = None, + ) -> Iterator[str]: + ctx = client.ctx + assert ctx is not None, "Llama context is not initialized" + verbose = self.llm_model.verbose + if verbose: + llama_cpp.llama_reset_timings(ctx) + + # Cache the variables frequently used in the loop + completion_status = self.completion_status[settings.completion_id] + generated_ids = array("i") # type: array[int] + byte_array = bytearray() # type: bytearray + eos_token = llama_cpp.llama_token_eos() + logprobs = settings.logprobs + text_buffer = "" # type: str + llama_token_to_str = llama_cpp.llama_token_to_str + llama_token = llama_cpp.llama_token + + if logprobs is not None and client.params.logits_all is False: + raise ValueError( + "logprobs is not supported for models " + "created with logits_all=False" + ) + + if client.cache: + _load_cache(client, client.cache, input_ids) + + for _, token_id in zip( + range(settings.max_tokens), + client.generate( + input_ids, + **{ + key: value + for key, value in { + **self.llm_model.asdict, + **{ + "temp": settings.temperature, + "stopping_criteria": stopping_criteria, + "logits_processor": logits_processor, + "grammar": grammar, + }, + }.items() + # Hacky way to pass arguments safely to older versions of llama-cpp-python + if key in signature(client.generate).parameters.keys() + }, + ), + ): + if self.is_interrupted or token_id == eos_token: + break + + # Update the generated id + generated_ids.append(token_id) + completion_status.generated_tokens += 1 + + piece = llama_token_to_str( + ctx, llama_token(token_id) + ) # type: bytes + try: + # Try to decode the token + text_to_yield = text_buffer + (byte_array + piece).decode() + byte_array.clear() + except UnicodeDecodeError: + # Multi-byte characters are not decoded correctly if partial + byte_array.extend(piece) + continue + + # Check if the decoded text contains any of the stop tokens. 
+ stop_status = self.stop_checker(text_to_yield) + if stop_status is None: # Good to go + text_buffer = "" # Clear the buffer + completion_status.generated_text += text_to_yield + yield text_to_yield + elif stop_status is True: # Contains any of the stop tokens + break # Stop generating + else: # Contains any piece of the stop tokens + text_buffer = text_to_yield # Save the buffer + + # End of the loop + if verbose: + llama_cpp.llama_print_timings(ctx) + if client.cache: + if verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + client.cache[input_ids + generated_ids] = client.save_state() + print("Llama._create_completion: cache saved", file=sys.stderr) + return + + +def _load_cache( + client: llama_cpp.Llama, cache: llama_cpp.BaseLlamaCache, ids: "array[int]" +) -> None: + try: + cache_item = cache[ids] + cache_prefix_len = client.longest_token_prefix( + cache_item.input_ids.tolist(), ids + ) + eval_prefix_len = client.longest_token_prefix( + client._input_ids.tolist(), ids + ) + if cache_prefix_len > eval_prefix_len: + client.load_state(cache_item) + if client.verbose: + print( + "Llama._create_completion: cache hit", + file=sys.stderr, + ) + except KeyError: + if client.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) + + +def _get_log_probs( + client: llama_cpp.Llama, + completion_status: "CompletionStatus", + prompt_tokens: int, + generated_ids: "array[int]", + generated_tokens: int, + logprobs: int, + token: int, +) -> CompletionLogprobs: + assert client.ctx is not None, "Llama context is not initialized" + token_str = client.detokenize([token]).decode("utf-8", errors="ignore") + text_offset = len(completion_status.input_text) + len( + completion_status.generated_text + ) + token_offset = prompt_tokens + generated_tokens + current_logprobs = client.logits_to_logprobs( + client.scores[: client.n_tokens, :][token_offset - 1, :].tolist() + ) + return { + "tokens": [ + client.detokenize([token]).decode("utf-8", errors="ignore") + ], + "text_offset": [text_offset], + "token_logprobs": [current_logprobs[int(token)]], + "top_logprobs": [ + { + **{ + client.detokenize([i]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in list( + sorted( + zip( + current_logprobs, + range(len(current_logprobs)), + ), + reverse=True, + ) + )[:logprobs] + }, + token_str: current_logprobs[int(token)], + } + ], + } From 6cea61ee258a947d0c2238d04aac5c3e7942f366 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sat, 19 Aug 2023 18:59:10 +0900 Subject: [PATCH 06/18] Removed TaskStatus and task_manager --- llama_api/mixins/completion.py | 4 + llama_api/modules/base.py | 8 +- llama_api/modules/llama_cpp.py | 7 +- llama_api/schemas/api.py | 4 + llama_api/server/pools/llama.py | 75 +++++----- llama_api/server/routers/v1.py | 238 +++++++++++++------------------- 6 files changed, 150 insertions(+), 186 deletions(-) diff --git a/llama_api/mixins/completion.py b/llama_api/mixins/completion.py index 312a4b0..26f03f9 100644 --- a/llama_api/mixins/completion.py +++ b/llama_api/mixins/completion.py @@ -1,5 +1,6 @@ from collections import defaultdict from dataclasses import dataclass, field +from time import time from typing import Dict, Literal, Optional from ..schemas.api import CompletionLogprobs, TextGenerationSettings @@ -7,6 +8,9 @@ @dataclass class CompletionStatus: + # These fields are automatically set + started_at: float = field(default_factory=time, init=False) + # These fields are set by `accept_settings` method. 
input_text: str = field(default="", init=False) input_tokens: int = field(default=0, init=False) diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 27b41d1..df101f9 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -107,9 +107,7 @@ def generate_completion_with_streaming( ) -> Iterator[CompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" - completion_id = settings.completion_id = ( - "chat" + settings.completion_id - ) + completion_id = settings.completion_id completion_status = self.completion_status[completion_id] model = self.model_name for token in self.generate_text(prompt=prompt, settings=settings): @@ -150,9 +148,7 @@ def generate_chat_completion( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> ChatCompletion: """Generate a completion for a given prompt.""" - completion_id = settings.completion_id = ( - "chat" + settings.completion_id - ) + completion_id = settings.completion_id completion_status = self.completion_status[completion_id] deque( self.generate_text( diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index c58e743..0815e4d 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -3,8 +3,9 @@ import sys from array import array from inspect import signature -from typing import TYPE_CHECKING, Callable, Iterator, List, Optional, Union +from typing import Callable, Iterator, List, Optional, Union +from ..mixins.completion import CompletionStatus from ..schemas.api import ( ChatCompletionChunk, CompletionChunk, @@ -25,10 +26,6 @@ from repositories.llama_cpp import llama_cpp -if TYPE_CHECKING: - from llama_api.mixins.completion import CompletionStatus - - class StoppingCriteriaList(List[Callable[[List[int], List[float]], bool]]): def __call__(self, input_ids: List[int], logits: List[float]) -> bool: return any( diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index e052324..2f4e870 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -253,6 +253,10 @@ class Config: class CreateChatCompletionRequest(TextGenerationSettings): + completion_id: str = Field( + default_factory=lambda: f"chatcmpl-{str(uuid4())}", + description="The unique ID of the chat generation", + ) model: str = Field( default=..., description="The model to use for completion." 
) diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index c2db637..a7d741f 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -1,13 +1,16 @@ from collections import deque from contextlib import contextmanager +from dataclasses import dataclass, field from multiprocessing.dummy import current_process from os import getpid from queue import Queue from threading import Event -from typing import Deque, Dict, Iterator, List, Union +from time import time +from typing import Deque, Dict, Iterator, List, Optional, Union import model_definitions +from ...mixins.completion import CompletionStatus from ...modules.base import ( BaseCompletionGenerator, BaseEmbeddingGenerator, @@ -31,7 +34,6 @@ from ...utils.logger import ApiLogger from ...utils.system import free_memory_of_first_item_from_container - logger = ApiLogger(__name__) logger.info(f"🔧 {current_process()} is initiated with PID: {getpid()}") @@ -40,6 +42,12 @@ embedding_generators: Deque["BaseEmbeddingGenerator"] = deque(maxlen=1) +@dataclass +class EmbeddingStatus: + started_at: float = field(default_factory=time, init=False) + embedding: Optional[Embedding] = None + + def init() -> None: pass @@ -78,8 +86,7 @@ def get_model(model_name: str) -> "BaseLLMModel": llm_model, BaseLLMModel ), f"Not a LLM model: {model_name}" return llm_model - except Exception as e: - logger.error(e) + except Exception: raise ValueError(f"Model path does not exist: {model_name}") @@ -89,7 +96,7 @@ def get_completion_generator( CreateChatCompletionRequest, CreateEmbeddingRequest, ], -) -> "BaseCompletionGenerator": +) -> BaseCompletionGenerator: """Get a completion generator for the given model. If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" @@ -153,7 +160,7 @@ def get_completion_generator( def get_embedding_generator( body: CreateEmbeddingRequest, -) -> "BaseEmbeddingGenerator": +) -> BaseEmbeddingGenerator: """Get an embedding generator for the given model. If the model is not cached, create a new one. 
If the cache is full, delete the oldest completion generator.""" @@ -203,7 +210,7 @@ def generate_completion_chunks( body: Union[CreateChatCompletionRequest, CreateCompletionRequest], queue: Queue, interrupt_signal: Event, -) -> None: +) -> CompletionStatus: with queue_manager(queue=queue): with completion_generator_manager( body=body, interrupt_signal=interrupt_signal @@ -233,16 +240,17 @@ def iterator() -> ( for chunk in iterator(): if interrupt_signal.is_set(): - # If the event is set, it means the client has disconnected - return + # If the event is set, the client is disconnected + return cg.completion_status[body.completion_id] queue.put(chunk) + return cg.completion_status[body.completion_id] def generate_completion( body: Union[CreateChatCompletionRequest, CreateCompletionRequest], queue: Queue, interrupt_signal: Event, -) -> None: +) -> CompletionStatus: with queue_manager(queue=queue): with completion_generator_manager( body=body, interrupt_signal=interrupt_signal @@ -260,17 +268,18 @@ def generate_completion( settings=body, ) queue.put(completion) + return cg.completion_status[body.completion_id] def generate_embeddings( - body: CreateEmbeddingRequest, queue: Queue, interrupt_signal: Event -) -> None: + body: CreateEmbeddingRequest, queue: Queue +) -> EmbeddingStatus: + embedding_status = EmbeddingStatus() with queue_manager(queue=queue): try: llm_model = get_model(body.model) if not isinstance(llm_model, LlamaCppModel): raise NotImplementedError("Using non-llama-cpp model") - except Exception: # Embedding model from local # "intfloat/e5-large-v2", @@ -290,23 +299,21 @@ def generate_embeddings( context_length=512, batch=1000, ) - queue.put( - Embedding( - object="list", - data=[ - EmbeddingData( - index=embedding_idx, - object="embedding", - embedding=embedding, - ) - for embedding_idx, embedding in enumerate(embeddings) - ], - model=body.model, - usage=EmbeddingUsage( - prompt_tokens=-1, - total_tokens=-1, - ), - ) + embedding = Embedding( + object="list", + data=[ + EmbeddingData( + index=embedding_idx, + object="embedding", + embedding=embedding, + ) + for embedding_idx, embedding in enumerate(embeddings) + ], + model=body.model, + usage=EmbeddingUsage( + prompt_tokens=-1, + total_tokens=-1, + ), ) else: @@ -323,7 +330,9 @@ def generate_embeddings( completion_generator, lazy.LlamaCppCompletionGenerator ), f"Model {body.model} is not supported for llama.cpp embeddings." assert completion_generator.client, "Model not loaded yet." 
- queue.put( - completion_generator.client.create_embedding, - **body.model_dump(exclude={"user"}), + embedding = completion_generator.client.create_embedding( + **body.model_dump(exclude={"user"}) ) + queue.put(embedding) + embedding_status.embedding = embedding + return embedding_status diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index a9a4fba..0afa584 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -3,7 +3,7 @@ from asyncio import Task, create_task -from contextlib import asynccontextmanager, contextmanager +from contextlib import asynccontextmanager from dataclasses import dataclass, field from functools import partial from os import environ @@ -14,18 +14,16 @@ from typing import ( Any, AsyncGenerator, - Callable, Dict, - Generator, Iterator, List, + Literal, Optional, Tuple, Type, TypeVar, Union, ) -from typing_extensions import TypedDict from anyio import ( Semaphore, @@ -39,6 +37,7 @@ from orjson import OPT_INDENT_2, dumps from sse_starlette.sse import EventSourceResponse +from ...mixins.completion import CompletionStatus from ...schemas.api import ( ChatCompletion, ChatCompletionChunk, @@ -58,6 +57,7 @@ from ...utils.errors import RouteErrorHandler from ...utils.logger import ApiLogger, LoggingConfig from ..pools.llama import ( + EmbeddingStatus, generate_completion, generate_completion_chunks, generate_embeddings, @@ -77,16 +77,6 @@ T = TypeVar("T") -class TaskStatus(TypedDict): - """Completion status""" - - completion_tokens: int - started_at: float - interrupted: bool - embedding_chunks: Optional[int] - generated_text: str - - @dataclass class WixMetadata: """Worker index (wix) metadata""" @@ -179,31 +169,33 @@ async def get_event_publisher( CreateCompletionRequest, ], inner_send_chan: MemoryObjectSendStream[bytes], - task: "Task[None]", + task: "Task[CompletionStatus]", interrupt_signal: Event, iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], ) -> None: """Publish Server-Sent-Events (SSE) to the client""" - with task_manager( - body=body, - task=task, - interrupt_signal=interrupt_signal, - ) as task_status: - async with inner_send_chan: + is_interrupted = False # type: bool + async with inner_send_chan: + try: + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(b"data: " + dumps(chunk) + b"\n\n") + if await request.is_disconnected(): + raise get_cancelled_exc_class()() + await inner_send_chan.send(b"data: [DONE]\n\n") + except get_cancelled_exc_class(): + is_interrupted = True + with move_on_after(1, shield=True): + raise + finally: + # Cancel the producer task and set event, + # so the completion task can be stopped + interrupt_signal.set() + state = "Interrupted" if is_interrupted else "Completed" try: - async for chunk in iterate_in_threadpool(iterator): - task_status["completion_tokens"] += 1 - task_status["generated_text"] += get_text_from_chunk(chunk) - await inner_send_chan.send( - b"data: " + dumps(chunk) + b"\n\n" - ) - if await request.is_disconnected(): - raise get_cancelled_exc_class()() - await inner_send_chan.send(b"data: [DONE]\n\n") - except get_cancelled_exc_class(): - with move_on_after(1, shield=True): - task_status["interrupted"] = True - raise + status = await task + log_request_and_response(body, status, state) + finally: + task.cancel() def get_streaming_iterator( @@ -228,14 +220,46 @@ def log_request_and_response( CreateCompletionRequest, CreateEmbeddingRequest, ], - task_status: TaskStatus, + status: Union[CompletionStatus, 
EmbeddingStatus], + state: Literal["Completed", "Interrupted"], ) -> None: + """Log the request and response of the completion or embedding""" + elapsed_time = time() - status.started_at + log_messages: List[str] = [f"elapsed time: {elapsed_time: .1f}s"] body_without_prompt = body.model_dump( exclude={"prompt", "messages", "input"}, exclude_defaults=True, exclude_unset=True, exclude_none=True, ) + + # Log the embedding status + if isinstance(status, EmbeddingStatus) and isinstance( + body, CreateEmbeddingRequest + ): + embed_usage = { + "input_chars": len(body.input), + "embedding_chunks": len(status.embedding["data"]) + if status.embedding + else 0, + } + log_messages.append(f"embedding chunks: {embed_usage}") + embed_log = { + "request": body_without_prompt, + "input": body.input, + "embedding": status.embedding, + } + logger.info( + f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})" + ) + return chat_logger.info(dumps(embed_log, option=OPT_INDENT_2).decode()) + if not isinstance(status, CompletionStatus): + return + + # Log the completion status + tokens = status.generated_tokens + tokens_per_second = tokens / elapsed_time + log_messages.append(f"tokens: {tokens}({tokens_per_second: .1f}tok/s)") if isinstance(body, CreateChatCompletionRequest): chat_log = { "request": body_without_prompt, @@ -246,7 +270,7 @@ def log_request_and_response( + [ { "role": "assistant", - "content": task_status["generated_text"], + "content": status.generated_text, } ], } @@ -255,69 +279,15 @@ def log_request_and_response( "request": body_without_prompt, "prompt": { "user": body.prompt, - "assistant": task_status["generated_text"], + "assistant": status.generated_text, }, } else: - chat_log = { - "request": body_without_prompt, - "input": body.input, - "embedding": task_status["embedding_chunks"], - } + return + logger.info(f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})") chat_logger.info(dumps(chat_log, option=OPT_INDENT_2).decode()) -@contextmanager -def task_manager( - body: Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - CreateEmbeddingRequest, - ], - task: "Task[None]", - interrupt_signal: Event, -) -> Generator[TaskStatus, None, None]: - """Start the producer task and cancel it when the client disconnects. 
- Also, log the completion status.""" - task_status = TaskStatus( - completion_tokens=0, - started_at=time(), - interrupted=False, - embedding_chunks=None, - generated_text="", - ) - try: - logger.info(f"🦙 Handling request of {body.model}...") - yield task_status - finally: - # Cancel the producer task and set event, - # so the completion task can be stopped - task.cancel() - interrupt_signal.set() - - # Log the completion status - if task_status["interrupted"]: - status = "Interrupted" - else: - status = "Completed" - - elapsed_time = time() - task_status["started_at"] - basic_messages: List[str] = [f"elapsed time: {elapsed_time: .1f}s"] - if task_status["completion_tokens"]: - tokens = task_status["completion_tokens"] - tokens_per_second = tokens / elapsed_time - basic_messages.append( - f"tokens: {tokens}({tokens_per_second: .1f}tok/s)" - ) - if task_status["embedding_chunks"] is not None: - embedding_chunks = task_status["embedding_chunks"] - basic_messages.append(f"embedding chunks: {embedding_chunks}") - logger.info( - f"🦙 [{status} for {body.model}]: ({' | '.join(basic_messages)})" - ) - log_request_and_response(body=body, task_status=task_status) - - async def create_chat_completion_or_completion( request: Request, body: Union[CreateChatCompletionRequest, CreateCompletionRequest], @@ -328,21 +298,18 @@ async def create_chat_completion_or_completion( If streaming is enabled, then return an EventSourceResponse.""" async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() - producer: Callable[ - [ - Union[CreateChatCompletionRequest, CreateCompletionRequest], - Queue, - Event, - ], - None, - ] = partial( - generate_completion_chunks if body.stream else generate_completion, - body=body, - queue=queue, - interrupt_signal=interrupt_signal, - ) - task: "Task[None]" = create_task( - run_in_processpool_with_wix(producer, wix=wix) + task: "Task[CompletionStatus]" = create_task( + run_in_processpool_with_wix( + partial( + generate_completion_chunks + if body.stream + else generate_completion, + body=body, + queue=queue, + interrupt_signal=interrupt_signal, + ), + wix=wix, + ) ) if body.stream: send_chan, recv_chan = create_memory_object_stream(10) @@ -364,24 +331,16 @@ async def create_chat_completion_or_completion( ), ) else: - with task_manager( - body=body, - task=task, - interrupt_signal=interrupt_signal, - ) as task_status: - completion: Union[ - ChatCompletion, Completion - ] = validate_item_type( + # Cancel the producer task and set event, + # so the completion task can be stopped + try: + return validate_item_type( await run_in_threadpool(queue.get), type=dict, # type: ignore ) - task_status["completion_tokens"] = completion["usage"][ - "completion_tokens" - ] - task_status["generated_text"] = get_text_from_completion( - completion - ) - return completion + finally: + interrupt_signal.set() + log_request_and_response(body, await task, "Completed") @router.post("/chat/completions") @@ -409,29 +368,24 @@ async def create_embedding( assert body.model is not None, "Model is required" async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() - producer: Callable[ - [CreateEmbeddingRequest, Queue, Event], - None, - ] = partial( - generate_embeddings, - body=body, - queue=queue, - interrupt_signal=interrupt_signal, - ) - task: "Task[None]" = create_task( - run_in_processpool_with_wix(producer, wix=wix) + task: Task["EmbeddingStatus"] = create_task( + run_in_processpool_with_wix( + 
partial( + generate_embeddings, + body=body, + queue=queue, + ), + wix=wix, + ) ) - with task_manager( - body=body, - task=task, - interrupt_signal=interrupt_signal, - ) as task_status: - embedding: Embedding = validate_item_type( + try: + return validate_item_type( await run_in_threadpool(queue.get), type=dict, # type: ignore ) - task_status["embedding_chunks"] = len(embedding["data"]) - return embedding + finally: + interrupt_signal.set() + log_request_and_response(body, await task, "Completed") @router.get("/models") From 7c1251c9fc5566afa00a00a5740c52f235b95f6c Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sat, 19 Aug 2023 22:16:57 +0900 Subject: [PATCH 07/18] Bump dependencies --- llama_api/modules/exllama.py | 4 +- llama_api/server/app_settings.py | 4 +- llama_api/server/routers/v1.py | 4 +- llama_api/shared/config.py | 8 +- llama_api/utils/process_pool.py | 8 +- poetry.lock | 356 ++++++++++++++++++------------- requirements.txt | 12 +- 7 files changed, 225 insertions(+), 171 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index fc5cb29..9fafffa 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -154,12 +154,12 @@ def generate_text( with logger.log_any_error(): # Encode the prompt if settings.guidance_scale == 1: - ids = _encode(self.tokenizer, prompt) + ids = _encode(self.tokenizer, prompt or " ") mask = None # type: Optional[Tensor] else: ids, mask = _encode( self.tokenizer, - [prompt, settings.negative_prompt or ""], + [prompt or " ", settings.negative_prompt or ""], return_mask=True, ) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 4bb7891..4939932 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -77,6 +77,8 @@ def initialize_before_launch( """Initialize the app""" for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) + if environ.get("LLAMA_API_XFORMERS") == "1": + install_package("xformers") if install_packages: # Install all dependencies if not skip_compile: @@ -100,8 +102,6 @@ def initialize_before_launch( # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") - if environ.get("LLAMA_API_XFORMERS") == "1": - install_package("xformers") else: logger.warning( "🏃‍♂️ Skipping package installation... 
" diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 0afa584..7e579d0 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -2,7 +2,7 @@ Use same format as OpenAI API""" -from asyncio import Task, create_task +from asyncio import Task, create_task, wait_for from contextlib import asynccontextmanager from dataclasses import dataclass, field from functools import partial @@ -192,7 +192,7 @@ async def get_event_publisher( interrupt_signal.set() state = "Interrupted" if is_interrupted else "Completed" try: - status = await task + status = await wait_for(task, timeout=3) log_request_and_response(body, status, state) finally: task.cancel() diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 7c4c887..6469e76 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,6 +1,12 @@ from pathlib import Path from typing import Dict, List, Literal, Optional, Tuple -from typing_extensions import TypedDict + +try: + from typing_extensions import TypedDict + + +except ImportError: + from typing import TypedDict # When dependencies aren't installed yet class GitCloneArgs(TypedDict): diff --git a/llama_api/utils/process_pool.py b/llama_api/utils/process_pool.py index c3358d4..9b848e2 100644 --- a/llama_api/utils/process_pool.py +++ b/llama_api/utils/process_pool.py @@ -1,12 +1,12 @@ -from itertools import islice -from os import kill import pickle import queue -from signal import SIGINT import sys from concurrent.futures import Future from functools import partial +from itertools import islice from multiprocessing import Process, Queue, cpu_count +from os import kill +from signal import SIGINT from threading import Thread from time import sleep from traceback import format_exception @@ -23,7 +23,7 @@ Union, ) -from llama_api.utils.logger import ApiLogger +from ..utils.logger import ApiLogger if sys.version_info >= (3, 10): from typing import ParamSpec diff --git a/poetry.lock b/poetry.lock index a969ac9..fc82221 100644 --- a/poetry.lock +++ b/poetry.lock @@ -348,13 +348,13 @@ rapidfuzz = ">=2.2.0,<3.0.0" [[package]] name = "click" -version = "8.1.6" +version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"}, - {file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"}, + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, ] [package.dependencies] @@ -681,13 +681,13 @@ dev = ["flake8", "markdown", "twine", "wheel"] [[package]] name = "griffe" -version = "0.32.3" +version = "0.33.0" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." 
optional = false python-versions = ">=3.8" files = [ - {file = "griffe-0.32.3-py3-none-any.whl", hash = "sha256:d9471934225818bf8f309822f70451cc6abb4b24e59e0bb27402a45f9412510f"}, - {file = "griffe-0.32.3.tar.gz", hash = "sha256:14983896ad581f59d5ad7b6c9261ff12bdaa905acccc1129341d13e545da8521"}, + {file = "griffe-0.33.0-py3-none-any.whl", hash = "sha256:16af15d0140c0a5f0b2628d33235fef91a6d8a832dd7cff5759dfe6b7d7a7a49"}, + {file = "griffe-0.33.0.tar.gz", hash = "sha256:783bcb7e7f0d346fcb0cb8072667ca8f6ce7ff776bb278fbccf8a3a753a793e4"}, ] [package.dependencies] @@ -1283,17 +1283,17 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] [[package]] name = "mkdocstrings-python" -version = "1.3.0" +version = "1.4.0" description = "A Python handler for mkdocstrings." optional = false python-versions = ">=3.8" files = [ - {file = "mkdocstrings_python-1.3.0-py3-none-any.whl", hash = "sha256:36c224c86ab77e90e0edfc9fea3307f7d0d245dd7c28f48bbb2203cf6e125530"}, - {file = "mkdocstrings_python-1.3.0.tar.gz", hash = "sha256:f967f84bab530fcc13cc9c02eccf0c18bdb2c3bab5c55fa2045938681eec4fc4"}, + {file = "mkdocstrings_python-1.4.0-py3-none-any.whl", hash = "sha256:46f4b0ed8540c6bfd0c3f50471831a7bdb9a1bf35f24400525721d7555aa355c"}, + {file = "mkdocstrings_python-1.4.0.tar.gz", hash = "sha256:c92304c402928a05c793203dadee7a1a51b5ae56404fd594d0b2db49a7b3957a"}, ] [package.dependencies] -griffe = ">=0.30,<0.33" +griffe = ">=0.33" mkdocstrings = ">=0.20" [[package]] @@ -1458,28 +1458,71 @@ files = [ [[package]] name = "orjson" -version = "3.9.4" +version = "3.9.5" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.7" files = [ - {file = "orjson-3.9.4-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2e83ec1ee66d83b558a6d273d8a01b86563daa60bea9bc040e2c1cb8008de61f"}, - {file = "orjson-3.9.4-cp310-none-win32.whl", hash = "sha256:04cd7f4a4f4cd2fe43d104eb70e7435c6fcbdde7aa0cde4230e444fbc66924d3"}, - {file = "orjson-3.9.4-cp310-none-win_amd64.whl", hash = "sha256:4fdb59cfa00e10c82e09d1c32a9ce08a38bd29496ba20a73cd7f498e3a0a5024"}, - {file = "orjson-3.9.4-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:daeed2502ddf1f2b29ec8da2fe2ea82807a5c4acf869608ce6c476db8171d070"}, - {file = "orjson-3.9.4-cp311-none-win32.whl", hash = "sha256:e12492ce65cb10f385e70a88badc6046bc720fa7d468db27b7429d85d41beaeb"}, - {file = "orjson-3.9.4-cp311-none-win_amd64.whl", hash = "sha256:3b9f8bf43a5367d5522f80e7d533c98d880868cd0b640b9088c9237306eca6e8"}, - {file = "orjson-3.9.4-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:0b400cf89c15958cd829c8a4ade8f5dd73588e63d2fb71a00483e7a74e9f92da"}, - {file = "orjson-3.9.4-cp312-none-win_amd64.whl", hash = "sha256:a533e664a0e3904307d662c5d45775544dc2b38df6e39e213ff6a86ceaa3d53c"}, - {file = "orjson-3.9.4-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:149d1b7630771222f73ecb024ab5dd8e7f41502402b02015494d429bacc4d5c1"}, - {file = "orjson-3.9.4-cp37-none-win32.whl", hash = "sha256:bcda6179eb863c295eb5ea832676d33ef12c04d227b4c98267876c8322e5a96e"}, - {file = "orjson-3.9.4-cp37-none-win_amd64.whl", hash = "sha256:3d947366127abef192419257eb7db7fcee0841ced2b49ccceba43b65e9ce5e3f"}, - {file = "orjson-3.9.4-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:a7d029fc34a516f7eae29b778b30371fcb621134b2acfe4c51c785102aefc6cf"}, - {file = "orjson-3.9.4-cp38-none-win32.whl", hash = "sha256:94d15ee45c2aaed334688e511aa73b4681f7c08a0810884c6b3ae5824dea1222"}, - {file = "orjson-3.9.4-cp38-none-win_amd64.whl", hash = "sha256:336ec8471102851f0699198031924617b7a77baadea889df3ffda6000bd59f4c"}, - {file = "orjson-3.9.4-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2f57ccb50e9e123709e9f2d7b1a9e09e694e49d1fa5c5585e34b8e3f01929dc3"}, - {file = "orjson-3.9.4-cp39-none-win32.whl", hash = "sha256:b5b5038187b74e2d33e5caee8a7e83ddeb6a21da86837fa2aac95c69aeb366e6"}, - {file = "orjson-3.9.4-cp39-none-win_amd64.whl", hash = "sha256:915da36bc93ef0c659fa50fe7939d4f208804ad252fc4fc8d55adbbb82293c48"}, + {file = "orjson-3.9.5-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:ad6845912a71adcc65df7c8a7f2155eba2096cf03ad2c061c93857de70d699ad"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e298e0aacfcc14ef4476c3f409e85475031de24e5b23605a465e9bf4b2156273"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83c9939073281ef7dd7c5ca7f54cceccb840b440cec4b8a326bda507ff88a0a6"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e174cc579904a48ee1ea3acb7045e8a6c5d52c17688dfcb00e0e842ec378cabf"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f8d51702f42c785b115401e1d64a27a2ea767ae7cf1fb8edaa09c7cf1571c660"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f13d61c0c7414ddee1ef4d0f303e2222f8cced5a2e26d9774751aecd72324c9e"}, + {file = "orjson-3.9.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d748cc48caf5a91c883d306ab648df1b29e16b488c9316852844dd0fd000d1c2"}, + {file = "orjson-3.9.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bd19bc08fa023e4c2cbf8294ad3f2b8922f4de9ba088dbc71e6b268fdf54591c"}, + {file = "orjson-3.9.5-cp310-none-win32.whl", hash = "sha256:5793a21a21bf34e1767e3d61a778a25feea8476dcc0bdf0ae1bc506dc34561ea"}, + {file = "orjson-3.9.5-cp310-none-win_amd64.whl", hash = "sha256:2bcec0b1024d0031ab3eab7a8cb260c8a4e4a5e35993878a2da639d69cdf6a65"}, + {file = "orjson-3.9.5-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8547b95ca0e2abd17e1471973e6d676f1d8acedd5f8fb4f739e0612651602d66"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87ce174d6a38d12b3327f76145acbd26f7bc808b2b458f61e94d83cd0ebb4d76"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a960bb1bc9a964d16fcc2d4af5a04ce5e4dfddca84e3060c35720d0a062064fe"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a7aa5573a949760d6161d826d34dc36db6011926f836851fe9ccb55b5a7d8e8"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b2852afca17d7eea85f8e200d324e38c851c96598ac7b227e4f6c4e59fbd3df"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa185959c082475288da90f996a82e05e0c437216b96f2a8111caeb1d54ef926"}, + {file = "orjson-3.9.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:89c9332695b838438ea4b9a482bce8ffbfddde4df92750522d928fb00b7b8dce"}, + {file 
= "orjson-3.9.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2493f1351a8f0611bc26e2d3d407efb873032b4f6b8926fed8cfed39210ca4ba"}, + {file = "orjson-3.9.5-cp311-none-win32.whl", hash = "sha256:ffc544e0e24e9ae69301b9a79df87a971fa5d1c20a6b18dca885699709d01be0"}, + {file = "orjson-3.9.5-cp311-none-win_amd64.whl", hash = "sha256:89670fe2732e3c0c54406f77cad1765c4c582f67b915c74fda742286809a0cdc"}, + {file = "orjson-3.9.5-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:15df211469625fa27eced4aa08dc03e35f99c57d45a33855cc35f218ea4071b8"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9f17c59fe6c02bc5f89ad29edb0253d3059fe8ba64806d789af89a45c35269a"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca6b96659c7690773d8cebb6115c631f4a259a611788463e9c41e74fa53bf33f"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a26fafe966e9195b149950334bdbe9026eca17fe8ffe2d8fa87fdc30ca925d30"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9006b1eb645ecf460da067e2dd17768ccbb8f39b01815a571bfcfab7e8da5e52"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebfdbf695734b1785e792a1315e41835ddf2a3e907ca0e1c87a53f23006ce01d"}, + {file = "orjson-3.9.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4a3943234342ab37d9ed78fb0a8f81cd4b9532f67bf2ac0d3aa45fa3f0a339f3"}, + {file = "orjson-3.9.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e6762755470b5c82f07b96b934af32e4d77395a11768b964aaa5eb092817bc31"}, + {file = "orjson-3.9.5-cp312-none-win_amd64.whl", hash = "sha256:c74df28749c076fd6e2157190df23d43d42b2c83e09d79b51694ee7315374ad5"}, + {file = "orjson-3.9.5-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:88e18a74d916b74f00d0978d84e365c6bf0e7ab846792efa15756b5fb2f7d49d"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d28514b5b6dfaf69097be70d0cf4f1407ec29d0f93e0b4131bf9cc8fd3f3e374"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b81aca8c7be61e2566246b6a0ca49f8aece70dd3f38c7f5c837f398c4cb142"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:385c1c713b1e47fd92e96cf55fd88650ac6dfa0b997e8aa7ecffd8b5865078b1"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9850c03a8e42fba1a508466e6a0f99472fd2b4a5f30235ea49b2a1b32c04c11"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4449f84bbb13bcef493d8aa669feadfced0f7c5eea2d0d88b5cc21f812183af8"}, + {file = "orjson-3.9.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:86127bf194f3b873135e44ce5dc9212cb152b7e06798d5667a898a00f0519be4"}, + {file = "orjson-3.9.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0abcd039f05ae9ab5b0ff11624d0b9e54376253b7d3217a358d09c3edf1d36f7"}, + {file = "orjson-3.9.5-cp37-none-win32.whl", hash = "sha256:10cc8ad5ff7188efcb4bec196009d61ce525a4e09488e6d5db41218c7fe4f001"}, + {file = "orjson-3.9.5-cp37-none-win_amd64.whl", hash = "sha256:ff27e98532cb87379d1a585837d59b187907228268e7b0a87abe122b2be6968e"}, + {file = "orjson-3.9.5-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:5bfa79916ef5fef75ad1f377e54a167f0de334c1fa4ebb8d0224075f3ec3d8c0"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e87dfa6ac0dae764371ab19b35eaaa46dfcb6ef2545dfca03064f21f5d08239f"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:50ced24a7b23058b469ecdb96e36607fc611cbaee38b58e62a55c80d1b3ad4e1"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1b74ea2a3064e1375da87788897935832e806cc784de3e789fd3c4ab8eb3fa5"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7cb961efe013606913d05609f014ad43edfaced82a576e8b520a5574ce3b2b9"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1225d2d5ee76a786bda02f8c5e15017462f8432bb960de13d7c2619dba6f0275"}, + {file = "orjson-3.9.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f39f4b99199df05c7ecdd006086259ed25886cdbd7b14c8cdb10c7675cfcca7d"}, + {file = "orjson-3.9.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a461dc9fb60cac44f2d3218c36a0c1c01132314839a0e229d7fb1bba69b810d8"}, + {file = "orjson-3.9.5-cp38-none-win32.whl", hash = "sha256:dedf1a6173748202df223aea29de814b5836732a176b33501375c66f6ab7d822"}, + {file = "orjson-3.9.5-cp38-none-win_amd64.whl", hash = "sha256:fa504082f53efcbacb9087cc8676c163237beb6e999d43e72acb4bb6f0db11e6"}, + {file = "orjson-3.9.5-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6900f0248edc1bec2a2a3095a78a7e3ef4e63f60f8ddc583687eed162eedfd69"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17404333c40047888ac40bd8c4d49752a787e0a946e728a4e5723f111b6e55a5"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0eefb7cfdd9c2bc65f19f974a5d1dfecbac711dae91ed635820c6b12da7a3c11"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:68c78b2a3718892dc018adbc62e8bab6ef3c0d811816d21e6973dee0ca30c152"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:591ad7d9e4a9f9b104486ad5d88658c79ba29b66c5557ef9edf8ca877a3f8d11"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6cc2cbf302fbb2d0b2c3c142a663d028873232a434d89ce1b2604ebe5cc93ce8"}, + {file = "orjson-3.9.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b26b5aa5e9ee1bad2795b925b3adb1b1b34122cb977f30d89e0a1b3f24d18450"}, + {file = "orjson-3.9.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ef84724f7d29dcfe3aafb1fc5fc7788dca63e8ae626bb9298022866146091a3e"}, + {file = "orjson-3.9.5-cp39-none-win32.whl", hash = "sha256:664cff27f85939059472afd39acff152fbac9a091b7137092cb651cf5f7747b5"}, + {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, + {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] [[package]] @@ -1642,24 +1685,24 @@ poetry-core = ">=1.6.0,<2.0.0" [[package]] name = "protobuf" -version = "4.24.0" +version = "4.24.1" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "protobuf-4.24.0-cp310-abi3-win32.whl", hash = "sha256:81cb9c4621d2abfe181154354f63af1c41b00a4882fb230b4425cbaed65e8f52"}, - {file = "protobuf-4.24.0-cp310-abi3-win_amd64.whl", hash = 
"sha256:6c817cf4a26334625a1904b38523d1b343ff8b637d75d2c8790189a4064e51c3"}, - {file = "protobuf-4.24.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:ae97b5de10f25b7a443b40427033e545a32b0e9dda17bcd8330d70033379b3e5"}, - {file = "protobuf-4.24.0-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:567fe6b0647494845d0849e3d5b260bfdd75692bf452cdc9cb660d12457c055d"}, - {file = "protobuf-4.24.0-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:a6b1ca92ccabfd9903c0c7dde8876221dc7d8d87ad5c42e095cc11b15d3569c7"}, - {file = "protobuf-4.24.0-cp37-cp37m-win32.whl", hash = "sha256:a38400a692fd0c6944c3c58837d112f135eb1ed6cdad5ca6c5763336e74f1a04"}, - {file = "protobuf-4.24.0-cp37-cp37m-win_amd64.whl", hash = "sha256:5ab19ee50037d4b663c02218a811a5e1e7bb30940c79aac385b96e7a4f9daa61"}, - {file = "protobuf-4.24.0-cp38-cp38-win32.whl", hash = "sha256:e8834ef0b4c88666ebb7c7ec18045aa0f4325481d724daa624a4cf9f28134653"}, - {file = "protobuf-4.24.0-cp38-cp38-win_amd64.whl", hash = "sha256:8bb52a2be32db82ddc623aefcedfe1e0eb51da60e18fcc908fb8885c81d72109"}, - {file = "protobuf-4.24.0-cp39-cp39-win32.whl", hash = "sha256:ae7a1835721086013de193311df858bc12cd247abe4ef9710b715d930b95b33e"}, - {file = "protobuf-4.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:44825e963008f8ea0d26c51911c30d3e82e122997c3c4568fd0385dd7bacaedf"}, - {file = "protobuf-4.24.0-py3-none-any.whl", hash = "sha256:82e6e9ebdd15b8200e8423676eab38b774624d6a1ad696a60d86a2ac93f18201"}, - {file = "protobuf-4.24.0.tar.gz", hash = "sha256:5d0ceb9de6e08311832169e601d1fc71bd8e8c779f3ee38a97a78554945ecb85"}, + {file = "protobuf-4.24.1-cp310-abi3-win32.whl", hash = "sha256:d414199ca605eeb498adc4d2ba82aedc0379dca4a7c364ff9bc9a179aa28e71b"}, + {file = "protobuf-4.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:5906c5e79ff50fe38b2d49d37db5874e3c8010826f2362f79996d83128a8ed9b"}, + {file = "protobuf-4.24.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:970c701ee16788d74f3de20938520d7a0aebc7e4fff37096a48804c80d2908cf"}, + {file = "protobuf-4.24.1-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:fc361148e902949dcb953bbcb148c99fe8f8854291ad01107e4120361849fd0e"}, + {file = "protobuf-4.24.1-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:5d32363d14aca6e5c9e9d5918ad8fb65b091b6df66740ae9de50ac3916055e43"}, + {file = "protobuf-4.24.1-cp37-cp37m-win32.whl", hash = "sha256:df015c47d6855b8efa0b9be706c70bf7f050a4d5ac6d37fb043fbd95157a0e25"}, + {file = "protobuf-4.24.1-cp37-cp37m-win_amd64.whl", hash = "sha256:d4af4fd9e9418e819be30f8df2a16e72fbad546a7576ac7f3653be92a6966d30"}, + {file = "protobuf-4.24.1-cp38-cp38-win32.whl", hash = "sha256:302e8752c760549ed4c7a508abc86b25d46553c81989343782809e1a062a2ef9"}, + {file = "protobuf-4.24.1-cp38-cp38-win_amd64.whl", hash = "sha256:06437f0d4bb0d5f29e3d392aba69600188d4be5ad1e0a3370e581a9bf75a3081"}, + {file = "protobuf-4.24.1-cp39-cp39-win32.whl", hash = "sha256:0b2b224e9541fe9f046dd7317d05f08769c332b7e4c54d93c7f0f372dedb0b1a"}, + {file = "protobuf-4.24.1-cp39-cp39-win_amd64.whl", hash = "sha256:bd39b9094a4cc003a1f911b847ab379f89059f478c0b611ba1215053e295132e"}, + {file = "protobuf-4.24.1-py3-none-any.whl", hash = "sha256:55dd644adc27d2a624339332755fe077c7f26971045b469ebb9732a69ce1f2ca"}, + {file = "protobuf-4.24.1.tar.gz", hash = "sha256:44837a5ed9c9418ad5d502f89f28ba102e9cd172b6668bc813f21716f9273348"}, ] [[package]] @@ -1723,18 +1766,18 @@ files = [ [[package]] name = "pydantic" -version = "2.1.1" +version = "2.2.1" description = "Data validation using Python type hints" optional = false python-versions = 
">=3.7" files = [ - {file = "pydantic-2.1.1-py3-none-any.whl", hash = "sha256:43bdbf359d6304c57afda15c2b95797295b702948082d4c23851ce752f21da70"}, - {file = "pydantic-2.1.1.tar.gz", hash = "sha256:22d63db5ce4831afd16e7c58b3192d3faf8f79154980d9397d9867254310ba4b"}, + {file = "pydantic-2.2.1-py3-none-any.whl", hash = "sha256:0c88bd2b63ed7a5109c75ab180d55f58f80a4b559682406812d0684d3f4b9192"}, + {file = "pydantic-2.2.1.tar.gz", hash = "sha256:31b5cada74b2320999fb2577e6df80332a200ff92e7775a52448b6b036fce24a"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.4.0" +pydantic-core = "2.6.1" typing-extensions = ">=4.6.1" [package.extras] @@ -1742,112 +1785,117 @@ email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.4.0" +version = "2.6.1" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic_core-2.4.0-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:2ca4687dd996bde7f3c420def450797feeb20dcee2b9687023e3323c73fc14a2"}, - {file = "pydantic_core-2.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:782fced7d61469fd1231b184a80e4f2fa7ad54cd7173834651a453f96f29d673"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6213b471b68146af97b8551294e59e7392c2117e28ffad9c557c65087f4baee3"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63797499a219d8e81eb4e0c42222d0a4c8ec896f5c76751d4258af95de41fdf1"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_24_armv7l.whl", hash = "sha256:0455876d575a35defc4da7e0a199596d6c773e20d3d42fa1fc29f6aa640369ed"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:8c938c96294d983dcf419b54dba2d21056959c22911d41788efbf949a29ae30d"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_24_s390x.whl", hash = "sha256:878a5017d93e776c379af4e7b20f173c82594d94fa073059bcc546789ad50bf8"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69159afc2f2dc43285725f16143bc5df3c853bc1cb7df6021fce7ef1c69e8171"}, - {file = "pydantic_core-2.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:54df7df399b777c1fd144f541c95d351b3aa110535a6810a6a569905d106b6f3"}, - {file = "pydantic_core-2.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e412607ca89a0ced10758dfb8f9adcc365ce4c1c377e637c01989a75e9a9ec8a"}, - {file = "pydantic_core-2.4.0-cp310-none-win32.whl", hash = "sha256:853f103e2b9a58832fdd08a587a51de8b552ae90e1a5d167f316b7eabf8d7dde"}, - {file = "pydantic_core-2.4.0-cp310-none-win_amd64.whl", hash = "sha256:3ba2c9c94a9176f6321a879c8b864d7c5b12d34f549a4c216c72ce213d7d953c"}, - {file = "pydantic_core-2.4.0-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:a8b7acd04896e8f161e1500dc5f218017db05c1d322f054e89cbd089ce5d0071"}, - {file = "pydantic_core-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16468bd074fa4567592d3255bf25528ed41e6b616d69bf07096bdb5b66f947d1"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cba5ad5eef02c86a1f3da00544cbc59a510d596b27566479a7cd4d91c6187a11"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7206e41e04b443016e930e01685bab7a308113c0b251b3f906942c8d4b48fcb"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_24_armv7l.whl", hash = "sha256:c1375025f0bfc9155286ebae8eecc65e33e494c90025cda69e247c3ccd2bab00"}, - {file = 
"pydantic_core-2.4.0-cp311-cp311-manylinux_2_24_ppc64le.whl", hash = "sha256:3534118289e33130ed3f1cc487002e8d09b9f359be48b02e9cd3de58ce58fba9"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_24_s390x.whl", hash = "sha256:94d2b36a74623caab262bf95f0e365c2c058396082bd9d6a9e825657d0c1e7fa"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:af24ad4fbaa5e4a2000beae0c3b7fd1c78d7819ab90f9370a1cfd8998e3f8a3c"}, - {file = "pydantic_core-2.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bf10963d8aed8bbe0165b41797c9463d4c5c8788ae6a77c68427569be6bead41"}, - {file = "pydantic_core-2.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:68199ada7c310ddb8c76efbb606a0de656b40899388a7498954f423e03fc38be"}, - {file = "pydantic_core-2.4.0-cp311-none-win32.whl", hash = "sha256:6f855bcc96ed3dd56da7373cfcc9dcbabbc2073cac7f65c185772d08884790ce"}, - {file = "pydantic_core-2.4.0-cp311-none-win_amd64.whl", hash = "sha256:de39eb3bab93a99ddda1ac1b9aa331b944d8bcc4aa9141148f7fd8ee0299dafc"}, - {file = "pydantic_core-2.4.0-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:f773b39780323a0499b53ebd91a28ad11cde6705605d98d999dfa08624caf064"}, - {file = "pydantic_core-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a297c0d6c61963c5c3726840677b798ca5b7dfc71bc9c02b9a4af11d23236008"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:546064c55264156b973b5e65e5fafbe5e62390902ce3cf6b4005765505e8ff56"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36ba9e728588588f0196deaf6751b9222492331b5552f865a8ff120869d372e0"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_24_armv7l.whl", hash = "sha256:57a53a75010c635b3ad6499e7721eaa3b450e03f6862afe2dbef9c8f66e46ec8"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_24_ppc64le.whl", hash = "sha256:4b262bbc13022f2097c48a21adcc360a81d83dc1d854c11b94953cd46d7d3c07"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_24_s390x.whl", hash = "sha256:01947ad728f426fa07fcb26457ebf90ce29320259938414bc0edd1476e75addb"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b2799c2eaf182769889761d4fb4d78b82bc47dae833799fedbf69fc7de306faa"}, - {file = "pydantic_core-2.4.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a08fd490ba36d1fbb2cd5dcdcfb9f3892deb93bd53456724389135712b5fc735"}, - {file = "pydantic_core-2.4.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1e8a7c62d15a5c4b307271e4252d76ebb981d6251c6ecea4daf203ef0179ea4f"}, - {file = "pydantic_core-2.4.0-cp312-none-win32.whl", hash = "sha256:9206c14a67c38de7b916e486ae280017cf394fa4b1aa95cfe88621a4e1d79725"}, - {file = "pydantic_core-2.4.0-cp312-none-win_amd64.whl", hash = "sha256:884235507549a6b2d3c4113fb1877ae263109e787d9e0eb25c35982ab28d0399"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:4cbe929efa77a806e8f1a97793f2dc3ea3475ae21a9ed0f37c21320fe93f6f50"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:9137289de8fe845c246a8c3482dd0cb40338846ba683756d8f489a4bd8fddcae"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5d8e764b5646623e57575f624f8ebb8f7a9f7fd1fae682ef87869ca5fec8dcf"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8fba0aff4c407d0274e43697e785bcac155ad962be57518d1c711f45e72da70f"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_24_armv7l.whl", hash = "sha256:30527d173e826f2f7651f91c821e337073df1555e3b5a0b7b1e2c39e26e50678"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:bd7d1dde70ff3e09e4bc7a1cbb91a7a538add291bfd5b3e70ef1e7b45192440f"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_24_s390x.whl", hash = "sha256:72f1216ca8cef7b8adacd4c4c6b89c3b0c4f97503197f5284c80f36d6e4edd30"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b013c7861a7c7bfcec48fd709513fea6f9f31727e7a0a93ca0dd12e056740717"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:478f5f6d7e32bd4a04d102160efb2d389432ecf095fe87c555c0a6fc4adfc1a4"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d9610b47b5fe4aacbbba6a9cb5f12cbe864eec99dbfed5710bd32ef5dd8a5d5b"}, - {file = "pydantic_core-2.4.0-cp37-none-win32.whl", hash = "sha256:ff246c0111076c8022f9ba325c294f2cb5983403506989253e04dbae565e019b"}, - {file = "pydantic_core-2.4.0-cp37-none-win_amd64.whl", hash = "sha256:d0c2b713464a8e263a243ae7980d81ce2de5ac59a9f798a282e44350b42dc516"}, - {file = "pydantic_core-2.4.0-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:12ef6838245569fd60a179fade81ca4b90ae2fa0ef355d616f519f7bb27582db"}, - {file = "pydantic_core-2.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49db206eb8fdc4b4f30e6e3e410584146d813c151928f94ec0db06c4f2595538"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a507d7fa44688bbac76af6521e488b3da93de155b9cba6f2c9b7833ce243d59"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffe18407a4d000c568182ce5388bbbedeb099896904e43fc14eee76cfae6dec5"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_24_armv7l.whl", hash = "sha256:fa8e48001b39d54d97d7b380a0669fa99fc0feeb972e35a2d677ba59164a9a22"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:394f12a2671ff8c4dfa2e85be6c08be0651ad85bc1e6aa9c77c21671baaf28cd"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_24_s390x.whl", hash = "sha256:2f9ea0355f90db2a76af530245fa42f04d98f752a1236ed7c6809ec484560d5b"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:61d4e713f467abcdd59b47665d488bb898ad3dd47ce7446522a50e0cbd8e8279"}, - {file = "pydantic_core-2.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:453862ab268f6326b01f067ed89cb3a527d34dc46f6f4eeec46a15bbc706d0da"}, - {file = "pydantic_core-2.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:56a85fa0dab1567bd0cac10f0c3837b03e8a0d939e6a8061a3a420acd97e9421"}, - {file = "pydantic_core-2.4.0-cp38-none-win32.whl", hash = "sha256:0d726108c1c0380b88b6dd4db559f0280e0ceda9e077f46ff90bc85cd4d03e77"}, - {file = "pydantic_core-2.4.0-cp38-none-win_amd64.whl", hash = "sha256:047580388644c473b934d27849f8ed8dbe45df0adb72104e78b543e13bf69762"}, - {file = "pydantic_core-2.4.0-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:867d3eea954bea807cabba83cfc939c889a18576d66d197c60025b15269d7cc0"}, - {file = "pydantic_core-2.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:664402ef0c238a7f8a46efb101789d5f2275600fb18114446efec83cfadb5b66"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:64e8012ad60a5f0da09ed48725e6e923d1be25f2f091a640af6079f874663813"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac2b680de398f293b68183317432b3d67ab3faeba216aec18de0c395cb5e3060"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_24_armv7l.whl", hash = "sha256:8efc1be43b036c2b6bcfb1451df24ee0ddcf69c31351003daf2699ed93f5687b"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:d93aedbc4614cc21b9ab0d0c4ccd7143354c1f7cffbbe96ae5216ad21d1b21b5"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_24_s390x.whl", hash = "sha256:af788b64e13d52fc3600a68b16d31fa8d8573e3ff2fc9a38f8a60b8d94d1f012"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97c6349c81cee2e69ef59eba6e6c08c5936e6b01c2d50b9e4ac152217845ae09"}, - {file = "pydantic_core-2.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc086ddb6dc654a15deeed1d1f2bcb1cb924ebd70df9dca738af19f64229b06c"}, - {file = "pydantic_core-2.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e953353180bec330c3b830891d260b6f8e576e2d18db3c78d314e56bb2276066"}, - {file = "pydantic_core-2.4.0-cp39-none-win32.whl", hash = "sha256:6feb4b64d11d5420e517910d60a907d08d846cacaf4e029668725cd21d16743c"}, - {file = "pydantic_core-2.4.0-cp39-none-win_amd64.whl", hash = "sha256:153a61ac4030fa019b70b31fb7986461119230d3ba0ab661c757cfea652f4332"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:3fcf529382b282a30b466bd7af05be28e22aa620e016135ac414f14e1ee6b9e1"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2edef05b63d82568b877002dc4cb5cc18f8929b59077120192df1e03e0c633f8"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da055a1b0bfa8041bb2ff586b2cb0353ed03944a3472186a02cc44a557a0e661"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:77dadc764cf7c5405e04866181c5bd94a447372a9763e473abb63d1dfe9b7387"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a4ea23b07f29487a7bef2a869f68c7ee0e05424d81375ce3d3de829314c6b5ec"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:382f0baa044d674ad59455a5eff83d7965572b745cc72df35c52c2ce8c731d37"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:08f89697625e453421401c7f661b9d1eb4c9e4c0a12fd256eeb55b06994ac6af"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:43a405ce520b45941df9ff55d0cd09762017756a7b413bbad3a6e8178e64a2c2"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:584a7a818c84767af16ce8bda5d4f7fedb37d3d231fc89928a192f567e4ef685"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:04922fea7b13cd480586fa106345fe06e43220b8327358873c22d8dfa7a711c7"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17156abac20a9feed10feec867fddd91a80819a485b0107fe61f09f2117fe5f3"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e562cc63b04636cde361fd47569162f1daa94c759220ff202a8129902229114"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:90f3785146f701e053bb6b9e8f53acce2c919aca91df88bd4975be0cb926eb41"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:e40b1e97edd3dc127aa53d8a5e539a3d0c227d71574d3f9ac1af02d58218a122"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:b27f3e67f6e031f6620655741b7d0d6bebea8b25d415924b3e8bfef2dd7bd841"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be86c2eb12fb0f846262ace9d8f032dc6978b8cb26a058920ecb723dbcb87d05"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4665f7ed345012a8d2eddf4203ef145f5f56a291d010382d235b94e91813f88a"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:79262be5a292d1df060f29b9a7cdd66934801f987a817632d7552534a172709a"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:5fd905a69ac74eaba5041e21a1e8b1a479dab2b41c93bdcc4c1cede3c12a8d86"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:2ad538b7e07343001934417cdc8584623b4d8823c5b8b258e75ec8d327cec969"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:dd2429f7635ad4857b5881503f9c310be7761dc681c467a9d27787b674d1250a"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:efff8b6761a1f6e45cebd1b7a6406eb2723d2d5710ff0d1b624fe11313693989"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32a1e0352558cd7ccc014ffe818c7d87b15ec6145875e2cc5fa4bb7351a1033d"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a027f41c5008571314861744d83aff75a34cf3a07022e0be32b214a5bc93f7f1"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1927f0e15d190f11f0b8344373731e28fd774c6d676d8a6cfadc95c77214a48b"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:7aa82d483d5fb867d4fb10a138ffd57b0f1644e99f2f4f336e48790ada9ada5e"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b85778308bf945e9b33ac604e6793df9b07933108d20bdf53811bc7c2798a4af"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3ded19dcaefe2f6706d81e0db787b59095f4ad0fbadce1edffdf092294c8a23f"}, - {file = "pydantic_core-2.4.0.tar.gz", hash = "sha256:ec3473c9789cc00c7260d840c3db2c16dbfc816ca70ec87a00cddfa3e1a1cdd5"}, + {file = "pydantic_core-2.6.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:f55001a689111a297c0006c46c0589cfd559261baaa9a37bc35eff05b8cae1a6"}, + {file = "pydantic_core-2.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bb6273068e9450c5c91f58dd277fbd406b896ffa30f0ef312edc5519d07f16ae"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:043212f21c75cb6ee3a92fffbc747410e32b08e1a419ce16a9da98a16d660a7c"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:db0c12f1e9d3bf658634621f3423486803d749fef77a64cfb4252f9d619e1817"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:81424dc05c4342a19fb64323bb9d4468e7407b745c00377ccc4d3dd96d5e02fe"}, + {file = 
"pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c8f3aebaf92f088b1dafd7101d1ccca0459ae0f5b26017411b9969667d289a9"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd9f14454b4bc89c705ce17951f9c783db82efd2b44a424487c593e2269eef61"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2effc71653247e76c5b95d15c58d4ca3f591f42f714eb3b32df9d6ec613794a5"}, + {file = "pydantic_core-2.6.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:56672429f8a89d2a0f4402d912f0dad68c2d05f7c278d3152c6fb4a76c2a429a"}, + {file = "pydantic_core-2.6.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d0bf1c2545ab253732229c7fe8294d98eb08f99aa25a388267e1bc4d2d7e0a34"}, + {file = "pydantic_core-2.6.1-cp310-none-win32.whl", hash = "sha256:c5be947ad41a7602f941dc834d03e64dd1c7fae65fa85cb4f1004a95c5d50df1"}, + {file = "pydantic_core-2.6.1-cp310-none-win_amd64.whl", hash = "sha256:3d14ae98a8d251402ef8ed017039d2fc3e29fb155f909cd3816ba259fd30fb48"}, + {file = "pydantic_core-2.6.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:4a3c20808d3ced90e29439f72a563eadf21d29560935cc818b2dab80b92c114a"}, + {file = "pydantic_core-2.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:da240bbd8191edc6009e7793d5d4d67c55f56225c4788f068d6286c20e5a2038"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de1a3e56e34264d5216c67d2a48185216ada8f5f35a7f4c96a3971847c0de897"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9b623e09239ed333d14c02c9fcd1a7bb350b95eca8383f6e9b0d8e373d5a14b5"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a12520a6d502a25f6e47319874e47056b290f1b3c2ed9391444ce81c8cc5b83"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1141f18414aee8865c7917ae1432e419c1983272f53625152493692ff3d6783"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7888b3ee7566865cff3e9edab5d6cdf2e7cf793df17fe53d5e7be3e57eae45ec"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3bdf293b6304bc451678b7016c2505b7d97aa85ff13dac4420027b1b69e15d3d"}, + {file = "pydantic_core-2.6.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7ef56a05bb60336d5e795bf166d6712b2362e6478522c77e8336cb0da8909913"}, + {file = "pydantic_core-2.6.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3210eb73707e3487c16ef25cfd1663660f4e7d647a181d6c2fb18bc6167985fb"}, + {file = "pydantic_core-2.6.1-cp311-none-win32.whl", hash = "sha256:707e3005e8c129bdac117285b71717c13b9ed81a81eae0b1642f4ddc60028e63"}, + {file = "pydantic_core-2.6.1-cp311-none-win_amd64.whl", hash = "sha256:2b8ccec2189d8a8b83929f79e5bc00c0656f6c2ba4345125c0c82d1b77e15a26"}, + {file = "pydantic_core-2.6.1-cp311-none-win_arm64.whl", hash = "sha256:c1e44b77442fb5b1b6fccea30e3359b14d0a2e5896801243defe54482a591500"}, + {file = "pydantic_core-2.6.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:c82fb25f965f6777032fc2f2856c86149f7709c8f7fd0c020a8631b8211f2bab"}, + {file = "pydantic_core-2.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:494b211b12b8fedd184dbba609f6ed582e23561db57c1996fd6773989dbaef9b"}, + {file = 
"pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1281c940f47e5c89b594ef7580045647df1f9ad687edd503bcc0485be94576f4"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2d41701c88d8b678c16c10562949f2d28aceacd767cbe51dac9c8c41e6e609fb"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a839c95d5cc91eed053d8dafde4e200c4bc82f56fb1cf7bbfaeb03e2d907929"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c22e4fbfb5823d0fcb2c20ed164b39c3588554f9635f70765e8c9cff0fef67ad"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2fed4ad60ccf2698bd04e95dfc3bd84149ced9605a29fd27d624701e1da300c"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33b9343aa464d60c31937b361abde08d3af9943f3eb09d3216211b6236bd40c4"}, + {file = "pydantic_core-2.6.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:56e4953cd911293d6d755e2a97c651826aca76201db8f1ee298939e703721390"}, + {file = "pydantic_core-2.6.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cd163109047ab41ef1ea34258b35beb3ccac90af2012927ee8ab6ff122fef671"}, + {file = "pydantic_core-2.6.1-cp312-none-win32.whl", hash = "sha256:f5b51ec04743c94288c46e3759769611ab7c5ce0f941113363da96d20d345fb6"}, + {file = "pydantic_core-2.6.1-cp312-none-win_amd64.whl", hash = "sha256:ca5606bd82e255b1d704a4334e5ebf05ae966b69686fae02dcd31c057bdcb113"}, + {file = "pydantic_core-2.6.1-cp312-none-win_arm64.whl", hash = "sha256:dfc8f534a21b60b00f87e5a4fc36b8b8945160a6cc9e7b6e67db541c766c9597"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:b1aed20778092f8334c8eaf91550fa2805221d5e9b40ebdd1f46ee7efc159a48"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:64ff7a4b7ee2a56735af28da76c5dacbba6995801080f739d14610f4aa3de35d"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2d8faedb138c704957642fdf154c94f1b3d2a15cbd2472e45665f80463e85ee"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:55aac69d7339a63e37164f0a629c3034becc6746d68d126118a3ee4493514bed"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfdb1617af455a551be4cc0471f0bf3bfb1e882db71afad0e587c821326bb749"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aadc84f5bd7b1421b5a6b389ceff46062dd4a58c44cfb75990e9ca2d9d8270df"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1a01dce87507b9a8f1b71933ade85c573a22c9bd4649590e28d8a497afb68bd"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cd6f05f3e237ed6b3949464e7679e55843645fe0fe8d3b33277c321386836f6a"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:760f8a0aeb43ceeff1e536859e071a72e91075d4d37d51470812c4f49e682702"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a1ad48e77935d7dbbc2d75aeb638abbfbd0df0cfacf774dbe98d52271468f00c"}, + {file = "pydantic_core-2.6.1-cp37-none-win32.whl", hash = "sha256:153a5dd24c09ab7544beda967366afbaae8350b327a4ebd5807ed45ec791baa0"}, + {file = 
"pydantic_core-2.6.1-cp37-none-win_amd64.whl", hash = "sha256:cc7fc3e81b4ea6bce7e0e1d9797f496e957c5e66adf483f89afdce2d81d19986"}, + {file = "pydantic_core-2.6.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:5482d692ae37857695feccb179022728b275b7bfcc1c85bcdf7b556e76bffcd8"}, + {file = "pydantic_core-2.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:45d248c3c5c5c23a8d048cfdebc8151ae7b32a6dc6d68fbca995521e54692207"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6dd6c9f47e26779bf1f7da4d6ccd60f66973e63b0a143438f1e20bae296c3fde"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:55701608e60418a423db2486b5c64d790f86eb78a11b9077efb6302c50e62564"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:420a76a62dd20a6ef08445abf7cf04dcd8a845a5bb15932c2e88a8e518c70d43"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f253d20314e53ba0fb2b95541b6ed23f44fbcd927fe7674de341545c3327c3d"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5127b811c6a26deb85f5b17a06c26c28ce204e51e0a963b75bdf8612b22546d"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:51ffa985b874ca7d0dc199bb75c67b77907379291c91532a9e2d981f7b681527"}, + {file = "pydantic_core-2.6.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4902300e763a2fcc49ae14366493ef1fdbd3c7128b9acf37aef505f671aa681f"}, + {file = "pydantic_core-2.6.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e1c69334bb843c9bff98f52a1fa6c06420081a561fcecb03c6b9376960bd7de2"}, + {file = "pydantic_core-2.6.1-cp38-none-win32.whl", hash = "sha256:e84812b1ca989b2e9f4913d7b75ae0eece2a90154de35b4c5411ad640bfd387c"}, + {file = "pydantic_core-2.6.1-cp38-none-win_amd64.whl", hash = "sha256:775098e3629a959dfec8444667a53e0916839e9fbf6b55e07d6e2aadde006400"}, + {file = "pydantic_core-2.6.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:a32ed5a794918a61bf77b967c197eb78f31ad4e3145860193dc381bde040717e"}, + {file = "pydantic_core-2.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:66eda8ac48ac33e9e5c6541c8e30c702924b70a6f2e9732b74230d9b2dd35fb6"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb5131d75d69b0547ef9a8f46f7b94857411c9badcdd5092de61a3b4943f08c7"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20e850f3242d7836a5e15453f798d8569b9754350c8e184ba32d102c515dd507"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f4327fa6a1ac3da62b27d43bb0f27657ed4e601b141ecbfcf8523814b6c33b6"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c7b89b2875b967ad5c3c980bf72773851554f80c2529796e815a10c99295d872"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78eadd8d7d5cd8c3616e363c394d721437c339feaa4c28404e2eda79add69781"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17ab25bb24e98b61d120b7248c2b49ea56ce754a050d6b348be42015fcb7aa25"}, + {file = "pydantic_core-2.6.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6ea8dd2854fe6cee5ea0d60304ee7877dffe487cf118f221e85029269dd1235d"}, + {file = 
"pydantic_core-2.6.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9bf3ba6b4878ee692f6e24230801f682807fd97356bc2064f630fc0a2ad2ead6"}, + {file = "pydantic_core-2.6.1-cp39-none-win32.whl", hash = "sha256:b974d65692333931b4c7f730e7a3135ff854a1e5384bc260de3327ea364c835a"}, + {file = "pydantic_core-2.6.1-cp39-none-win_amd64.whl", hash = "sha256:f34f26d8a5f1a45366189ec30a57f43b21e2172d0d3b62822638dd885cc8eaab"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:f7ec4c6edafa3f0eb1aa461e31ea263736cc541b2459dddfbda7085b30844801"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:3679b9a1f41eb1b699e9556f91281d78c416cdc59ae90d5733fbe2017f1effe9"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3ff36f945342086ee917d4219dd0e59660a2dfcdb86a07696c2791f5d59c07d"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:734864605d722a6f8db3b9c96371710f7cb591fbfca40cfeaedf5b67df282438"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7188359b95a2b1aef5744a2ee6af2d9cfc733dd823f8840f4c896129477a172b"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:382d40843ae759d43ef65b67dec713390f9417135c1dd730afbf03cf2f450f45"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:4525b8498d362e4e324e3e175239b364768f52bd3563ac4ef9750160f5789de8"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e55514a022c768cccf07a675d20d07b847980dcd9250f6b516a86bab5612fc01"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:34734d486d059f0f6f5bfa9ba4a41449f666e2abbde002e9fa8b050bc50e3347"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a809498dceb0cd1cd1e57a2bfdc70ea82f424776e0196f4d63c4b6fcdaeb5aab"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:588a5ffd8bbf1b2230611ed1b45221adcf05b981037b2f853b5f20465849b5c1"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:26b81017aeae0d96f776fbce34a3a763d26ac575d8ad3f1202bdfdd2b935954b"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:7ddaa2c3c66682f0ff4ebc8c85ef2d8305f32deba79416464c47c93d94ca3740"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d6971131de66d1a37293f2e032206b6984b0dec44f568b453dfe89a84a2de0cc"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:200704f6824f8014bdccb1ce57cbd328666e6de4ecd77f0b8ab472cdea9c49ce"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:6916b27072c957947919fb32551f08486562bb8616f2e3db9e4e9c1d83d36886"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:136de286abf53f326b90389aaaca8a8050c2570adfc74afe06ab1c35d5d242bf"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60a238bb4ab09a81a6b25c9a0bb12756cfab2d9f3a7a471f857a179f83da0df6"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:2034d9b83a59b3b74b9dbf97ddb99de86c08863c1c33aabf80bc95791c7d50c3"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7c3a2b4d1636446dc71da1e949d2cf9ac1ee691ca63a640b77fce0360b4b75be"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:09e4ebd11a0b333b1fca75c1004c76dc9719f3aaf83ae38c42358754d8a76148"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:a4536d132a8bbd05bf368fb802a264cb9828f6c85e4029a6a3670bc98ba97323"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6221c97d6d58f2370650cfe3d81408901a1951c99960e1df9f6f9f8482d73d08"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4223e8bdad41d846a84cda400cd538e1cdc63d98eb4d41951396bfdb88fd8ce9"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c07cdb2e02733e5f26b9b004a1a8b99814d175f8953fa9f59e4293de2b8e9787"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8714e958d01342d08e520ffec6c1acf66cdec83ce51302f9a1a6efb2f784d0b6"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f03541c25a77fb5445055e070b69d292c9818a9195ffbfd3962c0ad0da983e8"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:364c13ef48c9e2f8c2ea8ee0da5ea23db5e218f99e796cbf360a2a7cab511439"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:27ba58bbfd1b2b9da45bfe524e680e2bc747a1ca9738ee5aa18d8cbdcc08e5e6"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:92321582e59da185b76b2eca4488ea95e41800672e57107509d32ebf8ad550f8"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2da1d21a4f2675d5b8a749674993a65c0537e2066e7ab7b1a4a54ef0b3ac8efd"}, + {file = "pydantic_core-2.6.1.tar.gz", hash = "sha256:5b4efa68bcfa6f2b93624c6660b6cf4b7b4336d4225afb314254a0ed9c9f4153"}, ] [package.dependencies] @@ -2170,13 +2218,13 @@ full = ["numpy"] [[package]] name = "readme-renderer" -version = "40.0" +version = "41.0" description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" optional = false python-versions = ">=3.8" files = [ - {file = "readme_renderer-40.0-py3-none-any.whl", hash = "sha256:e18feb2a1e7706f2865b81ebb460056d93fb29d69daa10b223c00faa7bd9a00a"}, - {file = "readme_renderer-40.0.tar.gz", hash = "sha256:9f77b519d96d03d7d7dce44977ba543090a14397c4f60de5b6eb5b8048110aa4"}, + {file = "readme_renderer-41.0-py3-none-any.whl", hash = "sha256:a38243d5b6741b700a850026e62da4bd739edc7422071e95fd5c4bb60171df86"}, + {file = "readme_renderer-41.0.tar.gz", hash = "sha256:4f4b11e5893f5a5d725f592c5a343e0dc74f5f273cb3dcf8c42d9703a27073f7"}, ] [package.dependencies] @@ -2607,13 +2655,13 @@ files = [ [[package]] name = "shellingham" -version = "1.5.0.post1" +version = "1.5.3" description = "Tool to Detect Surrounding Shell" optional = false python-versions = ">=3.7" files = [ - {file = "shellingham-1.5.0.post1-py2.py3-none-any.whl", hash = "sha256:368bf8c00754fd4f55afb7bbb86e272df77e4dc76ac29dbcbb81a59e9fc15744"}, - {file = "shellingham-1.5.0.post1.tar.gz", hash = "sha256:823bc5fb5c34d60f285b624e7264f4dda254bc803a3774a147bf99c0e3004a28"}, + {file = "shellingham-1.5.3-py2.py3-none-any.whl", hash = 
"sha256:419c6a164770c9c7cfcaeddfacb3d31ac7a8db0b0f3e9c1287679359734107e9"}, + {file = "shellingham-1.5.3.tar.gz", hash = "sha256:cb4a6fec583535bc6da17b647dd2330cf7ef30239e05d547d99ae3705fd0f7f8"}, ] [[package]] diff --git a/requirements.txt b/requirements.txt index 7801854..18c91b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ certifi==2023.7.22 ; python_full_version >= "3.8.1" and python_version < "3.12" cffi==1.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") charset-normalizer==3.2.0 ; python_full_version >= "3.8.1" and python_version < "3.12" cleo==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -click==8.1.6 ; python_full_version >= "3.8.1" and python_version < "3.12" +click==8.1.7 ; python_full_version >= "3.8.1" and python_version < "3.12" cmake==3.27.2 ; python_full_version >= "3.8.1" and python_version < "3.12" colorama==0.4.6 ; python_full_version >= "3.8.1" and python_version < "3.12" and (os_name == "nt" or platform_system == "Windows") crashtest==0.4.1 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -37,7 +37,7 @@ more-itertools==10.1.0 ; python_full_version >= "3.8.1" and python_version < "3. msgpack==1.0.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ninja==1.11.1 ; python_full_version >= "3.8.1" and python_version < "3.12" numpy==1.24.4 ; python_full_version >= "3.8.1" and python_version < "3.12" -orjson==3.9.4 ; python_full_version >= "3.8.1" and python_version < "3.12" +orjson==3.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" packaging==23.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pexpect==4.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pkginfo==1.9.6 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -46,13 +46,13 @@ platformdirs==3.10.0 ; python_full_version >= "3.8.1" and python_version < "3.12 poetry-core==1.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry-plugin-export==1.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry==1.5.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -protobuf==4.24.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +protobuf==4.24.1 ; python_full_version >= "3.8.1" and python_version < "3.12" psutil==5.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ptyprocess==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pycparser==2.21 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") -pydantic-core==2.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +pydantic-core==2.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pydantic-settings==2.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" -pydantic==2.1.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +pydantic==2.2.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pyproject-hooks==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" python-dotenv==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pywin32-ctypes==0.2.2 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "win32" @@ -66,7 +66,7 @@ rpds-py==0.9.2 ; python_full_version >= "3.8.1" and python_version < "3.12" safetensors==0.3.2 ; python_full_version >= "3.8.1" and python_version < "3.12" secretstorage==3.3.3 ; 
python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" sentencepiece==0.1.99 ; python_full_version >= "3.8.1" and python_version < "3.12" -shellingham==1.5.0.post1 ; python_full_version >= "3.8.1" and python_version < "3.12" +shellingham==1.5.3 ; python_full_version >= "3.8.1" and python_version < "3.12" six==1.16.0 ; python_full_version >= "3.8.1" and python_version < "3.12" sniffio==1.3.0 ; python_full_version >= "3.8.1" and python_version < "3.12" sse-starlette==1.6.5 ; python_full_version >= "3.8.1" and python_version < "3.12" From 28c6925665eaff7f5dab8971a9bb7269b62eb71e Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 01:09:39 +0900 Subject: [PATCH 08/18] Docker image update --- Dockerfile | 55 ++++++++------------ Dockerfile.compressed | 36 +++++++++++++ docker-compose.persistent.yml | 30 ++--------- docker-compose.yml | 29 ++--------- llama_api/server/app_settings.py | 86 +++++++++++++++++++------------- llama_api/utils/dependency.py | 14 ++++-- main.py | 36 ++----------- readme.md | 18 ++++--- 8 files changed, 139 insertions(+), 165 deletions(-) create mode 100644 Dockerfile.compressed diff --git a/Dockerfile b/Dockerfile index 7757736..23d842d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,48 +1,33 @@ -### Dockerfile for Python 3.11.4 & CUDA 12.1.1 & Ubuntu 22.04 -### Approximately 5 ~ 10 minutes to build - # Select the required CUDA version. -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 -ENV PYTHON_VERSION="3.11.4" -ENV PYTHON_VERSION_SHORT="3.11" +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder -# Copy the necessary files. -COPY llama_api /app/llama_api -COPY pyproject.toml /app/pyproject.toml -COPY requirements.txt /app/requirements.txt -COPY main.py /app/main.py -COPY model_definitions.py /app/model_definitions.py +ENV PYTHON_VERSION="3.11.4" \ + PYTHON_VERSION_SHORT="3.11" \ + DEBIAN_FRONTEND=noninteractive \ + CUDA_DOCKER_ARCH=all # Install the necessary applications, and then install Python. -# Then, install the necessary Python packages(Dependencies). RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - zlib1g-dev \ - libncurses5-dev \ - libgdbm-dev \ - libnss3-dev \ - libssl-dev \ - libreadline-dev \ - libffi-dev \ - wget \ - git \ - libsqlite3-dev\ + git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \ && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \ && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \ && cd /tmp/Python-${PYTHON_VERSION} \ - && ./configure \ - && make \ - && make install \ + && ./configure && make && make install \ + && python3 -m pip install --upgrade pip --no-cache-dir \ + && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \ && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ - && python3 -m pip install --upgrade pip \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean \ - && rm -rf /tmp/* \ - && cd /app \ - && python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda - # Need to skip complie because GPU access to host is not supported when building image. 
+ && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \ + && nvcc --version + +# Copy the necessary files. +COPY llama_api /app/llama_api +COPY pyproject.toml requirements.txt main.py model_definitions.py /app/ + +# Install the necessary Python packages (dependencies). +RUN cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir # Set the working directory and start the server. +STOPSIGNAL SIGINT WORKDIR /app -ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] \ No newline at end of file +ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/Dockerfile.compressed b/Dockerfile.compressed new file mode 100644 index 0000000..2eac700 --- /dev/null +++ b/Dockerfile.compressed @@ -0,0 +1,36 @@ +### Compressed version of the Dockerfile. +### It is compressed by using a single RUN instruction to reduce the number of layers. +### However, it takes longer to build than the original Dockerfile +### because it cannot reuse the layer cache. + +# Select the required CUDA version. +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder + +ENV PYTHON_VERSION="3.11.4" \ + PYTHON_VERSION_SHORT="3.11" \ + DEBIAN_FRONTEND=noninteractive \ + CUDA_DOCKER_ARCH=all + +# Copy the necessary files. +COPY llama_api /app/llama_api +COPY pyproject.toml requirements.txt main.py model_definitions.py /app/ + +# Install the necessary applications, and then install Python. +RUN apt-get update && apt-get install -y --no-install-recommends \ + git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \ + && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \ + && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \ + && cd /tmp/Python-${PYTHON_VERSION} \ + && ./configure && make && make install \ + && python3 -m pip install --upgrade pip --no-cache-dir \ + && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \ + && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ + && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \ + && nvcc --version \ + && cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir + +# Set the working directory and start the server. +STOPSIGNAL SIGINT +WORKDIR /app +ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index f018d07..e09df69 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,7 +5,9 @@ volumes: services: llama-api: - image: cosogi/llama-api:230816 + image: c0sogi/llama-api:latest + cap_add: + - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 @@ -14,7 +16,6 @@ services: volumes: - llama-api-models:/app/models - ./model_definitions.py:/app/model_definitions.py - - ./main.py:/app/main.py ports: - 8000:8000 deploy: @@ -22,27 +23,4 @@ services: reservations: devices: - driver: nvidia - capabilities: [gpu] - - -# services: -# llama-api: -# build: -# context: .
-# dockerfile: Dockerfile -# entrypoint: ["python3", "-m", "main", "--port", "8000"] -# environment: -# - LLAMA_API_MAX_WORKERS=1 -# - LLAMA_API_API_KEY= -# volumes: -# - llama-api-models:/app/models -# - ./model_definitions.py:/app/model_definitions.py -# - ./main.py:/app/main.py -# ports: -# - 8000:8000 -# deploy: -# resources: -# reservations: -# devices: -# - driver: nvidia -# capabilities: [gpu] \ No newline at end of file + capabilities: [gpu] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 3c910ea..f2ae587 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,9 @@ version: '3' services: llama-api: - image: cosogi/llama-api:230816 + image: c0sogi/llama-api:latest + cap_add: + - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 @@ -22,27 +24,4 @@ services: reservations: devices: - driver: nvidia - capabilities: [gpu] - -# services: -# llama-api: -# build: -# context: . -# dockerfile: Dockerfile -# entrypoint: ["python3", "-m", "main", "--port", "8000"] -# environment: -# - MAX_WORKERS=1 -# volumes: -# - ./models:/app/models -# - ./llama_api:/app/llama_api -# - ./model_definitions.py:/app/model_definitions.py -# - ./main.py:/app/main.py -# - ./requirements.txt:/app/requirements.txt -# ports: -# - 8000:8000 -# deploy: -# resources: -# reservations: -# devices: -# - driver: nvidia -# capabilities: [gpu] \ No newline at end of file + capabilities: [gpu] \ No newline at end of file diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 4939932..1938c4a 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -73,12 +73,14 @@ def initialize_before_launch( skip_pytorch_install: bool = False, skip_tensorflow_install: bool = False, skip_compile: bool = False, + no_cache: bool = False, ) -> None: """Initialize the app""" + args = ["--no-cache-dir"] if no_cache else [] for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) if environ.get("LLAMA_API_XFORMERS") == "1": - install_package("xformers") + install_package("xformers", args=args) if install_packages: # Install all dependencies if not skip_compile: @@ -88,17 +90,17 @@ def initialize_before_launch( if not poetry.exists(): # Install poetry logger.warning(f"⚠️ Poetry not found: {poetry}") - install_package("poetry", force=True) + install_package("poetry", force=True, args=args) if not skip_pytorch_install: # Install pytorch - install_pytorch(force_cuda=force_cuda) + install_pytorch(force_cuda=force_cuda, args=args) if not skip_tensorflow_install: # Install tensorflow - install_tensorflow() + install_tensorflow(args=args) # Install all dependencies of our project and other repositories project_paths = [Path(".")] + list(Path("repositories").glob("*")) - install_all_dependencies(project_paths=project_paths) + install_all_dependencies(project_paths=project_paths, args=args) # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") @@ -151,6 +153,7 @@ def run( skip_pytorch_install: bool = False, skip_tensorflow_install: bool = False, skip_compile: bool = False, + no_cache: bool = False, environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( @@ -159,6 +162,7 @@ def run( skip_pytorch_install=skip_pytorch_install, skip_tensorflow_install=skip_tensorflow_install, skip_compile=skip_compile, + no_cache=no_cache, ) from uvicorn import Config as UvicornConfig @@ -177,43 +181,53 @@ def run( ).run() +parser = 
argparse.ArgumentParser() +parser.add_argument( + "-i", + "--install-pkgs", + action="store_true", + help="Install all required packages before running the server", +) +parser.add_argument( + "-fc", + "--force-cuda", + action="store_true", + help=( + "Force CUDA version of pytorch to be used " + "when installing pytorch. e.g. torch==2.0.1+cu118" + ), +) +parser.add_argument( + "-st", + "--skip-torch-install", + action="store_true", + help="Skip installing pytorch, if `install-pkgs` is set", +) +parser.add_argument( + "-stf", + "--skip-tf-install", + action="store_true", + help="Skip installing tensorflow, if `install-pkgs` is set", +) +parser.add_argument( + "-sc", + "--skip-compile", + action="store_true", + help="Skip compiling the shared library of LLaMA C++ code", +) +parser.add_argument( + "-nc", + "--no-cache-dir", + action="store_true", + help="Disable caching of pip installs, if `install-pkgs` is set", +) if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--install-pkgs", - action="store_true", - help="Install all required packages before running the server", - ) - parser.add_argument( - "--force-cuda", - action="store_true", - help=( - "Force CUDA version of pytorch to be used" - "when installing pytorch. e.g. torch==2.0.1+cu118" - ), - ) - parser.add_argument( - "--skip-torch-install", - action="store_true", - help="Skip installing pytorch, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-tf-install", - action="store_true", - help="Skip installing tensorflow, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-compile", - action="store_true", - help="Skip compiling the shared library of LLaMA C++ code", - ) - args = parser.parse_args() - initialize_before_launch( install_packages=args.install_pkgs, force_cuda=args.force_cuda, skip_pytorch_install=args.skip_torch_install, skip_tensorflow_install=args.skip_tf_install, skip_compile=args.skip_compile, + no_cache=args.no_cache_dir, ) diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 9fc4dd4..2ad2be2 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -224,12 +224,14 @@ def import_repository( sys.path.remove(str(disk_path)) -def install_package(package: str, *args, force: bool = False) -> bool: +def install_package( + package: str, force: bool = False, args: Optional[List[str]] = None +) -> bool: """Install a package with pip.""" if not force and is_package_available(package): return True return run_command( - [sys.executable, "-m", "pip", "install", package, *args], + [sys.executable, "-m", "pip", "install", package, *(args or [])], action="install", name=package, ) @@ -250,6 +252,7 @@ def install_pytorch( cuda_version: Optional[str] = Config.cuda_version, source: Optional[str] = Config.torch_source, force_cuda: bool = False, + args: Optional[List[str]] = None, ) -> bool: """Try to install Pytorch. If CUDA is available, install the CUDA version of torch. @@ -304,12 +307,14 @@ def install_pytorch( pip_install.append(f"torch{torch_version}") # Install torch + pip_install += args or [] return run_command(pip_install, action="install", name="PyTorch") def install_tensorflow( tensorflow_version: str = Config.tensorflow_version, source: Optional[str] = None, + args: Optional[List[str]] = None, ) -> bool: """Try to install TensorFlow.
@@ -332,6 +337,8 @@ def install_tensorflow( # If a source is specified, add it to the pip install command if source: pip_install += ["-f", source] + if args: + pip_install += args # Install TensorFlow return run_command(pip_install, action="install", name="TensorFlow") @@ -339,6 +346,7 @@ def install_tensorflow( def install_all_dependencies( project_paths: Optional[Union[List[Path], List[str]]] = None, + args: Optional[List[str]] = None, ) -> Optional[bool]: """Install every dependencies.""" pip_install = [sys.executable, "-m", "pip", "install", "-r"] @@ -356,7 +364,7 @@ def install_all_dependencies( ) continue result &= run_command( - pip_install + [requirements_path.as_posix()], + pip_install + [requirements_path.as_posix()] + (args or []), action="install", name="dependencies", ) diff --git a/main.py b/main.py index 15877df..49b1850 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,7 @@ -import argparse -from llama_api.server.app_settings import run +from llama_api.server.app_settings import run, parser if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument( "-p", "--port", @@ -18,36 +16,6 @@ default=1, help="Maximum number of process workers to run; default is 1", ) - parser.add_argument( - "-i", - "--install-pkgs", - action="store_true", - help="Install all required packages before running the server", - ) - parser.add_argument( - "-c", - "--force-cuda", - action="store_true", - help=( - "Force CUDA version of pytorch to be used" - "when installing pytorch. e.g. torch==2.0.1+cu118" - ), - ) - parser.add_argument( - "--skip-torch-install", - action="store_true", - help="Skip installing pytorch, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-tf-install", - action="store_true", - help="Skip installing tensorflow, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-compile", - action="store_true", - help="Skip compiling the shared library of LLaMA C++ code", - ) parser.add_argument( "-k", "--api-key", @@ -62,6 +30,7 @@ help="Apply xformers' memory-efficient optimizations", ) parser.add_argument( + "-ne", "--no-embed", action="store_true", help="Disable embeddings endpoint", @@ -75,6 +44,7 @@ skip_pytorch_install=args.skip_torch_install, skip_tensorflow_install=args.skip_tf_install, skip_compile=args.skip_compile, + no_cache=args.no_cache_dir, environs={ "LLAMA_API_MAX_WORKERS": str(args.max_workers), "LLAMA_API_XFORMERS": "1" if args.xformers else "", diff --git a/readme.md b/readme.md index 5a54926..75a9c36 100644 --- a/readme.md +++ b/readme.md @@ -27,22 +27,26 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-i] [-c] [--skip-torch-install] [--skip-tf-install] [--skip-compile] [-k API_KEY] [-x] [--no-embed] +usage: main.py [-h] [-i] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] options: -h, --help show this help message and exit + -i, --install-pkgs Install all required packages before running the server + -fc, --force-cuda Force CUDA version of pytorch to be used when installing pytorch. e.g.
torch==2.0.1+cu118 + -st, --skip-torch-install + Skip installing pytorch, if `install-pkgs` is set + -stf, --skip-tf-install + Skip installing tensorflow, if `install-pkgs` is set + -sc, --skip-compile Skip compiling the shared library of LLaMA C++ code + -nc, --no-cache-dir + Disable caching of pip installs, if `install-pkgs` is set -p PORT, --port PORT Port to run the server on; default is 8000 -w MAX_WORKERS, --max-workers MAX_WORKERS Maximum number of process workers to run; default is 1 - -i, --install-pkgs Install all required packages before running the server - -c, --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 - --skip-torch-install Skip installing pytorch, if `install-pkgs` is set - --skip-tf-install Skip installing tensorflow, if `install-pkgs` is set - --skip-compile Skip compiling the shared library of LLaMA C++ code -k API_KEY, --api-key API_KEY API key to use for the server -x, --xformers Apply xformers' memory-efficient optimizations - --no-embed Disable embeddings endpoint + -ne, --no-embed Disable embeddings endpoint ``` ### Unique features From 99ab6c94f2c800b9d349b427b376071eee604390 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 01:21:53 +0900 Subject: [PATCH 09/18] Fix typo --- Dockerfile | 1 + Dockerfile.compressed | 1 + docker-compose.persistent.yml | 2 +- docker-compose.yml | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 23d842d..3974d50 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,7 @@ COPY pyproject.toml requirements.txt main.py model_definitions.py /app/ RUN cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir # Set the working directory and start the server. +ENV PORT=${PORT:-8000} STOPSIGNAL SIGINT WORKDIR /app ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/Dockerfile.compressed b/Dockerfile.compressed index 2eac700..7892d41 100644 --- a/Dockerfile.compressed +++ b/Dockerfile.compressed @@ -31,6 +31,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir # Set the working directory and start the server. 
+ENV PORT=${PORT:-8000} STOPSIGNAL SIGINT WORKDIR /app ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index e09df69..9a762c2 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,7 +5,7 @@ volumes: services: llama-api: - image: c0sogi/llama-api:latest + image: cosogi/llama-api:latest cap_add: - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] diff --git a/docker-compose.yml b/docker-compose.yml index f2ae587..8236e02 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3' services: llama-api: - image: c0sogi/llama-api:latest + image: cosogi/llama-api:latest cap_add: - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] From 970b1c52ec09e1f6cd55c5b7bc2984f0778af320 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 11:20:41 +0900 Subject: [PATCH 10/18] Added docker option: NICENESS of process --- docker-compose.persistent.yml | 2 ++ docker-compose.yml | 2 ++ llama_api/modules/exllama.py | 23 +++++++++++------------ llama_api/server/routers/v1.py | 4 +++- llama_api/utils/errors.py | 4 ++++ 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index 9a762c2..cbc9ec3 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -8,6 +8,8 @@ services: image: cosogi/llama-api:latest cap_add: - IPC_LOCK + - SYS_NICE + - SYS_RESOURCE entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 diff --git a/docker-compose.yml b/docker-compose.yml index 8236e02..a370ce8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,8 @@ services: image: cosogi/llama-api:latest cap_add: - IPC_LOCK + - SYS_NICE + - SYS_RESOURCE entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 9fafffa..e86e0af 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -199,6 +199,16 @@ def _generate_text( text_buffer = "" # type: str byte_array = array("B") # type: array[int] byte_pattern = compile(r"<0x([0-9a-fA-F]{2})>") + logit_processors = ( + [ + processor + for processor in self.get_logit_processors( + settings=settings, encoder=self.encode + ) + ] + if cfg_mask is None + else None + ) or None for _ in range(settings.max_tokens): # If the generator was interrupted, stop the generation @@ -216,18 +226,7 @@ def _generate_text( else _gen_single_token_without_cfg( generator=generator, input_ids=generator.sequence[0][initial_len:], - logit_processors=( - [ - processor - for processor in self.get_logit_processors( - settings=settings, - encoder=self.encode, - ) - ] - if cfg_mask is None - else None - ) - or None, + logit_processors=logit_processors, ) ) # type: int diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 7e579d0..2b3b56c 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -220,10 +220,12 @@ def log_request_and_response( CreateCompletionRequest, CreateEmbeddingRequest, ], - status: Union[CompletionStatus, EmbeddingStatus], + status: Optional[Union[CompletionStatus, EmbeddingStatus]], state: Literal["Completed", "Interrupted"], ) -> None: """Log the request and response of the completion or embedding""" + if status is None: + return elapsed_time = time() - status.started_at log_messages: List[str] = [f"elapsed 
time: {elapsed_time: .1f}s"] body_without_prompt = body.model_dump( diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index e647c79..3d1cef0 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -247,6 +247,10 @@ async def custom_route_handler(self, request: Request) -> Response: status_code, error_message, ) = self.error_message_wrapper(error=error, body=body) + client = request.client.host if request.client else "UNKNOWN" + logger.error( + f'"{client} → {request.url.path}": {error_message["message"]}' + ) return JSONResponse( {"error": error_message}, status_code=status_code, From 95cb3761d51a64fb665c05d19caa178a8c255c38 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 11:29:52 +0900 Subject: [PATCH 11/18] Added feature: Tunnel through cloudflare --- llama_api/server/app_settings.py | 12 ++++++++++++ llama_api/utils/dependency.py | 2 +- main.py | 7 +++++++ readme.md | 8 ++++---- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 1938c4a..2d95232 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -3,6 +3,8 @@ from contextlib import asynccontextmanager from os import environ, getpid from pathlib import Path +from random import randint +from threading import Timer from typing import Dict, Literal, Optional from ..shared.config import Config @@ -154,6 +156,7 @@ def run( skip_tensorflow_install: bool = False, skip_compile: bool = False, no_cache: bool = False, + tunnel: bool = False, environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( @@ -170,6 +173,15 @@ def run( if environs: environ.update(environs) + if tunnel: + install_package("flask-cloudflared") + from flask_cloudflared import start_cloudflared + + thread = Timer( + 2, start_cloudflared, args=(port, randint(8100, 9000), None, None) + ) + thread.daemon = True + thread.start() UvicornServer( config=UvicornConfig( diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 2ad2be2..49fc279 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -228,7 +228,7 @@ def install_package( package: str, force: bool = False, args: Optional[List[str]] = None ) -> bool: """Install a package with pip.""" - if not force and is_package_available(package): + if not force and is_package_available(package.replace("-", "_")): return True return run_command( [sys.executable, "-m", "pip", "install", package, *(args or [])], diff --git a/main.py b/main.py index 49b1850..f23613b 100644 --- a/main.py +++ b/main.py @@ -35,6 +35,12 @@ action="store_true", help="Disable embeddings endpoint", ) + parser.add_argument( + "-t", + "--tunnel", + action="store_true", + help="Tunnel the server through cloudflared", + ) args = parser.parse_args() run( @@ -45,6 +51,7 @@ skip_tensorflow_install=args.skip_tf_install, skip_compile=args.skip_compile, no_cache=args.no_cache_dir, + tunnel=args.tunnel, environs={ "LLAMA_API_MAX_WORKERS": str(args.max_workers), "LLAMA_API_XFORMERS": "1" if args.xformers else "", diff --git a/readme.md b/readme.md index 1649447..f9646ee 100644 --- a/readme.md +++ b/readme.md @@ -32,19 +32,18 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-i--install-pkgs] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] +usage: main.py [-h] [-i] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] [-t] options: -h, --help show this help message and 
exit - -i, --install-pkgs Install all required packages before running the server + -i, --install-pkgs Install all required packages before running the server -fc, --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 -st, --skip-torch-install Skip installing pytorch, if `install-pkgs` is set -stf, --skip-tf-install Skip installing tensorflow, if `install-pkgs` is set -sc, --skip-compile Skip compiling the shared library of LLaMA C++ code - -nc, --no-cache-dir - Disable caching of pip installs, if `install-pkgs` is set + -nc, --no-cache-dir Disable caching of pip installs, if `install-pkgs` is set -p PORT, --port PORT Port to run the server on; default is 8000 -w MAX_WORKERS, --max-workers MAX_WORKERS Maximum number of process workers to run; default is 1 @@ -52,6 +51,7 @@ options: API key to use for the server -x, --xformers Apply xformers' memory-efficient optimizations -ne, --no-embed Disable embeddings endpoint + -t, --tunnel Tunnel the server through cloudflared ``` ### Unique features From 3a835e509f3e01e0033097b9dab820224c3d592b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 23:27:07 +0900 Subject: [PATCH 12/18] Refactored CLI args --- llama_api/modules/exllama.py | 7 +- llama_api/server/app_settings.py | 156 +++++++++++-------------- llama_api/server/routers/v1.py | 9 +- llama_api/shared/config.py | 195 ++++++++++++++++++++++++++++++- llama_api/utils/concurrency.py | 8 +- llama_api/utils/errors.py | 4 +- llama_api/utils/llama_cpp.py | 6 +- main.py | 63 +--------- readme.md | 25 ++-- tests/__init__.py | 0 tests/conftest.py | 40 ++++++- tests/test_cli.py | 57 +++++++++ 12 files changed, 396 insertions(+), 174 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_cli.py diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index e86e0af..548890e 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -6,7 +6,7 @@ from ..utils.logger import ApiLogger logger = ApiLogger(__name__) -if environ.get("LLAMA_API_XFORMERS") == "1": +if environ.get("XFORMERS") == "1": with logger.log_any_error( "xformers mode is enabled, but xformers is not installed", suppress_exception=True, @@ -140,6 +140,11 @@ def __del__(self) -> None: del self._generator self._generator = None logger.info("🗑️ ExllamaCompletionGenerator generator deleted") + if self._lora is not None: + getattr(self._lora, "__del__", lambda: None)() + del self._lora + self._lora = None + logger.info("🗑️ ExllamaCompletionGenerator lora deleted") if self._model is not None: self._model.free_unmanaged() del self._model diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 2d95232..26ab23b 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -1,18 +1,19 @@ -import argparse import platform from contextlib import asynccontextmanager from os import environ, getpid from pathlib import Path from random import randint +import sys from threading import Timer -from typing import Dict, Literal, Optional +from typing import Literal, Optional -from ..shared.config import Config +from ..shared.config import AppSettingsCliArgs, MainCliArgs, CliArg, Config from ..utils.dependency import ( get_installed_packages, get_poetry_executable, git_clone, + run_command, install_all_dependencies, install_package, install_pytorch, @@ -69,22 +70,57 @@ def set_priority( return False -def initialize_before_launch( - install_packages: bool = False, - force_cuda: bool = False, - 
skip_pytorch_install: bool = False, - skip_tensorflow_install: bool = False, - skip_compile: bool = False, - no_cache: bool = False, -) -> None: +def parse_cli_args_from_environ(prefix: str = "LLAMA_") -> None: + """Parse CLI arguments from environment variables""" + prefix = prefix.lower() + cli_args = { + cli_key: cli_arg + for cli_key, cli_arg in MainCliArgs.iterate_over_cli_args() + } # type: dict[str, CliArg] + prefix_length = len(prefix) + for key, value in environ.items(): + key = key.lower() + if not key.startswith(prefix): + continue + key = key[prefix_length:] + if key not in cli_args: + continue + cli_arg = cli_args[key] + if not isinstance(cli_arg, CliArg): + continue + cli_arg.value = cli_arg.type(value) + + +def initialize_before_launch() -> None: """Initialize the app""" - args = ["--no-cache-dir"] if no_cache else [] + args = MainCliArgs + install_packages = args.install_pkgs.value or False + upgrade_packages = args.upgrade_pkgs.value or False + force_cuda = args.force_cuda.value or False + skip_pytorch_install = args.skip_torch_install.value or False + skip_tensorflow_install = args.skip_tf_install.value or False + skip_compile = args.skip_compile.value or False + no_cache_dir = args.no_cache_dir.value or False + + # PIP arguments + pip_args = [] # type: list[str] + if no_cache_dir: + pip_args.append("--no-cache-dir") + if upgrade_packages: + pip_args.append("--upgrade") + # Upgrade pip + run_command( + [sys.executable, "-m", "pip", "install", "--upgrade", "pip"], + action="upgrade", + name="pip", + ) + + # Clone all repositories for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) - if environ.get("LLAMA_API_XFORMERS") == "1": - install_package("xformers", args=args) + + # Install packages if install_packages: - # Install all dependencies if not skip_compile: # Build the shared library of LLaMA C++ code build_shared_lib(logger=logger, force_cuda=force_cuda) @@ -92,17 +128,17 @@ def initialize_before_launch( if not poetry.exists(): # Install poetry logger.warning(f"⚠️ Poetry not found: {poetry}") - install_package("poetry", force=True, args=args) + install_package("poetry", force=True, args=pip_args) if not skip_pytorch_install: # Install pytorch - install_pytorch(force_cuda=force_cuda, args=args) + install_pytorch(force_cuda=force_cuda, args=pip_args) if not skip_tensorflow_install: # Install tensorflow - install_tensorflow(args=args) + install_tensorflow(args=pip_args) # Install all dependencies of our project and other repositories project_paths = [Path(".")] + list(Path("repositories").glob("*")) - install_all_dependencies(project_paths=project_paths, args=args) + install_all_dependencies(project_paths=project_paths, args=pip_args) # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") @@ -112,6 +148,8 @@ def initialize_before_launch( "If any packages are missing, " "use `--install-pkgs` option to install them." 
) + if MainCliArgs.xformers.value: + install_package("xformers", args=pip_args) @asynccontextmanager @@ -148,32 +186,17 @@ async def health(): return new_app -def run( - port: int, - install_packages: bool = False, - force_cuda: bool = False, - skip_pytorch_install: bool = False, - skip_tensorflow_install: bool = False, - skip_compile: bool = False, - no_cache: bool = False, - tunnel: bool = False, - environs: Optional[Dict[str, str]] = None, -) -> None: - initialize_before_launch( - install_packages=install_packages, - force_cuda=force_cuda, - skip_pytorch_install=skip_pytorch_install, - skip_tensorflow_install=skip_tensorflow_install, - skip_compile=skip_compile, - no_cache=no_cache, - ) +def run() -> None: + port = MainCliArgs.port.value + assert port is not None, "Port is not set" + if MainCliArgs.force_cuda.value: + environ["FORCE_CUDA"] = "1" + initialize_before_launch() from uvicorn import Config as UvicornConfig from uvicorn import Server as UvicornServer - if environs: - environ.update(environs) - if tunnel: + if MainCliArgs.tunnel.value: install_package("flask-cloudflared") from flask_cloudflared import start_cloudflared @@ -193,53 +216,6 @@ def run( ).run() -parser = argparse.ArgumentParser() -parser.add_argument( - "-i", - "--install-pkgs", - action="store_true", - help="Install all required packages before running the server", -) -parser.add_argument( - "-fc", - "--force-cuda", - action="store_true", - help=( - "Force CUDA version of pytorch to be used" - "when installing pytorch. e.g. torch==2.0.1+cu118" - ), -) -parser.add_argument( - "-st", - "--skip-torch-install", - action="store_true", - help="Skip installing pytorch, if `install-pkgs` is set", -) -parser.add_argument( - "-stf", - "--skip-tf-install", - action="store_true", - help="Skip installing tensorflow, if `install-pkgs` is set", -) -parser.add_argument( - "-sc", - "--skip-compile", - action="store_true", - help="Skip compiling the shared library of LLaMA C++ code", -) -parser.add_argument( - "-nc", - "--no-cache-dir", - action="store_true", - help="Disable caching of pip installs, if `install-pkgs` is set", -) if __name__ == "__main__": - args = parser.parse_args() - initialize_before_launch( - install_packages=args.install_pkgs, - force_cuda=args.force_cuda, - skip_pytorch_install=args.skip_torch_install, - skip_tensorflow_install=args.skip_tf_install, - skip_compile=args.skip_compile, - no_cache=args.no_cache_dir, - ) + AppSettingsCliArgs.load() + initialize_before_launch() diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 2b3b56c..de0901b 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -6,7 +6,6 @@ from contextlib import asynccontextmanager from dataclasses import dataclass, field from functools import partial -from os import environ from queue import Queue from random import choice from threading import Event @@ -37,6 +36,8 @@ from orjson import OPT_INDENT_2, dumps from sse_starlette.sse import EventSourceResponse +from llama_api.shared.config import MainCliArgs + from ...mixins.completion import CompletionStatus from ...schemas.api import ( ChatCompletion, @@ -72,8 +73,8 @@ ) logger = ApiLogger(__name__) router = APIRouter(prefix="/v1", route_class=RouteErrorHandler) -max_workers = int(environ.get("LLAMA_API_MAX_WORKERS", 1)) -max_semaphores = int(environ.get("LLAMA_API_MAX_SEMAPHORES", 1)) +max_workers = int(MainCliArgs.max_workers.value or 1) +max_semaphores = int(MainCliArgs.max_semaphores.value or 1) T = TypeVar("T") @@ -365,7 +366,7 @@ 
async def create_completion(request: Request, body: CreateCompletionRequest): async def create_embedding( request: Request, body: CreateEmbeddingRequest ) -> Embedding: - if not environ.get("LLAMA_API_EMBEDDINGS"): + if MainCliArgs.no_embed.value: raise PermissionError("Embeddings endpoint is disabled") assert body.model is not None, "Model is required" async with get_wix_with_semaphore(request, body.model) as wix: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 6469e76..55e24de 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,5 +1,21 @@ +import argparse +from dataclasses import dataclass, field +import json +from os import environ from pathlib import Path -from typing import Dict, List, Literal, Optional, Tuple +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + List, + Literal, + Optional, + Tuple, + TypeVar, + Union, +) try: from typing_extensions import TypedDict @@ -9,6 +25,181 @@ from typing import TypedDict # When dependencies aren't installed yet +T = TypeVar("T", bound=Union[str, int, float, bool]) + + +@dataclass +class CliArg(Generic[T]): + type: Callable[[Any], T] + help: str = "" + short_option: Optional[str] = None + action: Optional[str] = None + default: Optional[T] = None + value: Optional[T] = field(init=False) # ensure it's set in __post_init__ + + def __post_init__(self): + self.value = self.default + + +class CliArgHelper: + @classmethod + def get_parser(cls) -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + for cli_key, cli_arg in cls.iterate_over_cli_args(): + args = [] # type: List[str] + if cli_arg.short_option: + args.append(f"-{cli_arg.short_option.replace('_', '-')}") + args.append(f"--{cli_key.replace('_', '-')}") + kwargs = {} + if cli_arg.help: + kwargs["help"] = cli_arg.help + if cli_arg.default is not None: + kwargs["default"] = cli_arg.default + if cli_arg.action: + kwargs["action"] = cli_arg.action + else: + kwargs["type"] = cli_arg.type + parser.add_argument(*args, **kwargs) + return parser + + @classmethod + def load(cls) -> None: + cls.load_from_namespace(cls.get_parser().parse_args()) + + @classmethod + def load_from_namespace( + cls, args: argparse.Namespace, environ_key: str = "LLAMA_API_ARGS" + ) -> None: + cli_args = { + cli_key: cli_arg + for cli_key, cli_arg in cls.iterate_over_cli_args() + } + for cli_key, cli_arg in cli_args.items(): + cli_arg_value = getattr(args, cli_key, None) + if cli_arg_value is not None: + cli_arg.value = cli_arg.type(cli_arg_value) + environ[environ_key] = json.dumps( + { + cli_key.upper(): cli_arg.value + for cli_key, cli_arg in cli_args.items() + } + ) + + @classmethod + def load_from_environ(cls, environ_key: str = "LLAMA_API_ARGS") -> None: + json_str = environ.get(environ_key) + assert ( + json_str is not None + ), f"Environment variable {environ_key} not found" + cli_args = { + cli_key: cli_arg + for cli_key, cli_arg in cls.iterate_over_cli_args() + } # type: Dict[str, CliArg] + cli_arg_values = json.loads(json_str) # type: Dict[str, Any] + for cli_key, cli_value in cli_arg_values.items(): + cli_key = cli_key.lower() + if cli_key in cli_args and cli_value is not None: + cli_arg = cli_args[cli_key] + cli_arg.value = cli_arg.type(cli_value) + + @classmethod + def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: + for _cls in cls.__mro__: + for attr_name, attr_value in vars(_cls).items(): + if isinstance(attr_value, CliArg): + yield attr_name, attr_value + + +class AppSettingsCliArgs(CliArgHelper): + 
install_pkgs: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="i", + help="Install all required packages before running the server", + ) + force_cuda: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="c", + help="Force CUDA version of pytorch to be used " + "when installing pytorch. e.g. torch==2.0.1+cu118", + ) + skip_torch_install: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-torch", + help="Skip installing pytorch, if `install-pkgs` is set", + ) + skip_tf_install: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-tf", + help="Skip installing tensorflow, if `install-pkgs` is set", + ) + skip_compile: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-compile", + help="Skip compiling the shared library of LLaMA C++ code", + ) + no_cache_dir: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-cache", + help="Disable caching of pip installs, if `install-pkgs` is set", + ) + upgrade_pkgs: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="u", + help="Upgrade all packages before running the server", + ) + + +class MainCliArgs(AppSettingsCliArgs): + port: CliArg[int] = CliArg( + type=int, + short_option="p", + help="Port to run the server on; default is 8000", + default=8000, + ) + max_workers: CliArg[int] = CliArg( + type=int, + short_option="w", + help="Maximum number of process workers to run; default is 1", + default=1, + ) + max_semaphores: CliArg[int] = CliArg( + type=int, + short_option="s", + help="Maximum number of process semaphores to permit; default is 1", + default=1, + ) + api_key: CliArg[str] = CliArg( + type=str, + short_option="k", + help="API key to use for the server", + default=None, + ) + xformers: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="x", + help="Apply xformers' memory-efficient optimizations", + ) + no_embed: CliArg[bool] = CliArg( + type=bool, + action="store_true", + help="Disable embeddings endpoint", + ) + tunnel: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="t", + help="Tunnel the server through cloudflared", + ) + + class GitCloneArgs(TypedDict): git_path: str disk_path: str @@ -43,7 +234,7 @@ class Config: "exllama": GitCloneArgs( git_path="https://github.com/turboderp/exllama", disk_path="repositories/exllama", - options=["--recurse-submodules"], + options=["recurse-submodules"], ), "llama_cpp": GitCloneArgs( git_path="https://github.com/abetlen/llama-cpp-python", diff --git a/llama_api/utils/concurrency.py b/llama_api/utils/concurrency.py index 797a1be..0424202 100644 --- a/llama_api/utils/concurrency.py +++ b/llama_api/utils/concurrency.py @@ -10,6 +10,8 @@ from fastapi.concurrency import run_in_threadpool +from llama_api.shared.config import MainCliArgs + from ..server.app_settings import set_priority from ..utils.logger import ApiLogger from ..utils.process_pool import ProcessPool @@ -36,6 +38,8 @@ def init_process_pool(env_vars: Dict[str, str]) -> None: for key, value in env_vars.items(): environ[key] = value + MainCliArgs.load_from_environ() + def pool() -> ProcessPool: """Get the process pool, and initialize it if it's not initialized yet""" @@ -44,14 +48,14 @@ def pool() -> ProcessPool: if _pool is None: logger.info("Initializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), + max_workers=MainCliArgs.max_workers.value or 1, 
initializer=init_process_pool, initargs=(dict(environ),), ) elif not _pool.is_available: logger.critical("🚨 Process pool died. Reinitializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), + max_workers=MainCliArgs.max_workers.value or 1, initializer=init_process_pool, initargs=(dict(environ),), ) diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index 3d1cef0..8f367f2 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -1,5 +1,4 @@ from functools import cached_property -from os import environ from pathlib import Path from re import Match, Pattern, compile from typing import Callable, Coroutine, Dict, Optional, Tuple, Union @@ -14,6 +13,7 @@ CreateCompletionRequest, CreateEmbeddingRequest, ) +from ..shared.config import MainCliArgs from ..utils.logger import ApiLogger logger = ApiLogger(__name__) @@ -126,7 +126,7 @@ class RouteErrorHandler(APIRoute): ): ErrorResponseFormatters.model_not_found, } - api_key: Optional[str] = environ.get("LLAMA_API_API_KEY") or None + api_key: Optional[str] = MainCliArgs.api_key.value @cached_property def authorization(self) -> Optional[str]: diff --git a/llama_api/utils/llama_cpp.py b/llama_api/utils/llama_cpp.py index b1de480..fddd339 100644 --- a/llama_api/utils/llama_cpp.py +++ b/llama_api/utils/llama_cpp.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import List, Optional, Union +from ..shared.config import MainCliArgs from ..utils.dependency import install_package, run_command from ..utils.system import get_cuda_version @@ -193,9 +194,10 @@ def build_shared_lib( logger: Optional[Logger] = None, force_cuda: bool = False ) -> None: """Build the shared library for llama.cpp""" - global CMAKE_ARGS - if force_cuda or bool(environ.get("FORCE_CUDA", False)): + if force_cuda or bool( + environ.get("FORCE_CUDA", MainCliArgs.force_cuda.value) + ): assert get_cuda_version() is not None, "CUDA is not available" CMAKE_ARGS = CUBLAS_ARGS diff --git a/main.py b/main.py index f23613b..920896a 100644 --- a/main.py +++ b/main.py @@ -1,62 +1,7 @@ -from llama_api.server.app_settings import run, parser +from llama_api.server.app_settings import run +from llama_api.shared.config import MainCliArgs if __name__ == "__main__": - parser.add_argument( - "-p", - "--port", - type=int, - default=8000, - help="Port to run the server on; default is 8000", - ) - parser.add_argument( - "-w", - "--max-workers", - type=int, - default=1, - help="Maximum number of process workers to run; default is 1", - ) - parser.add_argument( - "-k", - "--api-key", - type=str, - default=None, - help="API key to use for the server", - ) - parser.add_argument( - "-x", - "--xformers", - action="store_true", - help="Apply xformers' memory-efficient optimizations", - ) - parser.add_argument( - "-ne", - "--no-embed", - action="store_true", - help="Disable embeddings endpoint", - ) - parser.add_argument( - "-t", - "--tunnel", - action="store_true", - help="Tunnel the server through cloudflared", - ) - - args = parser.parse_args() - run( - port=args.port, - install_packages=args.install_pkgs, - force_cuda=args.force_cuda, - skip_pytorch_install=args.skip_torch_install, - skip_tensorflow_install=args.skip_tf_install, - skip_compile=args.skip_compile, - no_cache=args.no_cache_dir, - tunnel=args.tunnel, - environs={ - "LLAMA_API_MAX_WORKERS": str(args.max_workers), - "LLAMA_API_XFORMERS": "1" if args.xformers else "", - "LLAMA_API_API_KEY": args.api_key or "", - "FORCE_CUDA": "1" if args.force_cuda else "", - 
"LLAMA_API_EMBEDDINGS": "1" if not args.no_embed else "", - }, - ) + MainCliArgs.load() + run() diff --git a/readme.md b/readme.md index f9646ee..3a74c87 100644 --- a/readme.md +++ b/readme.md @@ -32,26 +32,31 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-i] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] [-t] +usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-s MAX_SEMAPHORES] [-k API_KEY] [-x] [--no-embed] [-t] [-i] [-c] [--no-torch] [--no-tf] [--no-compile] [--no-cache] [-u] options: -h, --help show this help message and exit - -i, --install-pkgs Install all required packages before running the server - -fc, --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 - -st, --skip-torch-install - Skip installing pytorch, if `install-pkgs` is set - -stf, --skip-tf-install - Skip installing tensorflow, if `install-pkgs` is set - -sc, --skip-compile Skip compiling the shared library of LLaMA C++ code - -nc, --no-cache-dir Disable caching of pip installs, if `install-pkgs` is set -p PORT, --port PORT Port to run the server on; default is 8000 -w MAX_WORKERS, --max-workers MAX_WORKERS Maximum number of process workers to run; default is 1 + -s MAX_SEMAPHORES, --max-semaphores MAX_SEMAPHORES + Maximum number of process semaphores to permit; default is 1 -k API_KEY, --api-key API_KEY API key to use for the server -x, --xformers Apply xformers' memory-efficient optimizations - -ne, --no-embed Disable embeddings endpoint + --no-embed Disable embeddings endpoint -t, --tunnel Tunnel the server through cloudflared + -i, --install-pkgs Install all required packages before running the server + -c, --force-cuda Force CUDA version of pytorch to be used when installing pytorch. e.g. 
torch==2.0.1+cu118 + --no-torch, --skip-torch-install + Skip installing pytorch, if `install-pkgs` is set + --no-tf, --skip-tf-install + Skip installing tensorflow, if `install-pkgs` is set + --no-compile, --skip-compile + Skip compiling the shared library of LLaMA C++ code + --no-cache, --no-cache-dir + Disable caching of pip installs, if `install-pkgs` is set + -u, --upgrade-pkgs Upgrade all packages before running the server ``` ### Unique features diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py index e96096e..2839cfd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,16 @@ -from asyncio import gather +from asyncio import gather, iscoroutinefunction +from contextlib import ExitStack from datetime import datetime +from functools import wraps import importlib +from types import ModuleType import unittest from os import environ from pathlib import Path from re import compile, sub from typing import ( TYPE_CHECKING, + Any, AsyncIterator, Dict, Iterable, @@ -16,6 +20,7 @@ Tuple, Union, ) +from unittest.mock import MagicMock, patch from orjson import loads from llama_api.schemas.api import ( @@ -42,6 +47,33 @@ EndPoint = Literal["completions", "chat/completions"] +def patch_module(mocking_module: ModuleType): + def decorator(func): + @wraps(func) + async def async_wrapper(*args, **kwargs): + patches = [] + for name, attr in mocking_module.__dict__.items(): + # Mock all functions and classes + if callable(attr) or isinstance(attr, (type,)): + patches.append( + patch.object(mocking_module, name, MagicMock()) + ) + + with ExitStack() as stack: + for p in patches: + stack.enter_context(p) + + if iscoroutinefunction(func): + return await func(*args, **kwargs) + return func(*args, **kwargs) + + if iscoroutinefunction(func): + return async_wrapper + return func + + return decorator + + class TestLlamaAPI(unittest.TestCase): ggml_model: str = "orca-mini-3b.ggmlv3.q4_0.bin" ggml_path: Path = Config.project_root / Path(f"models/ggml/{ggml_model}") @@ -65,7 +97,7 @@ def setUpClass(cls): "fastapi.testclient" ).TestClient # type: Type[TestClient] cls.app = create_app_llama_cpp() - environ["LLAMA_API_MAX_WORKERS"] = "2" + environ["LLAMA_API_ARGS"] = '{"MAX_WORKERS": 1}' @classmethod def tearDownClass(cls): @@ -91,6 +123,7 @@ async def arequest_completion( self, model_names: Union[List[str], Tuple[str, ...]], endpoints: Union[EndPoint, Iterable[EndPoint]], + **kwargs: Any, ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: async with self.AsyncClient( app=self.app, base_url="http://localhost", timeout=None @@ -111,6 +144,7 @@ async def arequest_completion( else endpoints ), ), + **kwargs, ) async def get_models( @@ -133,6 +167,7 @@ async def submit_streaming_requests( self, client: "AsyncClient", model_and_endpoints: Iterable[Tuple[str, EndPoint]], + **kwargs: Any, ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: async def send_request( model: str, endpoint: EndPoint @@ -146,6 +181,7 @@ async def send_request( {"messages": self.messages} if endpoint.startswith("chat") else {"prompt": self.prompt}, + kwargs, ), headers={"Content-Type": "application/json"}, ) as response: diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..64ccf5e --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,57 @@ +import json +from os import environ +import unittest +from llama_api.shared.config import AppSettingsCliArgs, MainCliArgs + + +class 
TestCLIArgs(unittest.TestCase): + def test_cli_args(self): + parser = MainCliArgs.get_parser() + + # Check that `--install-pkgs` is inherited from `MainCliArgs` + args = parser.parse_args(["--install-pkgs", "--port", "8080"]) + AppSettingsCliArgs.load_from_namespace(args) + self.assertFalse(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertFalse(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 8000) + + # Check that both `--force-cuda` and `--port` are inherited from `MainCliArgs` # noqa + args = parser.parse_args(["--port", "9000", "--force-cuda"]) + MainCliArgs.load_from_namespace(args) + self.assertTrue(AppSettingsCliArgs.force_cuda.value) + self.assertFalse(AppSettingsCliArgs.install_pkgs.value) + self.assertTrue(MainCliArgs.force_cuda.value) + self.assertFalse(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 9000) + + # Set `--install-pkgs` to `False` and check that it is applied + args.install_pkgs = True + AppSettingsCliArgs.load_from_namespace(args) + self.assertTrue(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertTrue(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 9000) + + environ["LLAMA_CLI_ARGS"] = json.dumps( + {"force_cuda": False, "port": 7000} + ) + AppSettingsCliArgs.load_from_environ("LLAMA_CLI_ARGS") + self.assertFalse(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertFalse(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 9000) + + MainCliArgs.load_from_environ("LLAMA_CLI_ARGS") + self.assertFalse(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertFalse(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 7000) + + +if __name__ == "__main__": + unittest.main() From 8bc40191b759f57e948d5307557648e783426bd1 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 21 Aug 2023 00:40:50 +0900 Subject: [PATCH 13/18] Fixed CLI bugs --- llama_api/server/app_settings.py | 30 ++++--------- llama_api/shared/config.py | 76 ++++++++++++++++++++++---------- llama_api/utils/dependency.py | 15 +++++++ main.py | 2 - tests/test_cli.py | 16 ++++--- 5 files changed, 86 insertions(+), 53 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 26ab23b..11ce418 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -7,12 +7,13 @@ from threading import Timer from typing import Literal, Optional -from ..shared.config import AppSettingsCliArgs, MainCliArgs, CliArg, Config +from ..shared.config import AppSettingsCliArgs, MainCliArgs, Config from ..utils.dependency import ( get_installed_packages, get_poetry_executable, git_clone, + git_pull, run_command, install_all_dependencies, install_package, @@ -70,27 +71,6 @@ def set_priority( return False -def parse_cli_args_from_environ(prefix: str = "LLAMA_") -> None: - """Parse CLI arguments from environment variables""" - prefix = prefix.lower() - cli_args = { - cli_key: cli_arg - for cli_key, cli_arg in MainCliArgs.iterate_over_cli_args() - } # type: dict[str, CliArg] - prefix_length = len(prefix) - for key, value in environ.items(): - key = 
key.lower() - if not key.startswith(prefix): - continue - key = key[prefix_length:] - if key not in cli_args: - continue - cli_arg = cli_args[key] - if not isinstance(cli_arg, CliArg): - continue - cli_arg.value = cli_arg.type(value) - - def initialize_before_launch() -> None: """Initialize the app""" args = MainCliArgs @@ -101,6 +81,9 @@ def initialize_before_launch() -> None: skip_tensorflow_install = args.skip_tf_install.value or False skip_compile = args.skip_compile.value or False no_cache_dir = args.no_cache_dir.value or False + logger.info( + "Starting Application with CLI args:", environ["LLAMA_API_ARGS"] + ) # PIP arguments pip_args = [] # type: list[str] @@ -118,6 +101,8 @@ def initialize_before_launch() -> None: # Clone all repositories for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) + if upgrade_packages: + git_pull(git_clone_args["git_path"]) # Install packages if install_packages: @@ -187,6 +172,7 @@ async def health(): def run() -> None: + MainCliArgs.load() port = MainCliArgs.port.value assert port is not None, "Port is not set" if MainCliArgs.force_cuda.value: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 55e24de..cde26f2 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -43,28 +43,16 @@ def __post_init__(self): class CliArgHelper: @classmethod - def get_parser(cls) -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - for cli_key, cli_arg in cls.iterate_over_cli_args(): - args = [] # type: List[str] - if cli_arg.short_option: - args.append(f"-{cli_arg.short_option.replace('_', '-')}") - args.append(f"--{cli_key.replace('_', '-')}") - kwargs = {} - if cli_arg.help: - kwargs["help"] = cli_arg.help - if cli_arg.default is not None: - kwargs["default"] = cli_arg.default - if cli_arg.action: - kwargs["action"] = cli_arg.action - else: - kwargs["type"] = cli_arg.type - parser.add_argument(*args, **kwargs) - return parser - - @classmethod - def load(cls) -> None: - cls.load_from_namespace(cls.get_parser().parse_args()) + def load( + cls, + environ_key: str = "LLAMA_API_ARGS", + environ_key_prefix: str = "LLAMA_API_", + ) -> None: + """Load CLI arguments from environment variables and/or CLI arguments""" + cls.load_from_namespace(cls.parser.parse_args()) + cls.load_from_environ( + environ_key=environ_key, environ_key_prefix=environ_key_prefix + ) @classmethod def load_from_namespace( @@ -82,11 +70,16 @@ def load_from_namespace( { cli_key.upper(): cli_arg.value for cli_key, cli_arg in cli_args.items() + if cli_arg.value is not None } ) @classmethod - def load_from_environ(cls, environ_key: str = "LLAMA_API_ARGS") -> None: + def load_from_environ( + cls, + environ_key: str = "LLAMA_API_ARGS", + environ_key_prefix: str = "LLAMA_API_", + ) -> None: json_str = environ.get(environ_key) assert ( json_str is not None @@ -102,6 +95,20 @@ def load_from_environ(cls, environ_key: str = "LLAMA_API_ARGS") -> None: cli_arg = cli_args[cli_key] cli_arg.value = cli_arg.type(cli_value) + environ_key_prefix = environ_key_prefix.lower() + prefix_length = len(environ_key_prefix) + for key, value in environ.items(): + key = key.lower() + if not key.startswith(environ_key_prefix): + continue + key = key[prefix_length:] + if key not in cli_args: + continue + cli_arg = cli_args[key] + if not isinstance(cli_arg, CliArg): + continue + cli_arg.value = cli_arg.type(value) + @classmethod def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: for _cls in cls.__mro__: @@ -109,6 +116,29 @@ def 
iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: if isinstance(attr_value, CliArg): yield attr_name, attr_value + @classmethod + @property + def parser(cls) -> argparse.ArgumentParser: + """Parse CLI arguments from environment variables, + and return the parser""" + arg_parser = argparse.ArgumentParser() + for cli_key, cli_arg in cls.iterate_over_cli_args(): + args = [] # type: List[str] + args.append(f"--{cli_key.replace('_', '-')}") + if cli_arg.short_option: + args.append(f"-{cli_arg.short_option.replace('_', '-')}") + kwargs = {} + if cli_arg.help: + kwargs["help"] = cli_arg.help + if cli_arg.default is not None: + kwargs["default"] = cli_arg.default + if cli_arg.action: + kwargs["action"] = cli_arg.action + else: + kwargs["type"] = cli_arg.type + arg_parser.add_argument(*args, **kwargs) + return arg_parser + class AppSettingsCliArgs(CliArgHelper): install_pkgs: CliArg[bool] = CliArg( diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 49fc279..ca6bcde 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -76,6 +76,21 @@ def git_clone( return None +def git_pull( + git_path: str, + options: Optional[List[str]] = None, +) -> Optional[bool]: + """Pull a git repository.""" + if Path(git_path).exists(): + return run_command( + ["git", "pull", git_path, *(options or [])], + action="pull", + name=f"{git_path}", + try_emoji="📥", + ) + return None + + def get_mac_major_version_string(): # platform.mac_ver() returns a tuple ('10.16', ('', '', ''), 'x86_64') # Split the version string on '.' and take the first two components diff --git a/main.py b/main.py index 920896a..39948b7 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,5 @@ from llama_api.server.app_settings import run -from llama_api.shared.config import MainCliArgs if __name__ == "__main__": - MainCliArgs.load() run() diff --git a/tests/test_cli.py b/tests/test_cli.py index 64ccf5e..621ef1e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,9 @@ class TestCLIArgs(unittest.TestCase): def test_cli_args(self): - parser = MainCliArgs.get_parser() + parser = MainCliArgs.parser + environ_key = "LLAMA_CLI_ARGS" + environ_key_prefix = "LLAMA_" # Check that `--install-pkgs` is inherited from `MainCliArgs` args = parser.parse_args(["--install-pkgs", "--port", "8080"]) @@ -35,23 +37,25 @@ def test_cli_args(self): self.assertTrue(MainCliArgs.install_pkgs.value) self.assertEqual(MainCliArgs.port.value, 9000) - environ["LLAMA_CLI_ARGS"] = json.dumps( - {"force_cuda": False, "port": 7000} - ) - AppSettingsCliArgs.load_from_environ("LLAMA_CLI_ARGS") + environ[environ_key] = json.dumps({"force_cuda": False, "port": 7000}) + AppSettingsCliArgs.load_from_environ(environ_key, environ_key_prefix) self.assertFalse(AppSettingsCliArgs.force_cuda.value) self.assertTrue(AppSettingsCliArgs.install_pkgs.value) self.assertFalse(MainCliArgs.force_cuda.value) self.assertTrue(MainCliArgs.install_pkgs.value) self.assertEqual(MainCliArgs.port.value, 9000) - MainCliArgs.load_from_environ("LLAMA_CLI_ARGS") + MainCliArgs.load_from_environ(environ_key, environ_key_prefix) self.assertFalse(AppSettingsCliArgs.force_cuda.value) self.assertTrue(AppSettingsCliArgs.install_pkgs.value) self.assertFalse(MainCliArgs.force_cuda.value) self.assertTrue(MainCliArgs.install_pkgs.value) self.assertEqual(MainCliArgs.port.value, 7000) + environ[f"{environ_key_prefix}MAX_SEMAPHORES"] = "100" + MainCliArgs.load_from_environ(environ_key, environ_key_prefix) + self.assertEqual(MainCliArgs.max_semaphores.value, 100) + 
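Note: the prefix-based override exercised at the end of this test (`LLAMA_MAX_SEMAPHORES=100`) boils down to scanning the environment for keys that start with the prefix and coercing their values with the declared argument type. A standalone sketch of that lookup, using illustrative names rather than the project's classes:

```python
# Sketch of prefix-based environment overrides (illustrative, not project code):
# "<PREFIX><ARG_NAME>" environment entries win over previously loaded values.
import os
from typing import Callable, Dict


def apply_env_overrides(
    values: Dict[str, object],
    types: Dict[str, Callable[[str], object]],
    prefix: str = "LLAMA_",
) -> Dict[str, object]:
    """Return a copy of `values` with matching environment overrides applied."""
    out = dict(values)
    plen = len(prefix)
    for key, raw in os.environ.items():
        if not key.lower().startswith(prefix.lower()):
            continue
        name = key.lower()[plen:]
        if name in types:
            out[name] = types[name](raw)
    return out


if __name__ == "__main__":
    os.environ["LLAMA_MAX_SEMAPHORES"] = "100"
    print(apply_env_overrides({"max_semaphores": 1}, {"max_semaphores": int}))
    # {'max_semaphores': 100}
```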
if __name__ == "__main__": unittest.main() From 6e5db5c483dd2743beaa0a88a1fae4f2b31a6945 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 21 Aug 2023 13:50:10 +0900 Subject: [PATCH 14/18] Added doc strings --- llama_api/server/app_settings.py | 10 ++-- llama_api/shared/config.py | 78 ++++++++++++++++++++++---------- llama_api/utils/errors.py | 2 +- readme.md | 29 ++++++------ 4 files changed, 76 insertions(+), 43 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 11ce418..c3a2a78 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -75,21 +75,21 @@ def initialize_before_launch() -> None: """Initialize the app""" args = MainCliArgs install_packages = args.install_pkgs.value or False - upgrade_packages = args.upgrade_pkgs.value or False + upgrade = args.upgrade.value or False force_cuda = args.force_cuda.value or False skip_pytorch_install = args.skip_torch_install.value or False skip_tensorflow_install = args.skip_tf_install.value or False skip_compile = args.skip_compile.value or False no_cache_dir = args.no_cache_dir.value or False - logger.info( - "Starting Application with CLI args:", environ["LLAMA_API_ARGS"] + print( + "Starting Application with CLI args:" + str(environ["LLAMA_API_ARGS"]) ) # PIP arguments pip_args = [] # type: list[str] if no_cache_dir: pip_args.append("--no-cache-dir") - if upgrade_packages: + if upgrade: pip_args.append("--upgrade") # Upgrade pip run_command( @@ -101,7 +101,7 @@ def initialize_before_launch() -> None: # Clone all repositories for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) - if upgrade_packages: + if upgrade: git_pull(git_clone_args["git_path"]) # Install packages diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index cde26f2..b211ed8 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -22,10 +22,19 @@ except ImportError: + print("Failed to import typing_extensions, using TypedDict from typing") from typing import TypedDict # When dependencies aren't installed yet T = TypeVar("T", bound=Union[str, int, float, bool]) +DEFAULT_ENVIRON_KEY = "LLAMA_API_ARGS" +DEFAULT_ENVIRON_KEY_PREFIX = "LLAMA_API_" + + +class GitCloneArgs(TypedDict): + git_path: str + disk_path: str + options: Optional[List[str]] @dataclass @@ -42,13 +51,16 @@ def __post_init__(self): class CliArgHelper: + """Helper class for loading CLI arguments from environment variables + or a namespace of CLI arguments""" + @classmethod def load( cls, - environ_key: str = "LLAMA_API_ARGS", - environ_key_prefix: str = "LLAMA_API_", + environ_key: str = DEFAULT_ENVIRON_KEY, + environ_key_prefix: str = DEFAULT_ENVIRON_KEY_PREFIX, ) -> None: - """Load CLI arguments from environment variables and/or CLI arguments""" + """Load CLI arguments from environment variables and CLI arguments""" cls.load_from_namespace(cls.parser.parse_args()) cls.load_from_environ( environ_key=environ_key, environ_key_prefix=environ_key_prefix @@ -56,38 +68,58 @@ def load( @classmethod def load_from_namespace( - cls, args: argparse.Namespace, environ_key: str = "LLAMA_API_ARGS" + cls, + args: argparse.Namespace, + environ_key: Optional[str] = DEFAULT_ENVIRON_KEY, ) -> None: + """Load CLI arguments from a namespace, + and set an environment variable with the CLI arguments as JSON""" + # Get all defined CLI arguments within the class cli_args = { cli_key: cli_arg for cli_key, cli_arg in cls.iterate_over_cli_args() } + + # Parse the CLI arguments and set the value of 
the CLI argument + # if it's not None, otherwise keep the default value for cli_key, cli_arg in cli_args.items(): cli_arg_value = getattr(args, cli_key, None) if cli_arg_value is not None: cli_arg.value = cli_arg.type(cli_arg_value) - environ[environ_key] = json.dumps( - { - cli_key.upper(): cli_arg.value - for cli_key, cli_arg in cli_args.items() - if cli_arg.value is not None - } - ) + + # Set an environment variable with the CLI arguments as JSON, + # if an environment variable key is provided + if environ_key is not None: + environ[environ_key] = json.dumps( + { + cli_key.upper(): cli_arg.value + for cli_key, cli_arg in cli_args.items() + } + ) @classmethod def load_from_environ( cls, - environ_key: str = "LLAMA_API_ARGS", - environ_key_prefix: str = "LLAMA_API_", + environ_key: str = DEFAULT_ENVIRON_KEY, + environ_key_prefix: Optional[str] = DEFAULT_ENVIRON_KEY_PREFIX, ) -> None: + """Load JSON CLI arguments from an environment variable. + If an environment variable key prefix is provided, + load CLI arguments from environment variables which start with + the prefix.""" json_str = environ.get(environ_key) assert ( json_str is not None ), f"Environment variable {environ_key} not found" + # Get all defined CLI arguments within the class cli_args = { cli_key: cli_arg for cli_key, cli_arg in cls.iterate_over_cli_args() } # type: Dict[str, CliArg] + + # Parse the CLI arguments from the JSON string + # and set the value of the CLI argument if it's not None, + # otherwise keep the default value cli_arg_values = json.loads(json_str) # type: Dict[str, Any] for cli_key, cli_value in cli_arg_values.items(): cli_key = cli_key.lower() @@ -95,6 +127,10 @@ def load_from_environ( cli_arg = cli_args[cli_key] cli_arg.value = cli_arg.type(cli_value) + # Parse the CLI arguments from environment variables, + # which start with the prefix + if environ_key_prefix is None: + return environ_key_prefix = environ_key_prefix.lower() prefix_length = len(environ_key_prefix) for key, value in environ.items(): @@ -111,6 +147,9 @@ def load_from_environ( @classmethod def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: + """Get all CLI arguments defined in the class, + including inherited classes. 
Yields a tuple of + (attribute name, CliArg)""" for _cls in cls.__mro__: for attr_name, attr_value in vars(_cls).items(): if isinstance(attr_value, CliArg): @@ -119,8 +158,7 @@ def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: @classmethod @property def parser(cls) -> argparse.ArgumentParser: - """Parse CLI arguments from environment variables, - and return the parser""" + """Return an argument parser with all CLI arguments""" arg_parser = argparse.ArgumentParser() for cli_key, cli_arg in cls.iterate_over_cli_args(): args = [] # type: List[str] @@ -178,11 +216,11 @@ class AppSettingsCliArgs(CliArgHelper): short_option="-no-cache", help="Disable caching of pip installs, if `install-pkgs` is set", ) - upgrade_pkgs: CliArg[bool] = CliArg( + upgrade: CliArg[bool] = CliArg( type=bool, action="store_true", short_option="u", - help="Upgrade all packages before running the server", + help="Upgrade all packages and repositories before running the server", ) @@ -230,12 +268,6 @@ class MainCliArgs(AppSettingsCliArgs): ) -class GitCloneArgs(TypedDict): - git_path: str - disk_path: str - options: Optional[List[str]] - - class Config: """Configuration for the project""" diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index 8f367f2..0f1356f 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -126,7 +126,7 @@ class RouteErrorHandler(APIRoute): ): ErrorResponseFormatters.model_not_found, } - api_key: Optional[str] = MainCliArgs.api_key.value + api_key: Optional[str] = MainCliArgs.api_key.value or None @cached_property def authorization(self) -> Optional[str]: diff --git a/readme.md b/readme.md index 3a74c87..65b29df 100644 --- a/readme.md +++ b/readme.md @@ -32,31 +32,32 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-s MAX_SEMAPHORES] [-k API_KEY] [-x] [--no-embed] [-t] [-i] [-c] [--no-torch] [--no-tf] [--no-compile] [--no-cache] [-u] +usage: main.py [-h] [--port PORT] [--max-workers MAX_WORKERS] [--max-semaphores MAX_SEMAPHORES] [--api-key API_KEY] [--xformers] [--no-embed] [--tunnel] [--install-pkgs] [--force-cuda] [--skip-torch-install] [--skip-tf-install] [--skip-compile] + [--no-cache-dir] [--upgrade] options: -h, --help show this help message and exit - -p PORT, --port PORT Port to run the server on; default is 8000 - -w MAX_WORKERS, --max-workers MAX_WORKERS + --port PORT, -p PORT Port to run the server on; default is 8000 + --max-workers MAX_WORKERS, -w MAX_WORKERS Maximum number of process workers to run; default is 1 - -s MAX_SEMAPHORES, --max-semaphores MAX_SEMAPHORES + --max-semaphores MAX_SEMAPHORES, -s MAX_SEMAPHORES Maximum number of process semaphores to permit; default is 1 - -k API_KEY, --api-key API_KEY + --api-key API_KEY, -k API_KEY API key to use for the server - -x, --xformers Apply xformers' memory-efficient optimizations + --xformers, -x Apply xformers' memory-efficient optimizations --no-embed Disable embeddings endpoint - -t, --tunnel Tunnel the server through cloudflared - -i, --install-pkgs Install all required packages before running the server - -c, --force-cuda Force CUDA version of pytorch to be used when installing pytorch. e.g. torch==2.0.1+cu118 - --no-torch, --skip-torch-install + --tunnel, -t Tunnel the server through cloudflared + --install-pkgs, -i Install all required packages before running the server + --force-cuda, -c Force CUDA version of pytorch to be used when installing pytorch. e.g. 
torch==2.0.1+cu118 + --skip-torch-install, --no-torch Skip installing pytorch, if `install-pkgs` is set - --no-tf, --skip-tf-install + --skip-tf-install, --no-tf Skip installing tensorflow, if `install-pkgs` is set - --no-compile, --skip-compile + --skip-compile, --no-compile Skip compiling the shared library of LLaMA C++ code - --no-cache, --no-cache-dir + --no-cache-dir, --no-cache Disable caching of pip installs, if `install-pkgs` is set - -u, --upgrade-pkgs Upgrade all packages before running the server + --upgrade, -u Upgrade all packages and repositories before running the server ``` ### Unique features From 419f138c9336e5aca823d40a9aab0b463befd9e6 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 01:19:00 +0900 Subject: [PATCH 15/18] Refactored `get_event_publisher` --- llama_api/server/app_settings.py | 4 +- llama_api/server/routers/v1.py | 131 +++++++++++++------------------ llama_api/utils/completions.py | 23 +++++- 3 files changed, 79 insertions(+), 79 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index c3a2a78..5212a3b 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -102,7 +102,9 @@ def initialize_before_launch() -> None: for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) if upgrade: - git_pull(git_clone_args["git_path"]) + git_pull( + git_clone_args["git_path"], options=["--recurse-submodules"] + ) # Install packages if install_packages: diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index de0901b..54c2d0f 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -8,7 +8,6 @@ from functools import partial from queue import Queue from random import choice -from threading import Event from time import time from typing import ( Any, @@ -28,9 +27,7 @@ Semaphore, create_memory_object_stream, get_cancelled_exc_class, - move_on_after, ) -from anyio.streams.memory import MemoryObjectSendStream from fastapi import APIRouter, Request from fastapi.concurrency import iterate_in_threadpool, run_in_threadpool from orjson import OPT_INDENT_2, dumps @@ -41,9 +38,7 @@ from ...mixins.completion import CompletionStatus from ...schemas.api import ( ChatCompletion, - ChatCompletionChunk, Completion, - CompletionChunk, CreateChatCompletionRequest, CreateCompletionRequest, CreateEmbeddingRequest, @@ -119,17 +114,29 @@ async def get_wix_with_semaphore( ) -> AsyncGenerator[int, None]: """Get the worker index (wix) for the key and acquire the semaphore""" global wix_metas + + # Get the worker index (wix) with the lowest rank + # If the rank is -2, then the worker is processing the same model + # If the rank is -1, then the worker is not processing any model + # If the rank is greater than or equal to 0, then the worker is processing + # a different model worker_ranks = [ get_worker_rank(wix_meta, request_key) for wix_meta in wix_metas ] min_rank = min(worker_ranks) + + # Choose a random worker index (wix) with the lowest rank candidates = [i for i, rank in enumerate(worker_ranks) if rank == min_rank] if not candidates: raise LookupError("No available wix") wix_meta = wix_metas[choice(candidates)] + + # Acquire the semaphore for the worker index (wix) async with wix_meta.semaphore: + # If client is already gone, then ignore the request if await request.is_disconnected(): return + # Reserve the worker, it is now processing the request wix_meta.processed_key = request_key yield wix_meta.wix @@ -145,60 +152,6 @@ def 
validate_item_type(item: Any, type: Type[T]) -> T: return item -def get_text_from_completion( - completion: Union[Completion, ChatCompletion] -) -> str: - """Get the generated text from a completion""" - if "text" in completion["choices"][0]: - return completion["choices"][0]["text"] - return completion["choices"][0]["message"]["content"] - - -def get_text_from_chunk( - chunk: Union[CompletionChunk, ChatCompletionChunk] -) -> str: - """Get the generated text from a completion chunk""" - if "text" in chunk["choices"][0]: - return chunk["choices"][0]["text"] - return chunk["choices"][0]["delta"].get("content", "") - - -async def get_event_publisher( - request: Request, - body: Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - ], - inner_send_chan: MemoryObjectSendStream[bytes], - task: "Task[CompletionStatus]", - interrupt_signal: Event, - iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], -) -> None: - """Publish Server-Sent-Events (SSE) to the client""" - is_interrupted = False # type: bool - async with inner_send_chan: - try: - async for chunk in iterate_in_threadpool(iterator): - await inner_send_chan.send(b"data: " + dumps(chunk) + b"\n\n") - if await request.is_disconnected(): - raise get_cancelled_exc_class()() - await inner_send_chan.send(b"data: [DONE]\n\n") - except get_cancelled_exc_class(): - is_interrupted = True - with move_on_after(1, shield=True): - raise - finally: - # Cancel the producer task and set event, - # so the completion task can be stopped - interrupt_signal.set() - state = "Interrupted" if is_interrupted else "Completed" - try: - status = await wait_for(task, timeout=3) - log_request_and_response(body, status, state) - finally: - task.cancel() - - def get_streaming_iterator( queue: Queue, first_response: Optional[Dict] = None, @@ -225,8 +178,11 @@ def log_request_and_response( state: Literal["Completed", "Interrupted"], ) -> None: """Log the request and response of the completion or embedding""" + # If the status is None, then the request has been interrupted if status is None: return + + # Measure the elapsed time, and get information about the request elapsed_time = time() - status.started_at log_messages: List[str] = [f"elapsed time: {elapsed_time: .1f}s"] body_without_prompt = body.model_dump( @@ -240,18 +196,20 @@ def log_request_and_response( if isinstance(status, EmbeddingStatus) and isinstance( body, CreateEmbeddingRequest ): + # Embedding usage is the number of characters in the input + # and the number of chunks in the embedding embed_usage = { "input_chars": len(body.input), "embedding_chunks": len(status.embedding["data"]) if status.embedding else 0, - } + } # type: Dict[str, int] log_messages.append(f"embedding chunks: {embed_usage}") embed_log = { "request": body_without_prompt, "input": body.input, "embedding": status.embedding, - } + } # type: Dict[str, Any] logger.info( f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})" ) @@ -264,6 +222,7 @@ def log_request_and_response( tokens_per_second = tokens / elapsed_time log_messages.append(f"tokens: {tokens}({tokens_per_second: .1f}tok/s)") if isinstance(body, CreateChatCompletionRequest): + # Log the chat completion status chat_log = { "request": body_without_prompt, "chat": [ @@ -276,15 +235,16 @@ def log_request_and_response( "content": status.generated_text, } ], - } + } # type: Dict[str, Any] elif isinstance(body, CreateCompletionRequest): + # Log the text completion status chat_log = { "request": body_without_prompt, "prompt": { "user": body.prompt, 
"assistant": status.generated_text, }, - } + } # type: Dict[str, Any] else: return logger.info(f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})") @@ -316,22 +276,39 @@ async def create_chat_completion_or_completion( ) if body.stream: send_chan, recv_chan = create_memory_object_stream(10) + chunk_iterator = get_streaming_iterator( + queue=queue, + first_response=validate_item_type( + await run_in_threadpool(queue.get), type=dict + ), + ) + + async def get_event_publisher() -> None: + # Publish Server-Sent-Events (SSE) to the client + is_interrupted = False # type: bool + send = send_chan.send + try: + async for chunk in iterate_in_threadpool(chunk_iterator): + await send(b"data: " + dumps(chunk) + b"\n\n") + await send(b"data: [DONE]\n\n") + except get_cancelled_exc_class(): + is_interrupted = True + finally: + send_chan.close() + # Cancel the producer task and set event, + # so the completion task can be stopped + interrupt_signal.set() + state = "Interrupted" if is_interrupted else "Completed" + try: + status = await wait_for(task, timeout=3) + log_request_and_response(body, status, state) + finally: + task.cancel() + return EventSourceResponse( recv_chan, - data_sender_callable=partial( - get_event_publisher, - request=request, - body=body, - inner_send_chan=send_chan, - task=task, - interrupt_signal=interrupt_signal, - iterator=get_streaming_iterator( # type: ignore - queue=queue, - first_response=validate_item_type( - await run_in_threadpool(queue.get), type=dict - ), - ), - ), + data_sender_callable=get_event_publisher, + ping=5, ) else: # Cancel the producer task and set event, diff --git a/llama_api/utils/completions.py b/llama_api/utils/completions.py index 6b696f3..4459aae 100644 --- a/llama_api/utils/completions.py +++ b/llama_api/utils/completions.py @@ -1,5 +1,5 @@ from time import time -from typing import Iterator, Literal, Optional +from typing import Iterator, Literal, Optional, Union from uuid import uuid4 from ..schemas.api import ( @@ -327,3 +327,24 @@ def convert_text_completion_chunks_to_chat( ) ], ) + + +# ==== GET TEXT FROM COMPLETION ==== # + + +def get_text_from_completion( + completion: Union[Completion, ChatCompletion] +) -> str: + """Get the generated text from a completion""" + if "text" in completion["choices"][0]: + return completion["choices"][0]["text"] + return completion["choices"][0]["message"]["content"] + + +def get_text_from_chunk( + chunk: Union[CompletionChunk, ChatCompletionChunk] +) -> str: + """Get the generated text from a completion chunk""" + if "text" in chunk["choices"][0]: + return chunk["choices"][0]["text"] + return chunk["choices"][0]["delta"].get("content", "") From 427d553782b9b63b36757bcb04e4ed5db24eda07 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 01:19:32 +0900 Subject: [PATCH 16/18] Added `required` option to `FunctionCallMixin` --- llama_api/mixins/function_call.py | 151 ++++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 40 deletions(-) diff --git a/llama_api/mixins/function_call.py b/llama_api/mixins/function_call.py index 7b7ae6e..5f7da00 100644 --- a/llama_api/mixins/function_call.py +++ b/llama_api/mixins/function_call.py @@ -34,7 +34,7 @@ # whitespace is constrained to a single space char # to prevent model "running away" in # whitespace. Also maybe improves generation quality? -SPACE_RULE: str = '" "?' +SPACE_RULE: str = "([ \t\n])?" 
PRIMITIVE_RULES: Dict[str, str] = { "boolean": '("true" | "false") space', @@ -60,7 +60,14 @@ "boolean", "number", "integer", "string", "null", "object", "array" ] SchemaKey = Literal[ - "type", "oneOf", "anyOf", "const", "enum", "properties", "items" + "type", + "oneOf", + "anyOf", + "const", + "enum", + "properties", + "items", + "required", ] @@ -104,24 +111,43 @@ def invoke_function_call_streaming( "function call is not implemented for this model" ) - @staticmethod + @classmethod + def from_json_schema( + cls, + schema: Union[Dict[SchemaKey, Any], str], + prop_order: Optional[Dict[str, int]] = None, + ) -> str: + """Parse a JSON schema into a BNF grammar""" + if isinstance(schema, str): + schema = json.loads(schema) + assert isinstance(schema, dict), "schema must be valid JSON" + self = cls() + self._prop_order = prop_order or {} + self._rules = {"space": SPACE_RULE} + self._visit(schema, "") + return self._format_grammar() + + @classmethod @overload def from_function_calls( + cls, function_calls: FunctionCall, prop_order: Optional[Dict[str, int]] = None, ) -> str: ... - @staticmethod + @classmethod @overload def from_function_calls( + cls, function_calls: Iterable[FunctionCall], prop_order: Optional[Dict[str, int]] = None, ) -> List[str]: ... - @staticmethod + @classmethod def from_function_calls( + cls, function_calls: Union[FunctionCall, Iterable[FunctionCall]], prop_order: Optional[Dict[str, int]] = None, ) -> Union[str, List[str]]: @@ -135,7 +161,7 @@ def from_function_calls( bnfs = [] # type: List[str] for function_call in function_calls: - self = FunctionCallMixin() + self = cls() self._prop_order = prop_order or {} self._rules = {"space": SPACE_RULE} parameters = function_call.to_dict().get("parameters") @@ -144,24 +170,27 @@ def from_function_calls( bnfs.append(self._format_grammar()) return bnfs if return_as_list else bnfs[0] - @staticmethod + @classmethod @overload def from_functions( + cls, functions: Callable, prop_order: Optional[Dict[str, int]] = None, ) -> str: ... - @staticmethod + @classmethod @overload def from_functions( + cls, functions: Iterable[Callable], prop_order: Optional[Dict[str, int]] = None, ) -> List[str]: ... 
- @staticmethod + @classmethod def from_functions( + cls, functions: Union[Callable, Iterable[Callable]], prop_order: Optional[Dict[str, int]] = None, ) -> Union[str, List[str]]: @@ -258,7 +287,7 @@ def _visit(self, schema: Dict[SchemaKey, Any], name: str) -> str: if "oneOf" in schema or "anyOf" in schema: # This is a union type - rule: str = " | ".join( + rule = " | ".join( ( self._visit(alt_schema, f'{name}{"-" if name else ""}{i}') for i, alt_schema in enumerate( @@ -282,24 +311,47 @@ def _visit(self, schema: Dict[SchemaKey, Any], name: str) -> str: return self._add_rule(rule_name, rule) elif schema_type == "object" and "properties" in schema: - # TODO: `required` keyword + required_properties = set( + schema.get("required", schema["properties"].keys()) + ) + if not required_properties: + raise ValueError( + "Object schema must have at least one required property if `required` is specified" + ) prop_order = self._prop_order prop_pairs = sorted( schema["properties"].items(), - # sort by position in prop_order (if specified) then by key key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), ) - rule = '"{" space' - for i, (prop_name, prop_schema) in enumerate(prop_pairs): + rule_parts = [] # type: List[str] + optional_rule_parts = [] # type: List[str] + first_property = True # type: bool + + for prop_name, prop_schema in prop_pairs: prop_rule_name = self._visit( prop_schema, f'{name}{"-" if name else ""}{prop_name}' ) - if i > 0: - rule += ' "," space' - rule += rf' {self._format_literal(prop_name)} space ":" space {prop_rule_name}' - rule += ' "}" space' + prop_str = rf'{self._format_literal(prop_name)} space ":" space {prop_rule_name}' + + if prop_name in required_properties: + if not first_property: + prop_str = rf'"," space {prop_str}' + rule_parts.append(prop_str) + first_property = False + else: + optional_rule_parts.append(prop_str) + + for i, optional_str in enumerate(optional_rule_parts): + if i == 0 and not rule_parts: + # if no required properties + combined_str = rf"({optional_str})?" + else: + combined_str = rf'("," space {optional_str})?' 
+ rule_parts.append(combined_str) + # Combine rules + rule = '"{" space ' + " ".join(rule_parts) + ' "}" space' return self._add_rule(rule_name, rule) elif schema_type == "array" and "items" in schema: @@ -326,7 +378,7 @@ def _format_grammar(self): if __name__ == "__main__": - # from llama_cpp import LlamaGrammar, Llama + from repositories.llama_cpp.llama_cpp import LlamaGrammar, Llama # Define a python function and parse it into a grammar def get_current_weather( @@ -340,32 +392,51 @@ def get_current_weather( ["fahrenheit", "celsius"], ], source: Annotated[ - str, + Optional[str], "The source of the weather information", ["openweathermap", "weatherapi"], ] = "openweathermap", ): """Get the current weather in a given location""" - model_path = "C:/Users/sdml/Desktop/orca-mini-3b.ggmlv3.q4_0.bin" + model_path = r"models\ggml\orca-mini-3b.ggmlv3.q4_0.bin" grammar: str = FunctionCallMixin.from_functions(get_current_weather) + # print(f"Grammar:\n{grammar}") + + json_schema = { + "type": "object", + "properties": { + "location": {"type": "string"}, + "unit": { + "type": "string", + "enum": ["fahrenheit", "celsius"], + }, + "source": { + "type": "string", + "enum": ["openweathermap", "weatherapi"], + }, + }, + "required": ["location", "unit"], + } # type: Dict[SchemaKey, Any] + grammar = FunctionCallMixin.from_json_schema(json_schema) print(f"Grammar:\n{grammar}") - # llama_grammar = LlamaGrammar.from_string(grammar, verbose=False) - # llm = Llama(model_path) - # llm.grammar = llama_grammar - # for city in ( - # "London", - # "Paris", - # "New York", - # "Berlin", - # "Tokyo", - # "Sydney", - # "Moscow", - # "Beijing", - # "Cairo", - # "Rome", - # ): - # print(llm(prompt=f"### User: What is the weather in {city} today? ### Assistant:")["choices"][0]["text"]) # type: ignore - - # # Output: - # # { "location": "London", "source": "openweathermap","unit" : "celsius"} + + llama_grammar = LlamaGrammar.from_string(grammar, verbose=False) + llm = Llama(model_path) + for city in ( + "London", + "Paris", + "New York", + "Berlin", + "Tokyo", + "Sydney", + "Moscow", + "Beijing", + "Cairo", + "Rome", + ): + output = llm(prompt=f"### User: What is the weather in {city} today? 
### Assistant:", grammar=llama_grammar)["choices"][0]["text"] # type: ignore + print(json.loads(output)) + + # Output: + # { "location": "London", "source": "openweathermap","unit" : "celsius"} From b1018430ad805ca054dd92f3e3bd6e4fea11b847 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 21:12:50 +0900 Subject: [PATCH 17/18] ci fail resolve - 1 --- llama_api/shared/config.py | 5 ++--- tests/test_cli.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index b211ed8..7b2e791 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -61,7 +61,7 @@ def load( environ_key_prefix: str = DEFAULT_ENVIRON_KEY_PREFIX, ) -> None: """Load CLI arguments from environment variables and CLI arguments""" - cls.load_from_namespace(cls.parser.parse_args()) + cls.load_from_namespace(cls.get_parser().parse_args()) cls.load_from_environ( environ_key=environ_key, environ_key_prefix=environ_key_prefix ) @@ -156,8 +156,7 @@ def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: yield attr_name, attr_value @classmethod - @property - def parser(cls) -> argparse.ArgumentParser: + def get_parser(cls) -> argparse.ArgumentParser: """Return an argument parser with all CLI arguments""" arg_parser = argparse.ArgumentParser() for cli_key, cli_arg in cls.iterate_over_cli_args(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 621ef1e..df9ef72 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ class TestCLIArgs(unittest.TestCase): def test_cli_args(self): - parser = MainCliArgs.parser + parser = MainCliArgs.get_parser() environ_key = "LLAMA_CLI_ARGS" environ_key_prefix = "LLAMA_" From 7ea7b6cfce1a16edcce7ac45ae8f6bc1688fba61 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 21:20:59 +0900 Subject: [PATCH 18/18] ci fail resolve - 2 --- llama_api/mixins/completion.py | 2 +- llama_api/shared/config.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_api/mixins/completion.py b/llama_api/mixins/completion.py index 26f03f9..f5404bc 100644 --- a/llama_api/mixins/completion.py +++ b/llama_api/mixins/completion.py @@ -24,7 +24,7 @@ class CompletionStatus: class CompletionMixin: """A mixin for modules that support completion generation.""" - _completion_status: Optional[defaultdict[str, CompletionStatus]] = None + _completion_status: Optional["defaultdict[str, CompletionStatus]"] = None @property def completion_status(self) -> Dict[str, CompletionStatus]: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 7b2e791..3a9c2e2 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -295,11 +295,11 @@ class Config: "exllama": GitCloneArgs( git_path="https://github.com/turboderp/exllama", disk_path="repositories/exllama", - options=["recurse-submodules"], + options=None, ), "llama_cpp": GitCloneArgs( git_path="https://github.com/abetlen/llama-cpp-python", disk_path="repositories/llama_cpp", - options=None, + options=["--recurse-submodules"], ), }