From 729f154501833e7bc23f1cde66a3f45c494fa8ce Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 21:41:03 +0900 Subject: [PATCH 01/18] Refactored code --- llama_api/modules/exllama.py | 215 +++++++++++++++++++++----------- llama_api/modules/llama_cpp.py | 20 ++- llama_api/server/pools/llama.py | 13 +- llama_api/server/routers/v1.py | 2 +- llama_api/utils/completions.py | 22 +--- 5 files changed, 175 insertions(+), 97 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index bd868c3..2c7e81f 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -2,6 +2,7 @@ # flake8: noqa from gc import collect from os import environ +from time import time from ..utils.logger import ApiLogger @@ -21,6 +22,7 @@ Iterable, Iterator, List, + Literal, Optional, Tuple, Union, @@ -32,13 +34,13 @@ from torch.nn.functional import log_softmax from ..logits.base import BaseLogitProcessor -from ..schemas.models import ExllamaModel -from ..utils.completions import ( - make_chat_completion, - make_chat_completion_chunk, - make_completion, - make_completion_chunk, +from ..schemas.api import ( + ChatCompletion, + ChatCompletionChunk, + Completion, + CompletionChunk, ) +from ..schemas.models import ExllamaModel from ..utils.dependency import import_repository from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator @@ -440,32 +442,41 @@ def generate_completion_with_streaming( ) -> Iterator["CompletionChunk"]: completion_id = settings.completion_id model = self.model_name - last_token: Optional[str] = None - generated_text: str = "" + generated_text = "" # type: str for token in _generate_text_with_streaming( self, prompt=prompt, settings=settings ): generated_text += token - if last_token is not None: - yield make_completion_chunk( - id=completion_id, - model=model, - text=last_token, - finish_reason=None, - ) - last_token = token - yield make_completion_chunk( - id=completion_id, - model=model, - text=last_token if last_token is not None else "", - finish_reason="length" - if self._completion_status.get( - completion_id, - _encode(self.tokenizer, generated_text).shape[1], - ) - >= settings.max_tokens - else "stop", - ) + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": token, + "index": 0, + "logprobs": None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + } def generate_completion( self, prompt: str, settings: "TextGenerationSettings" @@ -476,20 +487,31 @@ def generate_completion( self, prompt=prompt, settings=settings ) ) - n_prompt_tokens = _encode(self.tokenizer, prompt).shape[1] - n_completion_tokens = self._completion_status.get( + prompt_tokens = _encode(self.tokenizer, prompt).shape[1] + completion_tokens = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) - return make_completion( - id=completion_id, - model=self.model_name, - text=generated_text, - prompt_tokens=n_prompt_tokens, - completion_tokens=n_completion_tokens, - finish_reason="length" - if n_completion_tokens >= settings.max_tokens - else "stop", - ) + return { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": 
self.model_name, + "choices": [ + { + "text": generated_text, + "index": 0, + "logprobs": None, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } def generate_chat_completion_with_streaming( self, @@ -499,31 +521,52 @@ def generate_chat_completion_with_streaming( completion_id = settings.completion_id prompt = self.convert_messages_into_prompt(messages, settings=settings) model = self.model_name - last_token: Optional[str] = None - generated_text: str = "" + generated_text = "" # type: str + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None, + } + ], + } for token in _generate_text_with_streaming( self, prompt=prompt, settings=settings ): generated_text += token - if last_token is not None: - yield make_chat_completion_chunk( - id=completion_id, - model=model, - content=last_token, - finish_reason=None, - ) - last_token = token - yield make_chat_completion_chunk( - id=completion_id, - model=model, - content=last_token if last_token is not None else "", - finish_reason="length" - if self._completion_status.get( - completion_id, - _encode(self.tokenizer, generated_text).shape[1], - ) - else "stop", - ) + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"content": token}, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {}, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + } def generate_chat_completion( self, @@ -541,16 +584,29 @@ def generate_chat_completion( completion_tokens = self._completion_status.get( completion_id, _encode(self.tokenizer, generated_text).shape[1] ) - return make_chat_completion( - id=completion_id, - model=self.model_name, - content=generated_text, - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - finish_reason="length" - if completion_tokens >= settings.max_tokens - else "stop", - ) + return { + "id": completion_id, + "object": "chat.completion", + "created": int(time()), + "model": self.model_name, + "choices": [ + { + "message": { + "role": "assistant", + "content": generated_text, + }, + "index": 0, + "finish_reason": _get_finish_reason( + self, settings, completion_id, generated_text + ), + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } def encode(self, text: str) -> List[int]: assert self._tokenizer is not None, "Tokenizer is not initialized" @@ -618,3 +674,20 @@ def _encode( ids = result[0] if isinstance(result, tuple) else result assert isinstance(ids, Tensor) return ids + + +def _get_finish_reason( + cg: ExllamaCompletionGenerator, + settings: TextGenerationSettings, + completion_id: str, + generated_text: str, +) -> Literal["length", "stop"]: + return ( + "length" + if cg._completion_status.get( + completion_id, + _encode(cg.tokenizer, generated_text).shape[1], + ) + >= settings.max_tokens + else "stop" + ) diff --git a/llama_api/modules/llama_cpp.py 
b/llama_api/modules/llama_cpp.py index d88e3e4..4652d63 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,6 +1,7 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature from typing import ( + Callable, Iterator, List, Optional, @@ -27,22 +28,33 @@ logger = ApiLogger(__name__) logger.info("🦙 llama-cpp-python repository found!") -build_shared_lib(logger=logger) with import_repository( git_path="https://github.com/abetlen/llama-cpp-python", disk_path="repositories/llama_cpp", options=["--recurse-submodules"], ): + build_shared_lib(logger=logger) from repositories.llama_cpp import llama_cpp +class LogitsProcessorList( + List[Callable[[List[int], List[float]], List[float]]] +): + def __call__( + self, input_ids: List[int], scores: List[float] + ) -> List[float]: + for processor in self: + scores = processor(input_ids, scores) + return scores + + def _create_completion( client: llama_cpp.Llama, prompt: str, stream: bool, settings: TextGenerationSettings, ) -> Union[Completion, Iterator[CompletionChunk]]: - logit_processors = llama_cpp.LogitsProcessorList( + logit_processors = LogitsProcessorList( [ processor.without_torch for processor in BaseCompletionGenerator.get_logit_processors( @@ -53,7 +65,7 @@ def _create_completion( ) ] ) - return client.create_completion( # type: ignore + return client.create_completion( stream=stream, prompt=prompt, max_tokens=settings.max_tokens, @@ -69,7 +81,7 @@ def _create_completion( mirostat_mode=settings.mirostat_mode, mirostat_tau=settings.mirostat_tau, mirostat_eta=settings.mirostat_eta, - logits_processor=logit_processors if logit_processors else None, + logits_processor=logit_processors if logit_processors else None, # type: ignore # noqa: E501 stop=settings.stop, ) diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index c3f5756..6df7fec 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -70,10 +70,15 @@ def get_model_names() -> List[str]: def get_model(model_name: str) -> "BaseLLMModel": """Get a model from the model_definitions.py file""" - with logger.log_any_error( - f"Error getting model: {model_name}", exc_info=None - ): - return getattr(model_definitions, model_name) + try: + llm_model = getattr(model_definitions, model_name) + assert isinstance( + llm_model, BaseLLMModel + ), f"Not a LLM model: {model_name}" + return llm_model + except Exception as e: + logger.error(e) + raise ValueError(f"Model path does not exist: {model_name}") def get_completion_generator( diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index b2aeb47..a9a4fba 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -178,7 +178,7 @@ async def get_event_publisher( CreateChatCompletionRequest, CreateCompletionRequest, ], - inner_send_chan: MemoryObjectSendStream, + inner_send_chan: MemoryObjectSendStream[bytes], task: "Task[None]", interrupt_signal: Event, iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], diff --git a/llama_api/utils/completions.py b/llama_api/utils/completions.py index d12b8ed..4353b81 100644 --- a/llama_api/utils/completions.py +++ b/llama_api/utils/completions.py @@ -1,5 +1,5 @@ from time import time -from typing import TYPE_CHECKING, Iterator, Literal, Optional +from typing import Iterator, Literal, Optional from uuid import uuid4 from ..schemas.api import ( @@ -70,13 +70,10 @@ def make_chat_completion( def make_chat_completion_from_json( - json_data: dict, 
# type: ignore + json_data: dict, index: int = 0, ) -> ChatCompletion: """Make ChatCompletion from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: ChatCompletion = json_data # type: ignore usage = json_data.get("usage") if usage is None: usage = CompletionUsage( @@ -146,12 +143,9 @@ def make_chat_completion_chunk( def make_chat_completion_chunk_from_json( - json_data: dict, # type: ignore + json_data: dict, ) -> ChatCompletionChunk: """Make ChatCompletionChunk from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: ChatCompletionChunk = json_data # type: ignore delta = json_data["choices"][0]["delta"] function_call = delta.get("function_call") if function_call: @@ -203,12 +197,9 @@ def make_completion_chunk( def make_completion_chunk_from_json( - json_data: dict, # type: ignore + json_data: dict, ) -> CompletionChunk: """Make CompletionChunk from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: CompletionChunk = json_data # type: ignore choice = json_data["choices"][0] return make_completion_chunk( id=json_data["id"], @@ -259,13 +250,10 @@ def make_completion( def make_completion_from_json( - json_data: dict, # type: ignore + json_data: dict, index: int = 0, ) -> Completion: """Make Completion from json data(dict)""" - if TYPE_CHECKING: - # A hacky way to make mypy happy - json_data: Completion = json_data # type: ignore usage = json_data.get("usage") if usage is None: usage = CompletionUsage( From ac8318e9e3241266ac28df50b2b68838de9976f5 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 21:43:14 +0900 Subject: [PATCH 02/18] Added lock to completion generator --- llama_api/mixins/lock.py | 21 +++++++++++++++++++++ llama_api/modules/base.py | 7 ++++++- llama_api/server/pools/llama.py | 22 ++++++++++++---------- 3 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 llama_api/mixins/lock.py diff --git a/llama_api/mixins/lock.py b/llama_api/mixins/lock.py new file mode 100644 index 0000000..9941e10 --- /dev/null +++ b/llama_api/mixins/lock.py @@ -0,0 +1,21 @@ +from threading import Lock +from typing import Optional + + +class LockMixin: + _lock: Optional[Lock] = None + + @property + def lock(self) -> Lock: + """Get the lock.""" + if self._lock is None: + self._lock = Lock() + return self._lock + + def acquire_lock(self) -> None: + """Acquire the lock.""" + self.lock.acquire() + + def release_lock(self) -> None: + """Release the lock.""" + self.lock.release() diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 61b1e7e..424395f 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -4,6 +4,7 @@ from typing import Any, Iterator, List, TypeVar from ..mixins.interrupt import InterruptMixin +from ..mixins.lock import LockMixin from ..mixins.logits import LogitsMixin from ..mixins.prompt_utils import PromptUtilsMixin from ..schemas.api import ( @@ -35,7 +36,11 @@ def model_path_resolved(self) -> str: class BaseCompletionGenerator( - ABC, PromptUtilsMixin, InterruptMixin, LogitsMixin + ABC, + PromptUtilsMixin, + InterruptMixin, + LogitsMixin, + LockMixin, ): """Base class for all completion generators.""" diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index 6df7fec..c2db637 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -56,7 +56,9 @@ def completion_generator_manager( """Context manager for completion generators.""" completion_generator = 
get_completion_generator(body) completion_generator.interrupt_signal = interrupt_signal + completion_generator.acquire_lock() yield completion_generator + completion_generator.release_lock() completion_generator.interrupt_signal = None @@ -92,19 +94,19 @@ def get_completion_generator( If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" + # Check if the model is an OpenAI model + openai_replacement_models: Dict[str, str] = getattr( + model_definitions, "openai_replacement_models", {} + ) + if body.model in openai_replacement_models: + body.model = openai_replacement_models[body.model] + body.is_openai = True + llm_model = get_model(body.model) + with logger.log_any_error( - f"Error getting a completion generator of {body.model}" + f"Error getting a completion generator of {body.model}", ): - # Check if the model is an OpenAI model - openai_replacement_models: Dict[str, str] = getattr( - model_definitions, "openai_replacement_models", {} - ) - if body.model in openai_replacement_models: - body.model = openai_replacement_models[body.model] - body.is_openai = True - # Check if the model is defined in LLMModels enum - llm_model = get_model(body.model) # Check if the model is cached. If so, return the cached one. for completion_generator in completion_generators: From 38427681ce273f96b2a9e2e8282e8133562b76e8 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Thu, 17 Aug 2023 22:34:51 +0900 Subject: [PATCH 03/18] fix typo --- llama_api/modules/exllama.py | 2 +- llama_api/utils/completions.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 2c7e81f..b773261 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -678,7 +678,7 @@ def _encode( def _get_finish_reason( cg: ExllamaCompletionGenerator, - settings: TextGenerationSettings, + settings: "TextGenerationSettings", completion_id: str, generated_text: str, ) -> Literal["length", "stop"]: diff --git a/llama_api/utils/completions.py b/llama_api/utils/completions.py index 4353b81..6b696f3 100644 --- a/llama_api/utils/completions.py +++ b/llama_api/utils/completions.py @@ -305,9 +305,7 @@ def convert_text_completion_chunks_to_chat( choices=[ ChatCompletionChunkChoice( index=0, - delta={ - "role": "assistant", - }, + delta={"role": "assistant"}, finish_reason=None, ) ], From de83eedc16fc707b08206319961a143911ff2ec3 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Fri, 18 Aug 2023 19:20:42 +0900 Subject: [PATCH 04/18] Do git clone repos when server startup --- llama_api/modules/exllama.py | 22 ++++++---------------- llama_api/modules/exllama_lora.py | 16 ++++++++-------- llama_api/modules/llama_cpp.py | 15 +++------------ llama_api/server/app_settings.py | 5 +++++ llama_api/shared/config.py | 21 ++++++++++++++++++++- llama_api/utils/system.py | 29 ++++++++++++++++++++--------- 6 files changed, 62 insertions(+), 46 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index b773261..d3ad093 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,8 +1,6 @@ """Wrapper for exllama to generate text completions.""" # flake8: noqa -from gc import collect from os import environ -from time import time from ..utils.logger import ApiLogger @@ -15,9 +13,10 @@ from ..modules.xformers import hijack_attention_forward hijack_attention_forward() +from gc import collect from pathlib import Path +from time import time from typing import ( - TYPE_CHECKING, 
Dict, Iterable, Iterator, @@ -35,34 +34,25 @@ from ..logits.base import BaseLogitProcessor from ..schemas.api import ( + APIChatMessage, ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk, + TextGenerationSettings, ) from ..schemas.models import ExllamaModel +from ..shared.config import Config from ..utils.dependency import import_repository from ..utils.system import deallocate_memory from .base import BaseCompletionGenerator from .exllama_lora import ExLlamaLora -with import_repository( - git_path="https://github.com/turboderp/exllama", - disk_path="repositories/exllama", -): +with import_repository(**Config.repositories["exllama"]): from repositories.exllama.generator import ExLlamaGenerator from repositories.exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig from repositories.exllama.tokenizer import ExLlamaTokenizer -if TYPE_CHECKING: - from ..schemas.api import ( - APIChatMessage, - ChatCompletion, - ChatCompletionChunk, - Completion, - CompletionChunk, - TextGenerationSettings, - ) assert cuda.is_available(), "CUDA must be available to use ExLlama." diff --git a/llama_api/modules/exllama_lora.py b/llama_api/modules/exllama_lora.py index 7f2c3c9..5e54391 100644 --- a/llama_api/modules/exllama_lora.py +++ b/llama_api/modules/exllama_lora.py @@ -1,20 +1,20 @@ # flake8: noqa -from pathlib import Path -from typing import Dict, Union -from llama_api.utils.dependency import import_repository -with import_repository( - git_path="https://github.com/turboderp/exllama", - disk_path="repositories/exllama", -): - from repositories.exllama.model import ExLlama, Ex4bitLinear, ExLlamaConfig import json +from pathlib import Path +from typing import Dict, Union import torch from safetensors.torch import load_file as safe_load_file from torch import load as load_file +from ..shared.config import Config +from ..utils.dependency import import_repository + +with import_repository(**Config.repositories["exllama"]): + from repositories.exllama.model import Ex4bitLinear, ExLlama, ExLlamaConfig + class ExLlamaLora: lora_config_path: str diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 4652d63..3207bec 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,12 +1,6 @@ """Wrapper for llama_cpp to generate text completions.""" from inspect import signature -from typing import ( - Callable, - Iterator, - List, - Optional, - Union, -) +from typing import Callable, Iterator, List, Optional, Union from ..schemas.api import ( APIChatMessage, @@ -17,6 +11,7 @@ TextGenerationSettings, ) from ..schemas.models import LlamaCppModel +from ..shared.config import Config from ..utils.completions import ( convert_text_completion_chunks_to_chat, convert_text_completion_to_chat, @@ -28,11 +23,7 @@ logger = ApiLogger(__name__) logger.info("🦙 llama-cpp-python repository found!") -with import_repository( - git_path="https://github.com/abetlen/llama-cpp-python", - disk_path="repositories/llama_cpp", - options=["--recurse-submodules"], -): +with import_repository(**Config.repositories["llama_cpp"]): build_shared_lib(logger=logger) from repositories.llama_cpp import llama_cpp diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 27abd75..4bb7891 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -5,9 +5,12 @@ from pathlib import Path from typing import Dict, Literal, Optional +from ..shared.config import Config + from ..utils.dependency import ( get_installed_packages, 
get_poetry_executable, + git_clone, install_all_dependencies, install_package, install_pytorch, @@ -72,6 +75,8 @@ def initialize_before_launch( skip_compile: bool = False, ) -> None: """Initialize the app""" + for git_clone_args in Config.repositories.values(): + git_clone(**git_clone_args) if install_packages: # Install all dependencies if not skip_compile: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 4ecd592..7c4c887 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,5 +1,12 @@ from pathlib import Path -from typing import List, Tuple +from typing import Dict, List, Literal, Optional, Tuple +from typing_extensions import TypedDict + + +class GitCloneArgs(TypedDict): + git_path: str + disk_path: str + options: Optional[List[str]] class Config: @@ -26,3 +33,15 @@ class Config: "q6_K", "q8_0", ] + repositories: Dict[Literal["exllama", "llama_cpp"], GitCloneArgs] = { + "exllama": GitCloneArgs( + git_path="https://github.com/turboderp/exllama", + disk_path="repositories/exllama", + options=["--recurse-submodules"], + ), + "llama_cpp": GitCloneArgs( + git_path="https://github.com/abetlen/llama-cpp-python", + disk_path="repositories/llama_cpp", + options=None, + ), + } diff --git a/llama_api/utils/system.py b/llama_api/utils/system.py index 78ca571..a616a56 100644 --- a/llama_api/utils/system.py +++ b/llama_api/utils/system.py @@ -11,20 +11,31 @@ from queue import Queue ContainerLike = Union["deque", "Queue", "AsyncQueue", list, dict] +cuda_version: Optional[str] = None # Memoization of get_cuda_version() def get_cuda_version() -> Optional[str]: """Returns the current CUDA version as a string. Returns None if nvidia-smi is not available or CUDA is not installed.""" - try: - result = compile(r"CUDA Version: (\d+\.\d+)").search( - check_output(["nvidia-smi"]).decode("utf-8") - ) - if result is None: - return - return result.group(1) - except Exception: - return + global cuda_version + if cuda_version is not None: # If memoized + return cuda_version or None # If cuda_version is "", return None + for cli_args, regex in ( + (["nvcc", "--version"], r"release (\d+\.\d+)"), + (["nvidia-smi"], r"CUDA Version: (\d+\.\d+)"), + ): + try: + # Try to get the CUDA version from the output of the command + cuda_version_match = compile(regex).search( + check_output(cli_args).decode("utf-8") + ) + if cuda_version_match is None: + continue + cuda_version = cuda_version_match.group(1) + return cuda_version + except Exception: + continue + cuda_version = "" def get_vram_usages() -> Optional[List[int]]: From 366c1a4e8ea3aae42b5c193a0e0ebcb49eceed3b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sat, 19 Aug 2023 15:12:06 +0900 Subject: [PATCH 05/18] Optimized performance: llama.cpp & exllama --- llama_api/mixins/completion.py | 44 ++ llama_api/mixins/function_call.py | 371 ++++++++++++++++ llama_api/mixins/prompt_utils.py | 133 +++--- llama_api/modules/base.py | 197 ++++++++- llama_api/modules/exllama.py | 678 ++++++++++-------------------- llama_api/modules/llama_cpp.py | 346 +++++++++------ 6 files changed, 1122 insertions(+), 647 deletions(-) create mode 100644 llama_api/mixins/completion.py create mode 100644 llama_api/mixins/function_call.py diff --git a/llama_api/mixins/completion.py b/llama_api/mixins/completion.py new file mode 100644 index 0000000..312a4b0 --- /dev/null +++ b/llama_api/mixins/completion.py @@ -0,0 +1,44 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Literal, Optional + +from 
..schemas.api import CompletionLogprobs, TextGenerationSettings + + +@dataclass +class CompletionStatus: + # These fields are set by `accept_settings` method. + input_text: str = field(default="", init=False) + input_tokens: int = field(default=0, init=False) + + # These fields are set by `generate_text` method. + generated_text: str = field(default="", init=False) + generated_tokens: int = field(default=0, init=False) + logprobs: Optional[CompletionLogprobs] = field(default=None, init=False) + + +class CompletionMixin: + """A mixin for modules that support completion generation.""" + + _completion_status: Optional[defaultdict[str, CompletionStatus]] = None + + @property + def completion_status(self) -> Dict[str, CompletionStatus]: + """Get the completion status. + key: completion_id + value: CompletionStatus""" + if self._completion_status is None: + self._completion_status = defaultdict(CompletionStatus) + return self._completion_status + + def get_finish_reason( + self, + settings: TextGenerationSettings, + ) -> Literal["length", "stop"]: + """Get the finish reason for the completion.""" + return ( + "length" + if self.completion_status[settings.completion_id].generated_tokens + >= settings.max_tokens + else "stop" + ) diff --git a/llama_api/mixins/function_call.py b/llama_api/mixins/function_call.py new file mode 100644 index 0000000..7b7ae6e --- /dev/null +++ b/llama_api/mixins/function_call.py @@ -0,0 +1,371 @@ +"""Helper classes for wrapping functions in OpenAI's API""" +# flake8: noqa +import json +from inspect import signature +from re import Pattern, compile +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + Union, + overload, +) + +from typing_extensions import Annotated, get_args, get_origin + +from ..schemas.api import ( + ChatCompletion, + ChatCompletionChunk, + CreateChatCompletionRequest, +) +from ..schemas.function_call import ( + FunctionCall, + FunctionCallParameter, + JsonTypes, +) + +# whitespace is constrained to a single space char +# to prevent model "running away" in +# whitespace. Also maybe improves generation quality? +SPACE_RULE: str = '" "?' + +PRIMITIVE_RULES: Dict[str, str] = { + "boolean": '("true" | "false") space', + "number": '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', + "integer": '("-"? ([0-9] | [1-9] [0-9]*)) space', + "string": r""" "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space """, + "null": '"null" space', +} + +INVALID_RULE_CHARS_RE: "Pattern[str]" = compile(r"[^a-zA-Z0-9-]+") +GRAMMAR_LITERAL_ESCAPE_RE: "Pattern[str]" = compile(r'[\r\n"]') +GRAMMAR_LITERAL_ESCAPES: Dict[str, str] = { + "\r": "\\r", + "\n": "\\n", + '"': '\\"', +} + +# Type aliases +SchemaType = Literal[ + "boolean", "number", "integer", "string", "null", "object", "array" +] +SchemaKey = Literal[ + "type", "oneOf", "anyOf", "const", "enum", "properties", "items" +] + + +def _get_type_and_optional(t: Type) -> Tuple[Type, bool]: + """Returns the type and whether it's an Optional type. + This is useful when Type can be Union and you want to know if it's an Optional type. + """ + # Optional[str] is equivalent to Union[str, None], so check if it's a Union type. + if get_origin(t) in (type(Union), Union): + args = get_args(t) # type: Tuple[Type, ...] + # If there's a None type in the Union, it's an Optional type. + optional = type(None) in args + # Return the first argument that isn't None. 
+ first_arg = next(arg for arg in args if arg is not type(None)) + return first_arg, optional + else: + # If it's not a Union type, it's not an Optional type. + return t, False + + +class FunctionCallMixin: + """Contains helper functions converting JSON schemas to BNF grammars + Reference: https://github.com/ggerganov/llama.cpp/pull/1887""" + + _prop_order: Dict[str, int] + _rules: Dict[str, str] + + def invoke_function_call( + self, request: CreateChatCompletionRequest + ) -> ChatCompletion: + """Invoke the function call while chat completion""" + raise NotImplementedError( + "function call is not implemented for this model" + ) + + def invoke_function_call_streaming( + self, request: CreateChatCompletionRequest + ) -> Iterator[ChatCompletionChunk]: + """Invoke the function call while chat completion, streaming the results""" + raise NotImplementedError( + "function call is not implemented for this model" + ) + + @staticmethod + @overload + def from_function_calls( + function_calls: FunctionCall, + prop_order: Optional[Dict[str, int]] = None, + ) -> str: + ... + + @staticmethod + @overload + def from_function_calls( + function_calls: Iterable[FunctionCall], + prop_order: Optional[Dict[str, int]] = None, + ) -> List[str]: + ... + + @staticmethod + def from_function_calls( + function_calls: Union[FunctionCall, Iterable[FunctionCall]], + prop_order: Optional[Dict[str, int]] = None, + ) -> Union[str, List[str]]: + """Parse a FunctionCall object into a BNF grammar""" + if isinstance(function_calls, Iterable): + return_as_list = True + function_calls = list(function_calls) + else: + return_as_list = False + function_calls = [function_calls] + + bnfs = [] # type: List[str] + for function_call in function_calls: + self = FunctionCallMixin() + self._prop_order = prop_order or {} + self._rules = {"space": SPACE_RULE} + parameters = function_call.to_dict().get("parameters") + assert parameters is not None, "function call must have parameters" + self._visit(dict(parameters), "") + bnfs.append(self._format_grammar()) + return bnfs if return_as_list else bnfs[0] + + @staticmethod + @overload + def from_functions( + functions: Callable, + prop_order: Optional[Dict[str, int]] = None, + ) -> str: + ... + + @staticmethod + @overload + def from_functions( + functions: Iterable[Callable], + prop_order: Optional[Dict[str, int]] = None, + ) -> List[str]: + ... 
+ + @staticmethod + def from_functions( + functions: Union[Callable, Iterable[Callable]], + prop_order: Optional[Dict[str, int]] = None, + ) -> Union[str, List[str]]: + """Parse a function into a BNF grammar""" + if isinstance(functions, Iterable): + return_as_list = True + functions = list(functions) + else: + return_as_list = False + functions = [functions] + + function_calls = [] # type: List[FunctionCall] + json_types = get_args(JsonTypes) + line_break_pattern = compile(r"\n\s*") + + for function in functions: + function_call_params = [] # type: List[FunctionCallParameter] + required = [] # type: List[str] + for name, param in signature(function).parameters.items(): + annotation = param.annotation + description = "" # type: str + enum = [] # type: List[Any] + + if get_origin(annotation) is Annotated: + # If the annotation is an Annotated type, + # we need to parse the metadata + _param_args = get_args(param.annotation) + _param_type = _param_args[0] + + for metadata in _param_args[1:]: + if isinstance(metadata, str): + # If the metadata is a string, it's the description + description += metadata + elif isinstance(metadata, Iterable): + # If the metadata is an iterable, it's the enum + enum.extend(metadata) + + else: + _param_type = annotation + param_type, optional = _get_type_and_optional(_param_type) + if not optional: + required.append(name) + if param_type not in json_types: + continue + function_call_params.append( + FunctionCallParameter( + name=name, + type=param_type, + description=description or None, + enum=enum or None, + ) + ) + function_calls.append( + FunctionCall( + name=function.__name__, + description=line_break_pattern.sub( + " ", function.__doc__ + ) + if function.__doc__ + else None, + parameters=function_call_params, + required=required or None, + ) + ) + return FunctionCallMixin.from_function_calls( + function_calls if return_as_list else function_calls[0], + prop_order, + ) + + def _format_literal(self, literal: Any) -> str: + escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or "", + json.dumps(literal), + ) + return f'"{escaped}"' + + def _add_rule(self, name, rule): + esc_name = INVALID_RULE_CHARS_RE.sub("-", name) + if esc_name not in self._rules or self._rules[esc_name] == rule: + key = esc_name + else: + i = 0 + while f"{esc_name}{i}" in self._rules: + i += 1 + key = f"{esc_name}{i}" + self._rules[key] = rule + return key + + def _visit(self, schema: Dict[SchemaKey, Any], name: str) -> str: + schema_type: SchemaType = schema[ + "type" + ] # The "type" key is always present + rule_name: str = name or "root" # root rule is always named "root" + + if "oneOf" in schema or "anyOf" in schema: + # This is a union type + rule: str = " | ".join( + ( + self._visit(alt_schema, f'{name}{"-" if name else ""}{i}') + for i, alt_schema in enumerate( + schema.get("oneOf") or schema["anyOf"] + ) + ) + ) + return self._add_rule(rule_name, rule) + + elif "const" in schema: + # This is a literal + return self._add_rule( + rule_name, self._format_literal(schema["const"]) + ) + + elif "enum" in schema: + # This is a set of literals + rule = " | ".join( + (self._format_literal(v) for v in schema["enum"]) + ) + return self._add_rule(rule_name, rule) + + elif schema_type == "object" and "properties" in schema: + # TODO: `required` keyword + prop_order = self._prop_order + prop_pairs = sorted( + schema["properties"].items(), + # sort by position in prop_order (if specified) then by key + key=lambda kv: (prop_order.get(kv[0], 
len(prop_order)), kv[0]), + ) + + rule = '"{" space' + for i, (prop_name, prop_schema) in enumerate(prop_pairs): + prop_rule_name = self._visit( + prop_schema, f'{name}{"-" if name else ""}{prop_name}' + ) + if i > 0: + rule += ' "," space' + rule += rf' {self._format_literal(prop_name)} space ":" space {prop_rule_name}' + rule += ' "}" space' + + return self._add_rule(rule_name, rule) + + elif schema_type == "array" and "items" in schema: + # TODO `prefixItems` keyword + item_rule_name = self._visit( + schema["items"], f'{name}{"-" if name else ""}item' + ) + rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' + return self._add_rule(rule_name, rule) + + else: + assert ( + schema_type in PRIMITIVE_RULES + ), f"Unrecognized schema: {schema}" + return self._add_rule( + "root" if rule_name == "root" else schema_type, + PRIMITIVE_RULES[schema_type], + ) + + def _format_grammar(self): + return "\n".join( + (f"{name} ::= {rule}" for name, rule in self._rules.items()) + ) + + +if __name__ == "__main__": + # from llama_cpp import LlamaGrammar, Llama + + # Define a python function and parse it into a grammar + def get_current_weather( + location: Annotated[ + str, + "The location to get the current weather for", + ], + unit: Annotated[ + str, + "The unit of temperature to return", + ["fahrenheit", "celsius"], + ], + source: Annotated[ + str, + "The source of the weather information", + ["openweathermap", "weatherapi"], + ] = "openweathermap", + ): + """Get the current weather in a given location""" + + model_path = "C:/Users/sdml/Desktop/orca-mini-3b.ggmlv3.q4_0.bin" + grammar: str = FunctionCallMixin.from_functions(get_current_weather) + print(f"Grammar:\n{grammar}") + # llama_grammar = LlamaGrammar.from_string(grammar, verbose=False) + # llm = Llama(model_path) + # llm.grammar = llama_grammar + # for city in ( + # "London", + # "Paris", + # "New York", + # "Berlin", + # "Tokyo", + # "Sydney", + # "Moscow", + # "Beijing", + # "Cairo", + # "Rome", + # ): + # print(llm(prompt=f"### User: What is the weather in {city} today? ### Assistant:")["choices"][0]["text"]) # type: ignore + + # # Output: + # # { "location": "London", "source": "openweathermap","unit" : "celsius"} diff --git a/llama_api/mixins/prompt_utils.py b/llama_api/mixins/prompt_utils.py index 0b19dec..8573e27 100644 --- a/llama_api/mixins/prompt_utils.py +++ b/llama_api/mixins/prompt_utils.py @@ -1,76 +1,91 @@ -from typing import List +from typing import List, Optional, Set + from ..schemas.api import APIChatMessage, TextGenerationSettings -class PromptUtilsMixin: - user_role: str = "user" - system_role: str = "system" - user_input_role: str = "User" - system_input_role: str = "System" - ai_fallback_input_role: str = "Assistant" +def _get_stop_strings(*roles: str) -> List[str]: + """A helper method to generate stop strings for a given set of roles. + Stop strings are required to stop text completion API from generating + text that does not belong to the current chat turn. + e.g. The common stop string is "### USER:", + which can prevent ai from generating user's message itself.""" - @staticmethod - def get_stop_strings(*roles: str) -> List[str]: - """A helper method to generate stop strings for a given set of roles. - Stop strings are required to stop text completion API from generating - text that does not belong to the current chat turn. - e.g. 
The common stop string is "### USER:", - which can prevent ai from generating user's message itself.""" + prompt_stop = set() + for role in roles: + avoids = ( + f"### {role}:", + f"###{role}:", + ) + prompt_stop.update( + avoids, + map(str.capitalize, avoids), + map(str.upper, avoids), + map(str.lower, avoids), + ) + return list(prompt_stop) - prompt_stop = set() - for role in roles: - avoids = ( - f"{role}:", - f"### {role}:", - f"###{role}:", - ) - prompt_stop.update( - avoids, - map(str.capitalize, avoids), - map(str.upper, avoids), - map(str.lower, avoids), - ) - return list(prompt_stop) - @classmethod +class PromptUtilsMixin: + _stop_set: Optional[Set[str]] = None + _stop_piece_set: Optional[Set[str]] = None + + @staticmethod def convert_messages_into_prompt( - cls, messages: List[APIChatMessage], settings: TextGenerationSettings + messages: List[APIChatMessage], settings: TextGenerationSettings ) -> str: - """A helper method to convert list of messages into one text prompt.""" - - ai_input_role: str = cls.ai_fallback_input_role - chat_history: str = "" - for message in messages: - if message.role.lower() == cls.user_role: - input_role = cls.user_input_role - elif message.role.lower() == cls.system_role: - input_role = cls.system_input_role - else: - input_role = ai_input_role = message.role - chat_history += f"### {input_role}:{message.content}" + """A helper method to convert list of messages into one text prompt. + Save the stop tokens in the settings object for later use.""" - prompt_stop: List[str] = cls.get_stop_strings( - cls.user_input_role, cls.system_input_role, ai_input_role - ) + stops = _get_stop_strings( + *set(message.role for message in messages) + ) # type: List[str] if isinstance(settings.stop, str): - settings.stop = prompt_stop + [settings.stop] + settings.stop = stops + [settings.stop] elif isinstance(settings.stop, list): - settings.stop = prompt_stop + settings.stop + settings.stop = stops + settings.stop else: - settings.stop = prompt_stop - return chat_history + f"### {ai_input_role}:" + settings.stop = stops + return ( + " ".join( + [ + f"### {message.role.upper()}: {message.content}" + for message in messages + ] + ) + + " ### ASSISTANT: " + ) - @staticmethod - def is_possible_to_generate_stops(text: str, stops: List[str]) -> bool: - """A helper method to check if - the decoded text contains any of the stop tokens.""" + def build_stops_from_settings( + self, settings: TextGenerationSettings + ) -> None: + """Pre-calculate sets for stops and the pieces of stops, + to speed up the stop checking process.""" + if isinstance(settings.stop, str): + stops = [settings.stop] # type: List[str] + elif isinstance(settings.stop, list): + stops = settings.stop + else: + stops = [] + self._stop_set = set(stops) + self._stop_piece_set = { + stop[:prefix_idx] + for stop in stops + for prefix_idx in range(1, len(stop)) + } - for stop in stops: - if stop in text or any( - [text.endswith(stop[: i + 1]) for i in range(len(stop))] - ): - return True - return False + def stop_checker(self, text_piece: str) -> Optional[bool]: + """Optimized stop checker for text completion. + Returns False if the text piece ends with any piece of stop. + Returns True if the text piece contains any stop. 
+ Returns None if the text piece does not contain any piece of stop.""" + if any( + text_piece.endswith(stop_piece) + for stop_piece in self._stop_piece_set or () + ): + return False + if any(stop in text_piece for stop in self._stop_set or ()): + return True + return None @staticmethod def raise_for_token_limit(prompt_tokens: int, context_window: int) -> None: diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 424395f..27b41d1 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -1,8 +1,11 @@ from abc import ABC, abstractmethod +from collections import deque from dataclasses import asdict, dataclass from pathlib import Path +from time import time from typing import Any, Iterator, List, TypeVar +from ..mixins.completion import CompletionMixin from ..mixins.interrupt import InterruptMixin from ..mixins.lock import LockMixin from ..mixins.logits import LogitsMixin @@ -41,13 +44,23 @@ class BaseCompletionGenerator( InterruptMixin, LogitsMixin, LockMixin, + CompletionMixin, ): """Base class for all completion generators.""" @abstractmethod def __del__(self): """Clean up resources.""" - ... + + @property + @abstractmethod + def llm_model(self) -> "BaseLLMModel": + """The LLM model used by this generator.""" + + @property + def model_name(self) -> str: + """Identifier for the model used by this generator.""" + return Path(self.llm_model.model_path_resolved).stem @classmethod @abstractmethod @@ -56,31 +69,171 @@ def from_pretrained( ) -> "BaseCompletionGenerator": """Load a pretrained model into RAM.""" - @abstractmethod def generate_completion( self, prompt: str, settings: TextGenerationSettings ) -> Completion: """Generate a completion for a given prompt.""" + completion_id = settings.completion_id + completion_status = self.completion_status[completion_id] + deque( + self.generate_text(prompt=prompt, settings=settings), + maxlen=0, + ) # exhaust the generator + return { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": self.model_name, + "choices": [ + { + "text": completion_status.generated_text, + "index": 0, + "logprobs": completion_status.logprobs + if settings.logprobs + else None, + "finish_reason": self.get_finish_reason(settings), + } + ], + "usage": { + "prompt_tokens": completion_status.input_tokens, + "completion_tokens": completion_status.generated_tokens, + "total_tokens": completion_status.input_tokens + + completion_status.generated_tokens, + }, + } - @abstractmethod def generate_completion_with_streaming( self, prompt: str, settings: TextGenerationSettings ) -> Iterator[CompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" + completion_id = settings.completion_id = ( + "chat" + settings.completion_id + ) + completion_status = self.completion_status[completion_id] + model = self.model_name + for token in self.generate_text(prompt=prompt, settings=settings): + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": token, + "index": 0, + "logprobs": completion_status.logprobs + if settings.logprobs + else None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": int(time()), + "model": model, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": completion_status.logprobs + if settings.logprobs + else None, + "finish_reason": self.get_finish_reason(settings), + } + ], + } - @abstractmethod def 
generate_chat_completion( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> ChatCompletion: """Generate a completion for a given prompt.""" + completion_id = settings.completion_id = ( + "chat" + settings.completion_id + ) + completion_status = self.completion_status[completion_id] + deque( + self.generate_text( + prompt=self.convert_messages_into_prompt( + messages, settings=settings + ), + settings=settings, + ), + maxlen=0, + ) # exhaust the generator + return { + "id": completion_id, + "object": "chat.completion", + "created": int(time()), + "model": self.model_name, + "choices": [ + { + "message": { + "role": "assistant", + "content": completion_status.generated_text, + }, + "index": 0, + "finish_reason": self.get_finish_reason(settings), + } + ], + "usage": { + "prompt_tokens": completion_status.input_tokens, + "completion_tokens": completion_status.generated_tokens, + "total_tokens": completion_status.input_tokens + + completion_status.generated_tokens, + }, + } - @abstractmethod def generate_chat_completion_with_streaming( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> Iterator[ChatCompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" + completion_id = settings.completion_id + prompt = self.convert_messages_into_prompt(messages, settings=settings) + model = self.model_name + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None, + } + ], + } + for token in self.generate_text(prompt=prompt, settings=settings): + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {"content": token}, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "chat.completion.chunk", + "created": int(time()), + "model": model, + "choices": [ + { + "index": 0, + "delta": {}, + "finish_reason": self.get_finish_reason(settings), + } + ], + } @abstractmethod def encode(self, text: str, **kwargs: Any) -> List[int]: @@ -90,15 +243,35 @@ def encode(self, text: str, **kwargs: Any) -> List[int]: def decode(self, ids: List[int], **kwargs: Any) -> str: """Decode a list of token IDs into a text string.""" - @property @abstractmethod - def llm_model(self) -> "BaseLLMModel": - """The LLM model used by this generator.""" + def generate_text( + self, prompt: str, settings: TextGenerationSettings + ) -> Iterator[str]: + ... 
- @property - def model_name(self) -> str: - """Identifier for the model used by this generator.""" - return Path(self.llm_model.model_path_resolved).stem + def accept_settings( + self, + prompt: str, + prompt_tokens: int, + settings: TextGenerationSettings, + ) -> None: + """Update the completion status.""" + # Check if the prompt is too long + context_window = self.llm_model.max_total_tokens + self.raise_for_token_limit( + prompt_tokens=prompt_tokens, context_window=context_window + ) + settings.max_tokens = min( + settings.max_tokens, context_window - prompt_tokens + ) + completion_id = settings.completion_id + + # Update completion status + self.completion_status[completion_id].input_text = prompt + self.completion_status[completion_id].input_tokens = prompt_tokens + + # Cache the stops for later use of stop_checker + self.build_stops_from_settings(settings) class BaseEmbeddingGenerator(ABC): diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index d3ad093..fc5cb29 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -1,5 +1,6 @@ """Wrapper for exllama to generate text completions.""" # flake8: noqa +from array import array from os import environ from ..utils.logger import ApiLogger @@ -15,32 +16,15 @@ hijack_attention_forward() from gc import collect from pathlib import Path -from time import time -from typing import ( - Dict, - Iterable, - Iterator, - List, - Literal, - Optional, - Tuple, - Union, - overload, -) +from re import compile +from typing import Iterable, Iterator, List, Optional, Tuple, Union, overload from torch import IntTensor, Tensor, cuda, version from torch.cuda import empty_cache from torch.nn.functional import log_softmax from ..logits.base import BaseLogitProcessor -from ..schemas.api import ( - APIChatMessage, - ChatCompletion, - ChatCompletionChunk, - Completion, - CompletionChunk, - TextGenerationSettings, -) +from ..schemas.api import TextGenerationSettings from ..schemas.models import ExllamaModel from ..shared.config import Config from ..utils.dependency import import_repository @@ -56,7 +40,228 @@ assert cuda.is_available(), "CUDA must be available to use ExLlama." -_stop_checker = BaseCompletionGenerator.is_possible_to_generate_stops + +class ExllamaCompletionGenerator(BaseCompletionGenerator): + _config: Optional[ExLlamaConfig] = None + _model: Optional[ExLlama] = None + _cache: Optional[ExLlamaCache] = None + _tokenizer: Optional[ExLlamaTokenizer] = None + _generator: Optional[ExLlamaGenerator] = None + _llm_model: Optional["ExllamaModel"] = None + _lora: Optional["ExLlamaLora"] = None + + @property + def llm_model(self) -> "ExllamaModel": + assert self._llm_model is not None + return self._llm_model + + @property + def generator(self) -> ExLlamaGenerator: + assert self._generator is not None, "Generator is not initialized." + return self._generator + + @property + def tokenizer(self) -> ExLlamaTokenizer: + assert self._tokenizer is not None, "Tokenizer is not initialized." + return self._tokenizer + + @property + def cache(self) -> ExLlamaCache: + assert self._cache is not None, "Cache is not initialized." + return self._cache + + @property + def model(self) -> ExLlama: + assert self._model is not None, "Model is not initialized." + return self._model + + @property + def config(self) -> ExLlamaConfig: + assert self._config is not None, "Config is not initialized." 
+ return self._config + + @property + def lora(self) -> Optional[ExLlamaLora]: + return self._lora + + @classmethod + def from_pretrained( + cls, llm_model: "ExllamaModel" + ) -> "ExllamaCompletionGenerator": + model_folder_path = Path(llm_model.model_path_resolved) + lora_path = model_folder_path / "adapter_model.bin" + lora_config_path = model_folder_path / "adapter_config.json" + + result = cls() + result._llm_model = llm_model + result._config = _make_config(model_folder_path, llm_model) + result._tokenizer = ExLlamaTokenizer( + (model_folder_path / "tokenizer.model").as_posix() + ) + result._model = ExLlama(result._config) + if lora_path.exists() and lora_config_path.exists(): + logger.info(f"🦙 LORA model found for {result.model_name}") + with logger.log_any_error( + f"🦙 LORA model loading failed for {result.model_name}" + ): + result._lora = ExLlamaLora( + model=result._model, + lora_config_path=lora_config_path.as_posix(), + lora_path=lora_path.as_posix(), + ) + logger.info(f"🦙 LORA model loaded for {result.model_name}") + result._cache = ExLlamaCache(result._model) + result._generator = ExLlamaGenerator( + result._model, result._tokenizer, result._cache + ) + return result + + def encode(self, text: str) -> List[int]: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return _encode(self._tokenizer, text).flatten().tolist() + + def decode(self, ids: List[int], **kwargs) -> str: + assert self._tokenizer is not None, "Tokenizer is not initialized" + return str(self._tokenizer.decode(IntTensor(ids))) + + def __del__(self) -> None: + if self._tokenizer is not None: + getattr(self._tokenizer, "__del__", lambda: None)() + del self._tokenizer + self._tokenizer = None + logger.info("🗑️ ExllamaCompletionGenerator tokenizer deleted") + if self._cache is not None: + getattr(self._cache, "__del__", lambda: None)() + del self._cache + self._cache = None + logger.info("🗑️ ExllamaCompletionGenerator cache deleted") + if self._generator is not None: + getattr(self._generator, "__del__", lambda: None)() + del self._generator + self._generator = None + logger.info("🗑️ ExllamaCompletionGenerator generator deleted") + if self._model is not None: + self._model.free_unmanaged() + del self._model + self._model = None + logger.info("🗑️ ExllamaCompletionGenerator model deleted") + collect() + empty_cache() + + def generate_text( + self, prompt: str, settings: TextGenerationSettings + ) -> Iterator[str]: + with logger.log_any_error(): + # Encode the prompt + if settings.guidance_scale == 1: + ids = _encode(self.tokenizer, prompt) + mask = None # type: Optional[Tensor] + else: + ids, mask = _encode( + self.tokenizer, + [prompt, settings.negative_prompt or ""], + return_mask=True, + ) + + # Accept and apply the settings + self.accept_settings( + prompt=prompt, + prompt_tokens=ids.shape[-1], + settings=settings, + ) + generator = _apply_settings_to_generator(self, settings=settings) + + # Apply LoRA + if self.lora: + generator.lora = self.lora # type: ignore + + # Inject the prompt + if mask is not None: + generator.gen_begin(ids, mask=mask) + else: + generator.end_beam_search() + generator.gen_begin_reuse(ids) + + # Generate text + yield from self._generate_text(settings, mask) + + def _generate_text( + self, + settings: TextGenerationSettings, + cfg_mask: Optional[Tensor] = None, + ) -> Iterator[str]: + # Set up the variables + IdToPiece = self.tokenizer.tokenizer.IdToPiece + generator = self.generator + initial_len = generator.sequence[0].shape[0] # type: int + eos_token_id = 
generator.tokenizer.eos_token_id # type: int + completion_status = self.completion_status[settings.completion_id] + text_buffer = "" # type: str + byte_array = array("B") # type: array[int] + byte_pattern = compile(r"<0x([0-9a-fA-F]{2})>") + + for _ in range(settings.max_tokens): + # If the generator was interrupted, stop the generation + if self.is_interrupted: + break + + # Predict next token id + token_id = ( + _gen_single_token_with_cfg( + generator=generator, + mask=cfg_mask, + cfg_alpha=settings.guidance_scale, + ) + if cfg_mask is not None + else _gen_single_token_without_cfg( + generator=generator, + input_ids=generator.sequence[0][initial_len:], + logit_processors=( + [ + processor + for processor in self.get_logit_processors( + settings=settings, + encoder=self.encode, + ) + ] + if cfg_mask is None + else None + ) + or None, + ) + ) # type: int + + # Check if the token is a stop token + if self.is_interrupted or token_id == eos_token_id: + break + + # Update the completion status + completion_status.generated_tokens += 1 + + # Try to decode the token + piece = IdToPiece(token_id) # type: str + if piece[0] == "<" and piece[-1] == ">": + byte_match = byte_pattern.match(piece) + if byte_match is None: + continue + try: + byte_array.append(int(byte_match.group(1), 16)) + piece = byte_array.tobytes().decode() + del byte_array[:] + except UnicodeDecodeError: + continue + text_to_yield = text_buffer + piece.replace("▁", " ") + + # Check if the decoded text contains any of the stop tokens. + stop_status = self.stop_checker(text_to_yield) + if stop_status is None: # Good to go + text_buffer = "" # Clear the buffer + completion_status.generated_text += text_to_yield + yield text_to_yield + elif stop_status is True: # Contains any of the stop tokens + break # Stop generating + else: # Contains any piece of the stop tokens + text_buffer = text_to_yield # Save the buffer def _make_config( @@ -129,7 +334,7 @@ def _make_config( def _apply_settings_to_generator( cg: "ExllamaCompletionGenerator", - settings: "TextGenerationSettings", + settings: TextGenerationSettings, ) -> ExLlamaGenerator: """Apply the settings to the generator.""" # Make sure that the batch size is correct @@ -181,7 +386,7 @@ def _gen_single_token_with_cfg( def _gen_single_token_without_cfg( generator: ExLlamaGenerator, - initial_len: int, + input_ids: Tensor, constraints: Optional[Tensor] = None, mask: Optional[Tensor] = None, logit_processors: Optional[Iterable[BaseLogitProcessor]] = None, @@ -200,7 +405,6 @@ def _gen_single_token_without_cfg( logits[:, :, generator.tokenizer.bos_token_id] = -10000.0 if logit_processors is not None: - input_ids = generator.sequence[0][initial_len:] for logit_processor in logit_processors: logits = logit_processor.with_torch(input_ids, logits) @@ -230,421 +434,16 @@ def _gen_single_token_without_cfg( return int(token.item()) -def _generator( - cg: "ExllamaCompletionGenerator", - settings: "TextGenerationSettings", - stops: List[str], - cfg_mask: Optional[Tensor] = None, -) -> Iterator[str]: - IdToPiece = cg.tokenizer.tokenizer.IdToPiece - decoder = cg.tokenizer.decode - generator = cg.generator - - cfg_alpha = settings.guidance_scale # type: float - initial_len = generator.sequence[0].shape[0] # type: int - eos_token_id = generator.tokenizer.eos_token_id # type: int - has_leading_space = False # type: bool - text_cursor = 0 # type: int - n_tokens = 0 # type: int - logit_processors = ( - [ - processor - for processor in cg.get_logit_processors( - settings=settings, - encoder=cg.encode, - ) - ] 
- if cfg_mask is None - else None - ) # type: Optional[Iterable[BaseLogitProcessor]] - for n_tokens in range(1, settings.max_tokens + 1): - if cg.is_interrupted: - break # the generator was interrupted - - # Predict the next token id - if cfg_mask is not None: - token_id = _gen_single_token_with_cfg( - generator, mask=cfg_mask, cfg_alpha=cfg_alpha - ) - else: - token_id = _gen_single_token_without_cfg( - generator, - initial_len=initial_len, - logit_processors=logit_processors or None, - ) - if cg.is_interrupted or token_id == eos_token_id: - break - - # Yield the text piece - if n_tokens == 1: - has_leading_space = IdToPiece(token_id).startswith("▁") - decoded_text = ( - " " + str(decoder(generator.sequence[0][initial_len:])) - if has_leading_space - else str(decoder(generator.sequence[0][initial_len:])) - ) - text_piece = decoded_text[text_cursor:] - if "�" in text_piece: # Decode error when decoding multi-byte char - continue - if _stop_checker(text_piece, stops=stops): # Stop token found maybe - if any(stop in decoded_text for stop in stops): - break # Stop token found - continue - yield text_piece - text_cursor += len(text_piece) - # End of generation - cg._completion_status[settings.completion_id] = n_tokens - - -def _generate_text_with_streaming( - cg: "ExllamaCompletionGenerator", - prompt: str, - settings: "TextGenerationSettings", -) -> Iterator[str]: - with logger.log_any_error(): - # Make sure that the stop token is a list - if isinstance(settings.stop, str): - stops = [settings.stop] # type: List[str] - elif isinstance(settings.stop, list): - stops = settings.stop - else: - stops = [] - - # Apply the settings to the generator - generator = _apply_settings_to_generator(cg, settings=settings) - - # Apply the LORA model - if cg.lora: - generator.lora = cg.lora # type: ignore - - # Start the generator - context_window = cg.llm_model.max_total_tokens - if settings.guidance_scale == 1: - ids = _encode(cg.tokenizer, prompt) - prompt_tokens = ids.shape[-1] - cg.raise_for_token_limit( - prompt_tokens=prompt_tokens, context_window=context_window - ) - mask = None # type: Optional[Tensor] - generator.end_beam_search() - generator.gen_begin_reuse(ids) - else: - ids, mask = _encode( - cg.tokenizer, - [prompt, settings.negative_prompt or ""], - return_mask=True, - ) - prompt_tokens = ids.shape[-1] - cg.raise_for_token_limit( - prompt_tokens=prompt_tokens, context_window=context_window - ) - generator.gen_begin(ids, mask=mask) - - settings.max_tokens = min( - settings.max_tokens, context_window - prompt_tokens - ) - - yield from _generator( - cg, settings=settings, cfg_mask=mask, stops=stops - ) - - -class ExllamaCompletionGenerator(BaseCompletionGenerator): - _config: Optional[ExLlamaConfig] = None - _model: Optional[ExLlama] = None - _cache: Optional[ExLlamaCache] = None - _tokenizer: Optional[ExLlamaTokenizer] = None - _generator: Optional[ExLlamaGenerator] = None - _llm_model: Optional["ExllamaModel"] = None - _lora: Optional["ExLlamaLora"] = None - _completion_status: Dict[ - str, int - ] = {} # key: completion_id, value: number of completion tokens - - @property - def llm_model(self) -> "ExllamaModel": - assert self._llm_model is not None - return self._llm_model - - @property - def generator(self) -> ExLlamaGenerator: - assert self._generator is not None, "Generator is not initialized." - return self._generator - - @property - def tokenizer(self) -> ExLlamaTokenizer: - assert self._tokenizer is not None, "Tokenizer is not initialized." 
- return self._tokenizer - - @property - def cache(self) -> ExLlamaCache: - assert self._cache is not None, "Cache is not initialized." - return self._cache - - @property - def model(self) -> ExLlama: - assert self._model is not None, "Model is not initialized." - return self._model - - @property - def config(self) -> ExLlamaConfig: - assert self._config is not None, "Config is not initialized." - return self._config - - @property - def lora(self) -> Optional[ExLlamaLora]: - return self._lora - - @classmethod - def from_pretrained( - cls, llm_model: "ExllamaModel" - ) -> "ExllamaCompletionGenerator": - model_folder_path = Path(llm_model.model_path_resolved) - lora_path = model_folder_path / "adapter_model.bin" - lora_config_path = model_folder_path / "adapter_config.json" - - result = cls() - result._llm_model = llm_model - result._config = _make_config(model_folder_path, llm_model) - result._tokenizer = ExLlamaTokenizer( - (model_folder_path / "tokenizer.model").as_posix() - ) - result._model = ExLlama(result._config) - if lora_path.exists() and lora_config_path.exists(): - logger.info(f"🦙 LORA model found for {result.model_name}") - with logger.log_any_error( - f"🦙 LORA model loading failed for {result.model_name}" - ): - result._lora = ExLlamaLora( - model=result._model, - lora_config_path=lora_config_path.as_posix(), - lora_path=lora_path.as_posix(), - ) - logger.info(f"🦙 LORA model loaded for {result.model_name}") - result._cache = ExLlamaCache(result._model) - result._generator = ExLlamaGenerator( - result._model, result._tokenizer, result._cache - ) - return result - - def generate_completion_with_streaming( - self, prompt: str, settings: "TextGenerationSettings" - ) -> Iterator["CompletionChunk"]: - completion_id = settings.completion_id - model = self.model_name - generated_text = "" # type: str - for token in _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ): - generated_text += token - yield { - "id": completion_id, - "object": "text_completion", - "created": int(time()), - "model": model, - "choices": [ - { - "text": token, - "index": 0, - "logprobs": None, - "finish_reason": None, - } - ], - } - yield { - "id": completion_id, - "object": "text_completion", - "created": int(time()), - "model": model, - "choices": [ - { - "text": "", - "index": 0, - "logprobs": None, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - } - - def generate_completion( - self, prompt: str, settings: "TextGenerationSettings" - ) -> "Completion": - completion_id = settings.completion_id - generated_text = "".join( - _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ) - ) - prompt_tokens = _encode(self.tokenizer, prompt).shape[1] - completion_tokens = self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] - ) - return { - "id": completion_id, - "object": "text_completion", - "created": int(time()), - "model": self.model_name, - "choices": [ - { - "text": generated_text, - "index": 0, - "logprobs": None, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - }, - } - - def generate_chat_completion_with_streaming( - self, - messages: List["APIChatMessage"], - settings: "TextGenerationSettings", - ) -> Iterator["ChatCompletionChunk"]: - completion_id = settings.completion_id - 
prompt = self.convert_messages_into_prompt(messages, settings=settings) - model = self.model_name - generated_text = "" # type: str - yield { - "id": completion_id, - "object": "chat.completion.chunk", - "created": int(time()), - "model": model, - "choices": [ - { - "index": 0, - "delta": {"role": "assistant"}, - "finish_reason": None, - } - ], - } - for token in _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ): - generated_text += token - yield { - "id": completion_id, - "object": "chat.completion.chunk", - "created": int(time()), - "model": model, - "choices": [ - { - "index": 0, - "delta": {"content": token}, - "finish_reason": None, - } - ], - } - yield { - "id": completion_id, - "object": "chat.completion.chunk", - "created": int(time()), - "model": model, - "choices": [ - { - "index": 0, - "delta": {}, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - } - - def generate_chat_completion( - self, - messages: List["APIChatMessage"], - settings: "TextGenerationSettings", - ) -> "ChatCompletion": - completion_id = settings.completion_id - prompt = self.convert_messages_into_prompt(messages, settings=settings) - generated_text = "".join( - _generate_text_with_streaming( - self, prompt=prompt, settings=settings - ) - ) - prompt_tokens = _encode(self.tokenizer, prompt).shape[1] - completion_tokens = self._completion_status.get( - completion_id, _encode(self.tokenizer, generated_text).shape[1] - ) - return { - "id": completion_id, - "object": "chat.completion", - "created": int(time()), - "model": self.model_name, - "choices": [ - { - "message": { - "role": "assistant", - "content": generated_text, - }, - "index": 0, - "finish_reason": _get_finish_reason( - self, settings, completion_id, generated_text - ), - } - ], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - }, - } - - def encode(self, text: str) -> List[int]: - assert self._tokenizer is not None, "Tokenizer is not initialized" - return _encode(self._tokenizer, text).flatten().tolist() - - def decode(self, ids: List[int], **kwargs) -> str: - assert self._tokenizer is not None, "Tokenizer is not initialized" - return str(self._tokenizer.decode(IntTensor(ids))) - - def __del__(self) -> None: - if self._tokenizer is not None: - getattr(self._tokenizer, "__del__", lambda: None)() - del self._tokenizer - self._tokenizer = None - logger.info("🗑️ ExllamaCompletionGenerator tokenizer deleted") - if self._cache is not None: - getattr(self._cache, "__del__", lambda: None)() - del self._cache - self._cache = None - logger.info("🗑️ ExllamaCompletionGenerator cache deleted") - if self._generator is not None: - getattr(self._generator, "__del__", lambda: None)() - del self._generator - self._generator = None - logger.info("🗑️ ExllamaCompletionGenerator generator deleted") - if self._model is not None: - self._model.free_unmanaged() - del self._model - self._model = None - logger.info("🗑️ ExllamaCompletionGenerator model deleted") - collect() - empty_cache() - - @overload def _encode( - tokenizer: ExLlamaTokenizer, - text: str, - return_mask: bool = False, + tokenizer: ExLlamaTokenizer, text: str, return_mask: bool = False ) -> Tensor: ... @overload def _encode( - tokenizer: ExLlamaTokenizer, - text: List[str], - return_mask: bool = True, + tokenizer: ExLlamaTokenizer, text: List[str], return_mask: bool = True ) -> Tuple[Tensor, Tensor]: ... 
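For reference, a minimal consumer sketch of the refactored streaming interface. It is hypothetical and not part of this patch series; it only assumes the CompletionChunk dict layout shown in the removed lines above (an OpenAI-style dict whose choices[0] carries "text" and "finish_reason"), which the shared base class now assembles from the raw text pieces yielded by generate_text(). The helper name collect_stream and the import path are illustrative assumptions.

from typing import Iterator, Optional, Tuple

from llama_api.schemas.api import CompletionChunk


def collect_stream(
    chunks: Iterator[CompletionChunk],
) -> Tuple[str, Optional[str]]:
    """Drain a stream of CompletionChunk dicts.

    Returns the concatenated text and the finish_reason reported by the
    last chunk ("length" or "stop" in the code above); stays None only if
    no chunk was received.
    """
    text = ""
    finish_reason = None  # type: Optional[str]
    for chunk in chunks:
        choice = chunk["choices"][0]
        text += choice["text"]
        finish_reason = choice["finish_reason"]
    return text, finish_reason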
@@ -664,20 +463,3 @@ def _encode( ids = result[0] if isinstance(result, tuple) else result assert isinstance(ids, Tensor) return ids - - -def _get_finish_reason( - cg: ExllamaCompletionGenerator, - settings: "TextGenerationSettings", - completion_id: str, - generated_text: str, -) -> Literal["length", "stop"]: - return ( - "length" - if cg._completion_status.get( - completion_id, - _encode(cg.tokenizer, generated_text).shape[1], - ) - >= settings.max_tokens - else "stop" - ) diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index 3207bec..c58e743 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -1,21 +1,18 @@ """Wrapper for llama_cpp to generate text completions.""" +# flake8: noqa +import sys +from array import array from inspect import signature -from typing import Callable, Iterator, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Iterator, List, Optional, Union from ..schemas.api import ( - APIChatMessage, - ChatCompletion, ChatCompletionChunk, - Completion, CompletionChunk, + CompletionLogprobs, TextGenerationSettings, ) from ..schemas.models import LlamaCppModel from ..shared.config import Config -from ..utils.completions import ( - convert_text_completion_chunks_to_chat, - convert_text_completion_to_chat, -) from ..utils.dependency import import_repository from ..utils.llama_cpp import build_shared_lib from ..utils.logger import ApiLogger @@ -28,6 +25,20 @@ from repositories.llama_cpp import llama_cpp +if TYPE_CHECKING: + from llama_api.mixins.completion import CompletionStatus + + +class StoppingCriteriaList(List[Callable[[List[int], List[float]], bool]]): + def __call__(self, input_ids: List[int], logits: List[float]) -> bool: + return any( + [ + stopping_criteria(input_ids, logits) + for stopping_criteria in self + ] + ) + + class LogitsProcessorList( List[Callable[[List[int], List[float]], List[float]]] ): @@ -39,66 +50,6 @@ def __call__( return scores -def _create_completion( - client: llama_cpp.Llama, - prompt: str, - stream: bool, - settings: TextGenerationSettings, -) -> Union[Completion, Iterator[CompletionChunk]]: - logit_processors = LogitsProcessorList( - [ - processor.without_torch - for processor in BaseCompletionGenerator.get_logit_processors( - settings=settings, - encoder=lambda s: client.tokenize( - s.encode("utf-8"), add_bos=False - ), - ) - ] - ) - return client.create_completion( - stream=stream, - prompt=prompt, - max_tokens=settings.max_tokens, - temperature=settings.temperature, - top_p=settings.top_p, - logprobs=settings.logprobs, - echo=settings.echo, - frequency_penalty=settings.frequency_penalty, - presence_penalty=settings.presence_penalty, - repeat_penalty=settings.repeat_penalty, - top_k=settings.top_k, - tfs_z=settings.tfs_z, - mirostat_mode=settings.mirostat_mode, - mirostat_tau=settings.mirostat_tau, - mirostat_eta=settings.mirostat_eta, - logits_processor=logit_processors if logit_processors else None, # type: ignore # noqa: E501 - stop=settings.stop, - ) - - -def _create_chat_completion( - client: llama_cpp.Llama, - messages: List[APIChatMessage], - stream: bool, - settings: TextGenerationSettings, -) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - prompt: str = LlamaCppCompletionGenerator.convert_messages_into_prompt( - messages, settings=settings - ) - completion_or_chunks = _create_completion( - client=client, prompt=prompt, stream=stream, settings=settings - ) - if isinstance(completion_or_chunks, Iterator): - return 
convert_text_completion_chunks_to_chat( - completion_or_chunks, - ) - else: - return convert_text_completion_to_chat( - completion_or_chunks, - ) - - class LlamaCppCompletionGenerator(BaseCompletionGenerator): generator: Optional[ Iterator[Union[CompletionChunk, ChatCompletionChunk]] @@ -164,66 +115,6 @@ def from_pretrained( self._llm_model = llm_model return self - def generate_completion( - self, - prompt: str, - settings: TextGenerationSettings = TextGenerationSettings(), - ) -> Completion: - assert self.client is not None - completion = _create_completion( - client=self.client, prompt=prompt, stream=False, settings=settings - ) - assert not isinstance(completion, Iterator) - return completion - - def generate_completion_with_streaming( - self, - prompt: str, - settings: TextGenerationSettings = TextGenerationSettings(), - ) -> Iterator[CompletionChunk]: - assert self.client is not None - completion_chunk_generator = _create_completion( - client=self.client, prompt=prompt, stream=True, settings=settings - ) - assert isinstance(completion_chunk_generator, Iterator) - self.generator = completion_chunk_generator - for chunk in completion_chunk_generator: - if self.is_interrupted: - yield chunk - return # the generator was interrupted - yield chunk - - def generate_chat_completion( - self, messages: List[APIChatMessage], settings: TextGenerationSettings - ) -> ChatCompletion: - assert self.client is not None - chat_completion = _create_chat_completion( - client=self.client, - messages=messages, - stream=False, - settings=settings, - ) - assert not isinstance(chat_completion, Iterator) - return chat_completion - - def generate_chat_completion_with_streaming( - self, messages: List[APIChatMessage], settings: TextGenerationSettings - ) -> Iterator[ChatCompletionChunk]: - assert self.client is not None - chat_completion_chunk_generator = _create_chat_completion( - client=self.client, - messages=messages, - stream=True, - settings=settings, - ) - assert isinstance(chat_completion_chunk_generator, Iterator) - self.generator = chat_completion_chunk_generator - for chunk in chat_completion_chunk_generator: - if self.is_interrupted: - yield chunk - return # the generator was interrupted - yield chunk - def encode(self, text: str, add_bos: bool = True, **kwargs) -> List[int]: assert self.client is not None, "Client is not initialized" return self.client.tokenize( @@ -233,3 +124,202 @@ def encode(self, text: str, add_bos: bool = True, **kwargs) -> List[int]: def decode(self, ids: List[int], **kwargs) -> str: assert self.client is not None, "Client is not initialized" return self.client.detokenize(ids).decode("utf-8", errors="ignore") + + def generate_text( + self, prompt: str, settings: TextGenerationSettings + ) -> Iterator[str]: + client = self.client + assert client is not None, "Llama is not initialized" + self.llm_model.max_total_tokens = client.n_ctx() + assert client.ctx is not None, "Llama context is not initialized" + n_ctx = client.n_ctx() + tokens = (llama_cpp.llama_token * n_ctx)() + n_tokens = llama_cpp.llama_tokenize( + client.ctx, + b" " + prompt.encode("utf-8"), + tokens, + llama_cpp.c_int(n_ctx), + llama_cpp.c_bool(True), + ) + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens = llama_cpp.llama_tokenize( + client.ctx, + b" " + prompt.encode("utf-8"), + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(True), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{prompt}" n_tokens={n_tokens}' + ) + 
input_ids = array("i", tokens[:n_tokens]) # type: array[int] + self.accept_settings( + prompt=prompt, prompt_tokens=len(input_ids), settings=settings + ) + yield from self._generate_text(client, input_ids, settings) + + def _generate_text( + self, + client: llama_cpp.Llama, + input_ids: "array[int]", + settings: TextGenerationSettings, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[llama_cpp.LlamaGrammar] = None, + ) -> Iterator[str]: + ctx = client.ctx + assert ctx is not None, "Llama context is not initialized" + verbose = self.llm_model.verbose + if verbose: + llama_cpp.llama_reset_timings(ctx) + + # Cache the variables frequently used in the loop + completion_status = self.completion_status[settings.completion_id] + generated_ids = array("i") # type: array[int] + byte_array = bytearray() # type: bytearray + eos_token = llama_cpp.llama_token_eos() + logprobs = settings.logprobs + text_buffer = "" # type: str + llama_token_to_str = llama_cpp.llama_token_to_str + llama_token = llama_cpp.llama_token + + if logprobs is not None and client.params.logits_all is False: + raise ValueError( + "logprobs is not supported for models " + "created with logits_all=False" + ) + + if client.cache: + _load_cache(client, client.cache, input_ids) + + for _, token_id in zip( + range(settings.max_tokens), + client.generate( + input_ids, + **{ + key: value + for key, value in { + **self.llm_model.asdict, + **{ + "temp": settings.temperature, + "stopping_criteria": stopping_criteria, + "logits_processor": logits_processor, + "grammar": grammar, + }, + }.items() + # Hacky way to pass arguments safely to older versions of llama-cpp-python + if key in signature(client.generate).parameters.keys() + }, + ), + ): + if self.is_interrupted or token_id == eos_token: + break + + # Update the generated id + generated_ids.append(token_id) + completion_status.generated_tokens += 1 + + piece = llama_token_to_str( + ctx, llama_token(token_id) + ) # type: bytes + try: + # Try to decode the token + text_to_yield = text_buffer + (byte_array + piece).decode() + byte_array.clear() + except UnicodeDecodeError: + # Multi-byte characters are not decoded correctly if partial + byte_array.extend(piece) + continue + + # Check if the decoded text contains any of the stop tokens. 
+ stop_status = self.stop_checker(text_to_yield) + if stop_status is None: # Good to go + text_buffer = "" # Clear the buffer + completion_status.generated_text += text_to_yield + yield text_to_yield + elif stop_status is True: # Contains any of the stop tokens + break # Stop generating + else: # Contains any piece of the stop tokens + text_buffer = text_to_yield # Save the buffer + + # End of the loop + if verbose: + llama_cpp.llama_print_timings(ctx) + if client.cache: + if verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + client.cache[input_ids + generated_ids] = client.save_state() + print("Llama._create_completion: cache saved", file=sys.stderr) + return + + +def _load_cache( + client: llama_cpp.Llama, cache: llama_cpp.BaseLlamaCache, ids: "array[int]" +) -> None: + try: + cache_item = cache[ids] + cache_prefix_len = client.longest_token_prefix( + cache_item.input_ids.tolist(), ids + ) + eval_prefix_len = client.longest_token_prefix( + client._input_ids.tolist(), ids + ) + if cache_prefix_len > eval_prefix_len: + client.load_state(cache_item) + if client.verbose: + print( + "Llama._create_completion: cache hit", + file=sys.stderr, + ) + except KeyError: + if client.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) + + +def _get_log_probs( + client: llama_cpp.Llama, + completion_status: "CompletionStatus", + prompt_tokens: int, + generated_ids: "array[int]", + generated_tokens: int, + logprobs: int, + token: int, +) -> CompletionLogprobs: + assert client.ctx is not None, "Llama context is not initialized" + token_str = client.detokenize([token]).decode("utf-8", errors="ignore") + text_offset = len(completion_status.input_text) + len( + completion_status.generated_text + ) + token_offset = prompt_tokens + generated_tokens + current_logprobs = client.logits_to_logprobs( + client.scores[: client.n_tokens, :][token_offset - 1, :].tolist() + ) + return { + "tokens": [ + client.detokenize([token]).decode("utf-8", errors="ignore") + ], + "text_offset": [text_offset], + "token_logprobs": [current_logprobs[int(token)]], + "top_logprobs": [ + { + **{ + client.detokenize([i]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in list( + sorted( + zip( + current_logprobs, + range(len(current_logprobs)), + ), + reverse=True, + ) + )[:logprobs] + }, + token_str: current_logprobs[int(token)], + } + ], + } From 6cea61ee258a947d0c2238d04aac5c3e7942f366 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sat, 19 Aug 2023 18:59:10 +0900 Subject: [PATCH 06/18] Removed TaskStatus and task_manager --- llama_api/mixins/completion.py | 4 + llama_api/modules/base.py | 8 +- llama_api/modules/llama_cpp.py | 7 +- llama_api/schemas/api.py | 4 + llama_api/server/pools/llama.py | 75 +++++----- llama_api/server/routers/v1.py | 238 +++++++++++++------------------- 6 files changed, 150 insertions(+), 186 deletions(-) diff --git a/llama_api/mixins/completion.py b/llama_api/mixins/completion.py index 312a4b0..26f03f9 100644 --- a/llama_api/mixins/completion.py +++ b/llama_api/mixins/completion.py @@ -1,5 +1,6 @@ from collections import defaultdict from dataclasses import dataclass, field +from time import time from typing import Dict, Literal, Optional from ..schemas.api import CompletionLogprobs, TextGenerationSettings @@ -7,6 +8,9 @@ @dataclass class CompletionStatus: + # These fields are automatically set + started_at: float = field(default_factory=time, init=False) + # These fields are set by `accept_settings` method. 
input_text: str = field(default="", init=False) input_tokens: int = field(default=0, init=False) diff --git a/llama_api/modules/base.py b/llama_api/modules/base.py index 27b41d1..df101f9 100644 --- a/llama_api/modules/base.py +++ b/llama_api/modules/base.py @@ -107,9 +107,7 @@ def generate_completion_with_streaming( ) -> Iterator[CompletionChunk]: """Generate a completion for a given prompt, yielding chunks of text as they are generated.""" - completion_id = settings.completion_id = ( - "chat" + settings.completion_id - ) + completion_id = settings.completion_id completion_status = self.completion_status[completion_id] model = self.model_name for token in self.generate_text(prompt=prompt, settings=settings): @@ -150,9 +148,7 @@ def generate_chat_completion( self, messages: List[APIChatMessage], settings: TextGenerationSettings ) -> ChatCompletion: """Generate a completion for a given prompt.""" - completion_id = settings.completion_id = ( - "chat" + settings.completion_id - ) + completion_id = settings.completion_id completion_status = self.completion_status[completion_id] deque( self.generate_text( diff --git a/llama_api/modules/llama_cpp.py b/llama_api/modules/llama_cpp.py index c58e743..0815e4d 100644 --- a/llama_api/modules/llama_cpp.py +++ b/llama_api/modules/llama_cpp.py @@ -3,8 +3,9 @@ import sys from array import array from inspect import signature -from typing import TYPE_CHECKING, Callable, Iterator, List, Optional, Union +from typing import Callable, Iterator, List, Optional, Union +from ..mixins.completion import CompletionStatus from ..schemas.api import ( ChatCompletionChunk, CompletionChunk, @@ -25,10 +26,6 @@ from repositories.llama_cpp import llama_cpp -if TYPE_CHECKING: - from llama_api.mixins.completion import CompletionStatus - - class StoppingCriteriaList(List[Callable[[List[int], List[float]], bool]]): def __call__(self, input_ids: List[int], logits: List[float]) -> bool: return any( diff --git a/llama_api/schemas/api.py b/llama_api/schemas/api.py index e052324..2f4e870 100644 --- a/llama_api/schemas/api.py +++ b/llama_api/schemas/api.py @@ -253,6 +253,10 @@ class Config: class CreateChatCompletionRequest(TextGenerationSettings): + completion_id: str = Field( + default_factory=lambda: f"chatcmpl-{str(uuid4())}", + description="The unique ID of the chat generation", + ) model: str = Field( default=..., description="The model to use for completion." 
) diff --git a/llama_api/server/pools/llama.py b/llama_api/server/pools/llama.py index c2db637..a7d741f 100644 --- a/llama_api/server/pools/llama.py +++ b/llama_api/server/pools/llama.py @@ -1,13 +1,16 @@ from collections import deque from contextlib import contextmanager +from dataclasses import dataclass, field from multiprocessing.dummy import current_process from os import getpid from queue import Queue from threading import Event -from typing import Deque, Dict, Iterator, List, Union +from time import time +from typing import Deque, Dict, Iterator, List, Optional, Union import model_definitions +from ...mixins.completion import CompletionStatus from ...modules.base import ( BaseCompletionGenerator, BaseEmbeddingGenerator, @@ -31,7 +34,6 @@ from ...utils.logger import ApiLogger from ...utils.system import free_memory_of_first_item_from_container - logger = ApiLogger(__name__) logger.info(f"🔧 {current_process()} is initiated with PID: {getpid()}") @@ -40,6 +42,12 @@ embedding_generators: Deque["BaseEmbeddingGenerator"] = deque(maxlen=1) +@dataclass +class EmbeddingStatus: + started_at: float = field(default_factory=time, init=False) + embedding: Optional[Embedding] = None + + def init() -> None: pass @@ -78,8 +86,7 @@ def get_model(model_name: str) -> "BaseLLMModel": llm_model, BaseLLMModel ), f"Not a LLM model: {model_name}" return llm_model - except Exception as e: - logger.error(e) + except Exception: raise ValueError(f"Model path does not exist: {model_name}") @@ -89,7 +96,7 @@ def get_completion_generator( CreateChatCompletionRequest, CreateEmbeddingRequest, ], -) -> "BaseCompletionGenerator": +) -> BaseCompletionGenerator: """Get a completion generator for the given model. If the model is not cached, create a new one. If the cache is full, delete the oldest completion generator.""" @@ -153,7 +160,7 @@ def get_completion_generator( def get_embedding_generator( body: CreateEmbeddingRequest, -) -> "BaseEmbeddingGenerator": +) -> BaseEmbeddingGenerator: """Get an embedding generator for the given model. If the model is not cached, create a new one. 
If the cache is full, delete the oldest completion generator.""" @@ -203,7 +210,7 @@ def generate_completion_chunks( body: Union[CreateChatCompletionRequest, CreateCompletionRequest], queue: Queue, interrupt_signal: Event, -) -> None: +) -> CompletionStatus: with queue_manager(queue=queue): with completion_generator_manager( body=body, interrupt_signal=interrupt_signal @@ -233,16 +240,17 @@ def iterator() -> ( for chunk in iterator(): if interrupt_signal.is_set(): - # If the event is set, it means the client has disconnected - return + # If the event is set, the client is disconnected + return cg.completion_status[body.completion_id] queue.put(chunk) + return cg.completion_status[body.completion_id] def generate_completion( body: Union[CreateChatCompletionRequest, CreateCompletionRequest], queue: Queue, interrupt_signal: Event, -) -> None: +) -> CompletionStatus: with queue_manager(queue=queue): with completion_generator_manager( body=body, interrupt_signal=interrupt_signal @@ -260,17 +268,18 @@ def generate_completion( settings=body, ) queue.put(completion) + return cg.completion_status[body.completion_id] def generate_embeddings( - body: CreateEmbeddingRequest, queue: Queue, interrupt_signal: Event -) -> None: + body: CreateEmbeddingRequest, queue: Queue +) -> EmbeddingStatus: + embedding_status = EmbeddingStatus() with queue_manager(queue=queue): try: llm_model = get_model(body.model) if not isinstance(llm_model, LlamaCppModel): raise NotImplementedError("Using non-llama-cpp model") - except Exception: # Embedding model from local # "intfloat/e5-large-v2", @@ -290,23 +299,21 @@ def generate_embeddings( context_length=512, batch=1000, ) - queue.put( - Embedding( - object="list", - data=[ - EmbeddingData( - index=embedding_idx, - object="embedding", - embedding=embedding, - ) - for embedding_idx, embedding in enumerate(embeddings) - ], - model=body.model, - usage=EmbeddingUsage( - prompt_tokens=-1, - total_tokens=-1, - ), - ) + embedding = Embedding( + object="list", + data=[ + EmbeddingData( + index=embedding_idx, + object="embedding", + embedding=embedding, + ) + for embedding_idx, embedding in enumerate(embeddings) + ], + model=body.model, + usage=EmbeddingUsage( + prompt_tokens=-1, + total_tokens=-1, + ), ) else: @@ -323,7 +330,9 @@ def generate_embeddings( completion_generator, lazy.LlamaCppCompletionGenerator ), f"Model {body.model} is not supported for llama.cpp embeddings." assert completion_generator.client, "Model not loaded yet." 
- queue.put( - completion_generator.client.create_embedding, - **body.model_dump(exclude={"user"}), + embedding = completion_generator.client.create_embedding( + **body.model_dump(exclude={"user"}) ) + queue.put(embedding) + embedding_status.embedding = embedding + return embedding_status diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index a9a4fba..0afa584 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -3,7 +3,7 @@ from asyncio import Task, create_task -from contextlib import asynccontextmanager, contextmanager +from contextlib import asynccontextmanager from dataclasses import dataclass, field from functools import partial from os import environ @@ -14,18 +14,16 @@ from typing import ( Any, AsyncGenerator, - Callable, Dict, - Generator, Iterator, List, + Literal, Optional, Tuple, Type, TypeVar, Union, ) -from typing_extensions import TypedDict from anyio import ( Semaphore, @@ -39,6 +37,7 @@ from orjson import OPT_INDENT_2, dumps from sse_starlette.sse import EventSourceResponse +from ...mixins.completion import CompletionStatus from ...schemas.api import ( ChatCompletion, ChatCompletionChunk, @@ -58,6 +57,7 @@ from ...utils.errors import RouteErrorHandler from ...utils.logger import ApiLogger, LoggingConfig from ..pools.llama import ( + EmbeddingStatus, generate_completion, generate_completion_chunks, generate_embeddings, @@ -77,16 +77,6 @@ T = TypeVar("T") -class TaskStatus(TypedDict): - """Completion status""" - - completion_tokens: int - started_at: float - interrupted: bool - embedding_chunks: Optional[int] - generated_text: str - - @dataclass class WixMetadata: """Worker index (wix) metadata""" @@ -179,31 +169,33 @@ async def get_event_publisher( CreateCompletionRequest, ], inner_send_chan: MemoryObjectSendStream[bytes], - task: "Task[None]", + task: "Task[CompletionStatus]", interrupt_signal: Event, iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], ) -> None: """Publish Server-Sent-Events (SSE) to the client""" - with task_manager( - body=body, - task=task, - interrupt_signal=interrupt_signal, - ) as task_status: - async with inner_send_chan: + is_interrupted = False # type: bool + async with inner_send_chan: + try: + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(b"data: " + dumps(chunk) + b"\n\n") + if await request.is_disconnected(): + raise get_cancelled_exc_class()() + await inner_send_chan.send(b"data: [DONE]\n\n") + except get_cancelled_exc_class(): + is_interrupted = True + with move_on_after(1, shield=True): + raise + finally: + # Cancel the producer task and set event, + # so the completion task can be stopped + interrupt_signal.set() + state = "Interrupted" if is_interrupted else "Completed" try: - async for chunk in iterate_in_threadpool(iterator): - task_status["completion_tokens"] += 1 - task_status["generated_text"] += get_text_from_chunk(chunk) - await inner_send_chan.send( - b"data: " + dumps(chunk) + b"\n\n" - ) - if await request.is_disconnected(): - raise get_cancelled_exc_class()() - await inner_send_chan.send(b"data: [DONE]\n\n") - except get_cancelled_exc_class(): - with move_on_after(1, shield=True): - task_status["interrupted"] = True - raise + status = await task + log_request_and_response(body, status, state) + finally: + task.cancel() def get_streaming_iterator( @@ -228,14 +220,46 @@ def log_request_and_response( CreateCompletionRequest, CreateEmbeddingRequest, ], - task_status: TaskStatus, + status: Union[CompletionStatus, 
EmbeddingStatus], + state: Literal["Completed", "Interrupted"], ) -> None: + """Log the request and response of the completion or embedding""" + elapsed_time = time() - status.started_at + log_messages: List[str] = [f"elapsed time: {elapsed_time: .1f}s"] body_without_prompt = body.model_dump( exclude={"prompt", "messages", "input"}, exclude_defaults=True, exclude_unset=True, exclude_none=True, ) + + # Log the embedding status + if isinstance(status, EmbeddingStatus) and isinstance( + body, CreateEmbeddingRequest + ): + embed_usage = { + "input_chars": len(body.input), + "embedding_chunks": len(status.embedding["data"]) + if status.embedding + else 0, + } + log_messages.append(f"embedding chunks: {embed_usage}") + embed_log = { + "request": body_without_prompt, + "input": body.input, + "embedding": status.embedding, + } + logger.info( + f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})" + ) + return chat_logger.info(dumps(embed_log, option=OPT_INDENT_2).decode()) + if not isinstance(status, CompletionStatus): + return + + # Log the completion status + tokens = status.generated_tokens + tokens_per_second = tokens / elapsed_time + log_messages.append(f"tokens: {tokens}({tokens_per_second: .1f}tok/s)") if isinstance(body, CreateChatCompletionRequest): chat_log = { "request": body_without_prompt, @@ -246,7 +270,7 @@ def log_request_and_response( + [ { "role": "assistant", - "content": task_status["generated_text"], + "content": status.generated_text, } ], } @@ -255,69 +279,15 @@ def log_request_and_response( "request": body_without_prompt, "prompt": { "user": body.prompt, - "assistant": task_status["generated_text"], + "assistant": status.generated_text, }, } else: - chat_log = { - "request": body_without_prompt, - "input": body.input, - "embedding": task_status["embedding_chunks"], - } + return + logger.info(f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})") chat_logger.info(dumps(chat_log, option=OPT_INDENT_2).decode()) -@contextmanager -def task_manager( - body: Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - CreateEmbeddingRequest, - ], - task: "Task[None]", - interrupt_signal: Event, -) -> Generator[TaskStatus, None, None]: - """Start the producer task and cancel it when the client disconnects. 
- Also, log the completion status.""" - task_status = TaskStatus( - completion_tokens=0, - started_at=time(), - interrupted=False, - embedding_chunks=None, - generated_text="", - ) - try: - logger.info(f"🦙 Handling request of {body.model}...") - yield task_status - finally: - # Cancel the producer task and set event, - # so the completion task can be stopped - task.cancel() - interrupt_signal.set() - - # Log the completion status - if task_status["interrupted"]: - status = "Interrupted" - else: - status = "Completed" - - elapsed_time = time() - task_status["started_at"] - basic_messages: List[str] = [f"elapsed time: {elapsed_time: .1f}s"] - if task_status["completion_tokens"]: - tokens = task_status["completion_tokens"] - tokens_per_second = tokens / elapsed_time - basic_messages.append( - f"tokens: {tokens}({tokens_per_second: .1f}tok/s)" - ) - if task_status["embedding_chunks"] is not None: - embedding_chunks = task_status["embedding_chunks"] - basic_messages.append(f"embedding chunks: {embedding_chunks}") - logger.info( - f"🦙 [{status} for {body.model}]: ({' | '.join(basic_messages)})" - ) - log_request_and_response(body=body, task_status=task_status) - - async def create_chat_completion_or_completion( request: Request, body: Union[CreateChatCompletionRequest, CreateCompletionRequest], @@ -328,21 +298,18 @@ async def create_chat_completion_or_completion( If streaming is enabled, then return an EventSourceResponse.""" async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() - producer: Callable[ - [ - Union[CreateChatCompletionRequest, CreateCompletionRequest], - Queue, - Event, - ], - None, - ] = partial( - generate_completion_chunks if body.stream else generate_completion, - body=body, - queue=queue, - interrupt_signal=interrupt_signal, - ) - task: "Task[None]" = create_task( - run_in_processpool_with_wix(producer, wix=wix) + task: "Task[CompletionStatus]" = create_task( + run_in_processpool_with_wix( + partial( + generate_completion_chunks + if body.stream + else generate_completion, + body=body, + queue=queue, + interrupt_signal=interrupt_signal, + ), + wix=wix, + ) ) if body.stream: send_chan, recv_chan = create_memory_object_stream(10) @@ -364,24 +331,16 @@ async def create_chat_completion_or_completion( ), ) else: - with task_manager( - body=body, - task=task, - interrupt_signal=interrupt_signal, - ) as task_status: - completion: Union[ - ChatCompletion, Completion - ] = validate_item_type( + # Cancel the producer task and set event, + # so the completion task can be stopped + try: + return validate_item_type( await run_in_threadpool(queue.get), type=dict, # type: ignore ) - task_status["completion_tokens"] = completion["usage"][ - "completion_tokens" - ] - task_status["generated_text"] = get_text_from_completion( - completion - ) - return completion + finally: + interrupt_signal.set() + log_request_and_response(body, await task, "Completed") @router.post("/chat/completions") @@ -409,29 +368,24 @@ async def create_embedding( assert body.model is not None, "Model is required" async with get_wix_with_semaphore(request, body.model) as wix: queue, interrupt_signal = get_queue_and_event() - producer: Callable[ - [CreateEmbeddingRequest, Queue, Event], - None, - ] = partial( - generate_embeddings, - body=body, - queue=queue, - interrupt_signal=interrupt_signal, - ) - task: "Task[None]" = create_task( - run_in_processpool_with_wix(producer, wix=wix) + task: Task["EmbeddingStatus"] = create_task( + run_in_processpool_with_wix( + 
partial( + generate_embeddings, + body=body, + queue=queue, + ), + wix=wix, + ) ) - with task_manager( - body=body, - task=task, - interrupt_signal=interrupt_signal, - ) as task_status: - embedding: Embedding = validate_item_type( + try: + return validate_item_type( await run_in_threadpool(queue.get), type=dict, # type: ignore ) - task_status["embedding_chunks"] = len(embedding["data"]) - return embedding + finally: + interrupt_signal.set() + log_request_and_response(body, await task, "Completed") @router.get("/models") From 7c1251c9fc5566afa00a00a5740c52f235b95f6c Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sat, 19 Aug 2023 22:16:57 +0900 Subject: [PATCH 07/18] Bump dependencies --- llama_api/modules/exllama.py | 4 +- llama_api/server/app_settings.py | 4 +- llama_api/server/routers/v1.py | 4 +- llama_api/shared/config.py | 8 +- llama_api/utils/process_pool.py | 8 +- poetry.lock | 356 ++++++++++++++++++------------- requirements.txt | 12 +- 7 files changed, 225 insertions(+), 171 deletions(-) diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index fc5cb29..9fafffa 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -154,12 +154,12 @@ def generate_text( with logger.log_any_error(): # Encode the prompt if settings.guidance_scale == 1: - ids = _encode(self.tokenizer, prompt) + ids = _encode(self.tokenizer, prompt or " ") mask = None # type: Optional[Tensor] else: ids, mask = _encode( self.tokenizer, - [prompt, settings.negative_prompt or ""], + [prompt or " ", settings.negative_prompt or ""], return_mask=True, ) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 4bb7891..4939932 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -77,6 +77,8 @@ def initialize_before_launch( """Initialize the app""" for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) + if environ.get("LLAMA_API_XFORMERS") == "1": + install_package("xformers") if install_packages: # Install all dependencies if not skip_compile: @@ -100,8 +102,6 @@ def initialize_before_launch( # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") - if environ.get("LLAMA_API_XFORMERS") == "1": - install_package("xformers") else: logger.warning( "🏃‍♂️ Skipping package installation... 
" diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 0afa584..7e579d0 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -2,7 +2,7 @@ Use same format as OpenAI API""" -from asyncio import Task, create_task +from asyncio import Task, create_task, wait_for from contextlib import asynccontextmanager from dataclasses import dataclass, field from functools import partial @@ -192,7 +192,7 @@ async def get_event_publisher( interrupt_signal.set() state = "Interrupted" if is_interrupted else "Completed" try: - status = await task + status = await wait_for(task, timeout=3) log_request_and_response(body, status, state) finally: task.cancel() diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 7c4c887..6469e76 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,6 +1,12 @@ from pathlib import Path from typing import Dict, List, Literal, Optional, Tuple -from typing_extensions import TypedDict + +try: + from typing_extensions import TypedDict + + +except ImportError: + from typing import TypedDict # When dependencies aren't installed yet class GitCloneArgs(TypedDict): diff --git a/llama_api/utils/process_pool.py b/llama_api/utils/process_pool.py index c3358d4..9b848e2 100644 --- a/llama_api/utils/process_pool.py +++ b/llama_api/utils/process_pool.py @@ -1,12 +1,12 @@ -from itertools import islice -from os import kill import pickle import queue -from signal import SIGINT import sys from concurrent.futures import Future from functools import partial +from itertools import islice from multiprocessing import Process, Queue, cpu_count +from os import kill +from signal import SIGINT from threading import Thread from time import sleep from traceback import format_exception @@ -23,7 +23,7 @@ Union, ) -from llama_api.utils.logger import ApiLogger +from ..utils.logger import ApiLogger if sys.version_info >= (3, 10): from typing import ParamSpec diff --git a/poetry.lock b/poetry.lock index a969ac9..fc82221 100644 --- a/poetry.lock +++ b/poetry.lock @@ -348,13 +348,13 @@ rapidfuzz = ">=2.2.0,<3.0.0" [[package]] name = "click" -version = "8.1.6" +version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"}, - {file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"}, + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, ] [package.dependencies] @@ -681,13 +681,13 @@ dev = ["flake8", "markdown", "twine", "wheel"] [[package]] name = "griffe" -version = "0.32.3" +version = "0.33.0" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." 
optional = false python-versions = ">=3.8" files = [ - {file = "griffe-0.32.3-py3-none-any.whl", hash = "sha256:d9471934225818bf8f309822f70451cc6abb4b24e59e0bb27402a45f9412510f"}, - {file = "griffe-0.32.3.tar.gz", hash = "sha256:14983896ad581f59d5ad7b6c9261ff12bdaa905acccc1129341d13e545da8521"}, + {file = "griffe-0.33.0-py3-none-any.whl", hash = "sha256:16af15d0140c0a5f0b2628d33235fef91a6d8a832dd7cff5759dfe6b7d7a7a49"}, + {file = "griffe-0.33.0.tar.gz", hash = "sha256:783bcb7e7f0d346fcb0cb8072667ca8f6ce7ff776bb278fbccf8a3a753a793e4"}, ] [package.dependencies] @@ -1283,17 +1283,17 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] [[package]] name = "mkdocstrings-python" -version = "1.3.0" +version = "1.4.0" description = "A Python handler for mkdocstrings." optional = false python-versions = ">=3.8" files = [ - {file = "mkdocstrings_python-1.3.0-py3-none-any.whl", hash = "sha256:36c224c86ab77e90e0edfc9fea3307f7d0d245dd7c28f48bbb2203cf6e125530"}, - {file = "mkdocstrings_python-1.3.0.tar.gz", hash = "sha256:f967f84bab530fcc13cc9c02eccf0c18bdb2c3bab5c55fa2045938681eec4fc4"}, + {file = "mkdocstrings_python-1.4.0-py3-none-any.whl", hash = "sha256:46f4b0ed8540c6bfd0c3f50471831a7bdb9a1bf35f24400525721d7555aa355c"}, + {file = "mkdocstrings_python-1.4.0.tar.gz", hash = "sha256:c92304c402928a05c793203dadee7a1a51b5ae56404fd594d0b2db49a7b3957a"}, ] [package.dependencies] -griffe = ">=0.30,<0.33" +griffe = ">=0.33" mkdocstrings = ">=0.20" [[package]] @@ -1458,28 +1458,71 @@ files = [ [[package]] name = "orjson" -version = "3.9.4" +version = "3.9.5" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.7" files = [ - {file = "orjson-3.9.4-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2e83ec1ee66d83b558a6d273d8a01b86563daa60bea9bc040e2c1cb8008de61f"}, - {file = "orjson-3.9.4-cp310-none-win32.whl", hash = "sha256:04cd7f4a4f4cd2fe43d104eb70e7435c6fcbdde7aa0cde4230e444fbc66924d3"}, - {file = "orjson-3.9.4-cp310-none-win_amd64.whl", hash = "sha256:4fdb59cfa00e10c82e09d1c32a9ce08a38bd29496ba20a73cd7f498e3a0a5024"}, - {file = "orjson-3.9.4-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:daeed2502ddf1f2b29ec8da2fe2ea82807a5c4acf869608ce6c476db8171d070"}, - {file = "orjson-3.9.4-cp311-none-win32.whl", hash = "sha256:e12492ce65cb10f385e70a88badc6046bc720fa7d468db27b7429d85d41beaeb"}, - {file = "orjson-3.9.4-cp311-none-win_amd64.whl", hash = "sha256:3b9f8bf43a5367d5522f80e7d533c98d880868cd0b640b9088c9237306eca6e8"}, - {file = "orjson-3.9.4-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:0b400cf89c15958cd829c8a4ade8f5dd73588e63d2fb71a00483e7a74e9f92da"}, - {file = "orjson-3.9.4-cp312-none-win_amd64.whl", hash = "sha256:a533e664a0e3904307d662c5d45775544dc2b38df6e39e213ff6a86ceaa3d53c"}, - {file = "orjson-3.9.4-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:149d1b7630771222f73ecb024ab5dd8e7f41502402b02015494d429bacc4d5c1"}, - {file = "orjson-3.9.4-cp37-none-win32.whl", hash = "sha256:bcda6179eb863c295eb5ea832676d33ef12c04d227b4c98267876c8322e5a96e"}, - {file = "orjson-3.9.4-cp37-none-win_amd64.whl", hash = "sha256:3d947366127abef192419257eb7db7fcee0841ced2b49ccceba43b65e9ce5e3f"}, - {file = "orjson-3.9.4-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:a7d029fc34a516f7eae29b778b30371fcb621134b2acfe4c51c785102aefc6cf"}, - {file = "orjson-3.9.4-cp38-none-win32.whl", hash = "sha256:94d15ee45c2aaed334688e511aa73b4681f7c08a0810884c6b3ae5824dea1222"}, - {file = "orjson-3.9.4-cp38-none-win_amd64.whl", hash = "sha256:336ec8471102851f0699198031924617b7a77baadea889df3ffda6000bd59f4c"}, - {file = "orjson-3.9.4-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2f57ccb50e9e123709e9f2d7b1a9e09e694e49d1fa5c5585e34b8e3f01929dc3"}, - {file = "orjson-3.9.4-cp39-none-win32.whl", hash = "sha256:b5b5038187b74e2d33e5caee8a7e83ddeb6a21da86837fa2aac95c69aeb366e6"}, - {file = "orjson-3.9.4-cp39-none-win_amd64.whl", hash = "sha256:915da36bc93ef0c659fa50fe7939d4f208804ad252fc4fc8d55adbbb82293c48"}, + {file = "orjson-3.9.5-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:ad6845912a71adcc65df7c8a7f2155eba2096cf03ad2c061c93857de70d699ad"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e298e0aacfcc14ef4476c3f409e85475031de24e5b23605a465e9bf4b2156273"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83c9939073281ef7dd7c5ca7f54cceccb840b440cec4b8a326bda507ff88a0a6"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e174cc579904a48ee1ea3acb7045e8a6c5d52c17688dfcb00e0e842ec378cabf"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f8d51702f42c785b115401e1d64a27a2ea767ae7cf1fb8edaa09c7cf1571c660"}, + {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f13d61c0c7414ddee1ef4d0f303e2222f8cced5a2e26d9774751aecd72324c9e"}, + {file = "orjson-3.9.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d748cc48caf5a91c883d306ab648df1b29e16b488c9316852844dd0fd000d1c2"}, + {file = "orjson-3.9.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bd19bc08fa023e4c2cbf8294ad3f2b8922f4de9ba088dbc71e6b268fdf54591c"}, + {file = "orjson-3.9.5-cp310-none-win32.whl", hash = "sha256:5793a21a21bf34e1767e3d61a778a25feea8476dcc0bdf0ae1bc506dc34561ea"}, + {file = "orjson-3.9.5-cp310-none-win_amd64.whl", hash = "sha256:2bcec0b1024d0031ab3eab7a8cb260c8a4e4a5e35993878a2da639d69cdf6a65"}, + {file = "orjson-3.9.5-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8547b95ca0e2abd17e1471973e6d676f1d8acedd5f8fb4f739e0612651602d66"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87ce174d6a38d12b3327f76145acbd26f7bc808b2b458f61e94d83cd0ebb4d76"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a960bb1bc9a964d16fcc2d4af5a04ce5e4dfddca84e3060c35720d0a062064fe"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a7aa5573a949760d6161d826d34dc36db6011926f836851fe9ccb55b5a7d8e8"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b2852afca17d7eea85f8e200d324e38c851c96598ac7b227e4f6c4e59fbd3df"}, + {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa185959c082475288da90f996a82e05e0c437216b96f2a8111caeb1d54ef926"}, + {file = "orjson-3.9.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:89c9332695b838438ea4b9a482bce8ffbfddde4df92750522d928fb00b7b8dce"}, + {file 
= "orjson-3.9.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2493f1351a8f0611bc26e2d3d407efb873032b4f6b8926fed8cfed39210ca4ba"}, + {file = "orjson-3.9.5-cp311-none-win32.whl", hash = "sha256:ffc544e0e24e9ae69301b9a79df87a971fa5d1c20a6b18dca885699709d01be0"}, + {file = "orjson-3.9.5-cp311-none-win_amd64.whl", hash = "sha256:89670fe2732e3c0c54406f77cad1765c4c582f67b915c74fda742286809a0cdc"}, + {file = "orjson-3.9.5-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:15df211469625fa27eced4aa08dc03e35f99c57d45a33855cc35f218ea4071b8"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9f17c59fe6c02bc5f89ad29edb0253d3059fe8ba64806d789af89a45c35269a"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca6b96659c7690773d8cebb6115c631f4a259a611788463e9c41e74fa53bf33f"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a26fafe966e9195b149950334bdbe9026eca17fe8ffe2d8fa87fdc30ca925d30"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9006b1eb645ecf460da067e2dd17768ccbb8f39b01815a571bfcfab7e8da5e52"}, + {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebfdbf695734b1785e792a1315e41835ddf2a3e907ca0e1c87a53f23006ce01d"}, + {file = "orjson-3.9.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4a3943234342ab37d9ed78fb0a8f81cd4b9532f67bf2ac0d3aa45fa3f0a339f3"}, + {file = "orjson-3.9.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e6762755470b5c82f07b96b934af32e4d77395a11768b964aaa5eb092817bc31"}, + {file = "orjson-3.9.5-cp312-none-win_amd64.whl", hash = "sha256:c74df28749c076fd6e2157190df23d43d42b2c83e09d79b51694ee7315374ad5"}, + {file = "orjson-3.9.5-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:88e18a74d916b74f00d0978d84e365c6bf0e7ab846792efa15756b5fb2f7d49d"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d28514b5b6dfaf69097be70d0cf4f1407ec29d0f93e0b4131bf9cc8fd3f3e374"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b81aca8c7be61e2566246b6a0ca49f8aece70dd3f38c7f5c837f398c4cb142"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:385c1c713b1e47fd92e96cf55fd88650ac6dfa0b997e8aa7ecffd8b5865078b1"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9850c03a8e42fba1a508466e6a0f99472fd2b4a5f30235ea49b2a1b32c04c11"}, + {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4449f84bbb13bcef493d8aa669feadfced0f7c5eea2d0d88b5cc21f812183af8"}, + {file = "orjson-3.9.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:86127bf194f3b873135e44ce5dc9212cb152b7e06798d5667a898a00f0519be4"}, + {file = "orjson-3.9.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0abcd039f05ae9ab5b0ff11624d0b9e54376253b7d3217a358d09c3edf1d36f7"}, + {file = "orjson-3.9.5-cp37-none-win32.whl", hash = "sha256:10cc8ad5ff7188efcb4bec196009d61ce525a4e09488e6d5db41218c7fe4f001"}, + {file = "orjson-3.9.5-cp37-none-win_amd64.whl", hash = "sha256:ff27e98532cb87379d1a585837d59b187907228268e7b0a87abe122b2be6968e"}, + {file = "orjson-3.9.5-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:5bfa79916ef5fef75ad1f377e54a167f0de334c1fa4ebb8d0224075f3ec3d8c0"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e87dfa6ac0dae764371ab19b35eaaa46dfcb6ef2545dfca03064f21f5d08239f"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:50ced24a7b23058b469ecdb96e36607fc611cbaee38b58e62a55c80d1b3ad4e1"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1b74ea2a3064e1375da87788897935832e806cc784de3e789fd3c4ab8eb3fa5"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7cb961efe013606913d05609f014ad43edfaced82a576e8b520a5574ce3b2b9"}, + {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1225d2d5ee76a786bda02f8c5e15017462f8432bb960de13d7c2619dba6f0275"}, + {file = "orjson-3.9.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f39f4b99199df05c7ecdd006086259ed25886cdbd7b14c8cdb10c7675cfcca7d"}, + {file = "orjson-3.9.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a461dc9fb60cac44f2d3218c36a0c1c01132314839a0e229d7fb1bba69b810d8"}, + {file = "orjson-3.9.5-cp38-none-win32.whl", hash = "sha256:dedf1a6173748202df223aea29de814b5836732a176b33501375c66f6ab7d822"}, + {file = "orjson-3.9.5-cp38-none-win_amd64.whl", hash = "sha256:fa504082f53efcbacb9087cc8676c163237beb6e999d43e72acb4bb6f0db11e6"}, + {file = "orjson-3.9.5-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6900f0248edc1bec2a2a3095a78a7e3ef4e63f60f8ddc583687eed162eedfd69"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17404333c40047888ac40bd8c4d49752a787e0a946e728a4e5723f111b6e55a5"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0eefb7cfdd9c2bc65f19f974a5d1dfecbac711dae91ed635820c6b12da7a3c11"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:68c78b2a3718892dc018adbc62e8bab6ef3c0d811816d21e6973dee0ca30c152"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:591ad7d9e4a9f9b104486ad5d88658c79ba29b66c5557ef9edf8ca877a3f8d11"}, + {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6cc2cbf302fbb2d0b2c3c142a663d028873232a434d89ce1b2604ebe5cc93ce8"}, + {file = "orjson-3.9.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b26b5aa5e9ee1bad2795b925b3adb1b1b34122cb977f30d89e0a1b3f24d18450"}, + {file = "orjson-3.9.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ef84724f7d29dcfe3aafb1fc5fc7788dca63e8ae626bb9298022866146091a3e"}, + {file = "orjson-3.9.5-cp39-none-win32.whl", hash = "sha256:664cff27f85939059472afd39acff152fbac9a091b7137092cb651cf5f7747b5"}, + {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, + {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] [[package]] @@ -1642,24 +1685,24 @@ poetry-core = ">=1.6.0,<2.0.0" [[package]] name = "protobuf" -version = "4.24.0" +version = "4.24.1" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "protobuf-4.24.0-cp310-abi3-win32.whl", hash = "sha256:81cb9c4621d2abfe181154354f63af1c41b00a4882fb230b4425cbaed65e8f52"}, - {file = "protobuf-4.24.0-cp310-abi3-win_amd64.whl", hash = 
"sha256:6c817cf4a26334625a1904b38523d1b343ff8b637d75d2c8790189a4064e51c3"}, - {file = "protobuf-4.24.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:ae97b5de10f25b7a443b40427033e545a32b0e9dda17bcd8330d70033379b3e5"}, - {file = "protobuf-4.24.0-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:567fe6b0647494845d0849e3d5b260bfdd75692bf452cdc9cb660d12457c055d"}, - {file = "protobuf-4.24.0-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:a6b1ca92ccabfd9903c0c7dde8876221dc7d8d87ad5c42e095cc11b15d3569c7"}, - {file = "protobuf-4.24.0-cp37-cp37m-win32.whl", hash = "sha256:a38400a692fd0c6944c3c58837d112f135eb1ed6cdad5ca6c5763336e74f1a04"}, - {file = "protobuf-4.24.0-cp37-cp37m-win_amd64.whl", hash = "sha256:5ab19ee50037d4b663c02218a811a5e1e7bb30940c79aac385b96e7a4f9daa61"}, - {file = "protobuf-4.24.0-cp38-cp38-win32.whl", hash = "sha256:e8834ef0b4c88666ebb7c7ec18045aa0f4325481d724daa624a4cf9f28134653"}, - {file = "protobuf-4.24.0-cp38-cp38-win_amd64.whl", hash = "sha256:8bb52a2be32db82ddc623aefcedfe1e0eb51da60e18fcc908fb8885c81d72109"}, - {file = "protobuf-4.24.0-cp39-cp39-win32.whl", hash = "sha256:ae7a1835721086013de193311df858bc12cd247abe4ef9710b715d930b95b33e"}, - {file = "protobuf-4.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:44825e963008f8ea0d26c51911c30d3e82e122997c3c4568fd0385dd7bacaedf"}, - {file = "protobuf-4.24.0-py3-none-any.whl", hash = "sha256:82e6e9ebdd15b8200e8423676eab38b774624d6a1ad696a60d86a2ac93f18201"}, - {file = "protobuf-4.24.0.tar.gz", hash = "sha256:5d0ceb9de6e08311832169e601d1fc71bd8e8c779f3ee38a97a78554945ecb85"}, + {file = "protobuf-4.24.1-cp310-abi3-win32.whl", hash = "sha256:d414199ca605eeb498adc4d2ba82aedc0379dca4a7c364ff9bc9a179aa28e71b"}, + {file = "protobuf-4.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:5906c5e79ff50fe38b2d49d37db5874e3c8010826f2362f79996d83128a8ed9b"}, + {file = "protobuf-4.24.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:970c701ee16788d74f3de20938520d7a0aebc7e4fff37096a48804c80d2908cf"}, + {file = "protobuf-4.24.1-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:fc361148e902949dcb953bbcb148c99fe8f8854291ad01107e4120361849fd0e"}, + {file = "protobuf-4.24.1-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:5d32363d14aca6e5c9e9d5918ad8fb65b091b6df66740ae9de50ac3916055e43"}, + {file = "protobuf-4.24.1-cp37-cp37m-win32.whl", hash = "sha256:df015c47d6855b8efa0b9be706c70bf7f050a4d5ac6d37fb043fbd95157a0e25"}, + {file = "protobuf-4.24.1-cp37-cp37m-win_amd64.whl", hash = "sha256:d4af4fd9e9418e819be30f8df2a16e72fbad546a7576ac7f3653be92a6966d30"}, + {file = "protobuf-4.24.1-cp38-cp38-win32.whl", hash = "sha256:302e8752c760549ed4c7a508abc86b25d46553c81989343782809e1a062a2ef9"}, + {file = "protobuf-4.24.1-cp38-cp38-win_amd64.whl", hash = "sha256:06437f0d4bb0d5f29e3d392aba69600188d4be5ad1e0a3370e581a9bf75a3081"}, + {file = "protobuf-4.24.1-cp39-cp39-win32.whl", hash = "sha256:0b2b224e9541fe9f046dd7317d05f08769c332b7e4c54d93c7f0f372dedb0b1a"}, + {file = "protobuf-4.24.1-cp39-cp39-win_amd64.whl", hash = "sha256:bd39b9094a4cc003a1f911b847ab379f89059f478c0b611ba1215053e295132e"}, + {file = "protobuf-4.24.1-py3-none-any.whl", hash = "sha256:55dd644adc27d2a624339332755fe077c7f26971045b469ebb9732a69ce1f2ca"}, + {file = "protobuf-4.24.1.tar.gz", hash = "sha256:44837a5ed9c9418ad5d502f89f28ba102e9cd172b6668bc813f21716f9273348"}, ] [[package]] @@ -1723,18 +1766,18 @@ files = [ [[package]] name = "pydantic" -version = "2.1.1" +version = "2.2.1" description = "Data validation using Python type hints" optional = false python-versions = 
">=3.7" files = [ - {file = "pydantic-2.1.1-py3-none-any.whl", hash = "sha256:43bdbf359d6304c57afda15c2b95797295b702948082d4c23851ce752f21da70"}, - {file = "pydantic-2.1.1.tar.gz", hash = "sha256:22d63db5ce4831afd16e7c58b3192d3faf8f79154980d9397d9867254310ba4b"}, + {file = "pydantic-2.2.1-py3-none-any.whl", hash = "sha256:0c88bd2b63ed7a5109c75ab180d55f58f80a4b559682406812d0684d3f4b9192"}, + {file = "pydantic-2.2.1.tar.gz", hash = "sha256:31b5cada74b2320999fb2577e6df80332a200ff92e7775a52448b6b036fce24a"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.4.0" +pydantic-core = "2.6.1" typing-extensions = ">=4.6.1" [package.extras] @@ -1742,112 +1785,117 @@ email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.4.0" +version = "2.6.1" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic_core-2.4.0-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:2ca4687dd996bde7f3c420def450797feeb20dcee2b9687023e3323c73fc14a2"}, - {file = "pydantic_core-2.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:782fced7d61469fd1231b184a80e4f2fa7ad54cd7173834651a453f96f29d673"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6213b471b68146af97b8551294e59e7392c2117e28ffad9c557c65087f4baee3"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63797499a219d8e81eb4e0c42222d0a4c8ec896f5c76751d4258af95de41fdf1"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_24_armv7l.whl", hash = "sha256:0455876d575a35defc4da7e0a199596d6c773e20d3d42fa1fc29f6aa640369ed"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:8c938c96294d983dcf419b54dba2d21056959c22911d41788efbf949a29ae30d"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_24_s390x.whl", hash = "sha256:878a5017d93e776c379af4e7b20f173c82594d94fa073059bcc546789ad50bf8"}, - {file = "pydantic_core-2.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69159afc2f2dc43285725f16143bc5df3c853bc1cb7df6021fce7ef1c69e8171"}, - {file = "pydantic_core-2.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:54df7df399b777c1fd144f541c95d351b3aa110535a6810a6a569905d106b6f3"}, - {file = "pydantic_core-2.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e412607ca89a0ced10758dfb8f9adcc365ce4c1c377e637c01989a75e9a9ec8a"}, - {file = "pydantic_core-2.4.0-cp310-none-win32.whl", hash = "sha256:853f103e2b9a58832fdd08a587a51de8b552ae90e1a5d167f316b7eabf8d7dde"}, - {file = "pydantic_core-2.4.0-cp310-none-win_amd64.whl", hash = "sha256:3ba2c9c94a9176f6321a879c8b864d7c5b12d34f549a4c216c72ce213d7d953c"}, - {file = "pydantic_core-2.4.0-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:a8b7acd04896e8f161e1500dc5f218017db05c1d322f054e89cbd089ce5d0071"}, - {file = "pydantic_core-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16468bd074fa4567592d3255bf25528ed41e6b616d69bf07096bdb5b66f947d1"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cba5ad5eef02c86a1f3da00544cbc59a510d596b27566479a7cd4d91c6187a11"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7206e41e04b443016e930e01685bab7a308113c0b251b3f906942c8d4b48fcb"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_24_armv7l.whl", hash = "sha256:c1375025f0bfc9155286ebae8eecc65e33e494c90025cda69e247c3ccd2bab00"}, - {file = 
"pydantic_core-2.4.0-cp311-cp311-manylinux_2_24_ppc64le.whl", hash = "sha256:3534118289e33130ed3f1cc487002e8d09b9f359be48b02e9cd3de58ce58fba9"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_24_s390x.whl", hash = "sha256:94d2b36a74623caab262bf95f0e365c2c058396082bd9d6a9e825657d0c1e7fa"}, - {file = "pydantic_core-2.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:af24ad4fbaa5e4a2000beae0c3b7fd1c78d7819ab90f9370a1cfd8998e3f8a3c"}, - {file = "pydantic_core-2.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bf10963d8aed8bbe0165b41797c9463d4c5c8788ae6a77c68427569be6bead41"}, - {file = "pydantic_core-2.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:68199ada7c310ddb8c76efbb606a0de656b40899388a7498954f423e03fc38be"}, - {file = "pydantic_core-2.4.0-cp311-none-win32.whl", hash = "sha256:6f855bcc96ed3dd56da7373cfcc9dcbabbc2073cac7f65c185772d08884790ce"}, - {file = "pydantic_core-2.4.0-cp311-none-win_amd64.whl", hash = "sha256:de39eb3bab93a99ddda1ac1b9aa331b944d8bcc4aa9141148f7fd8ee0299dafc"}, - {file = "pydantic_core-2.4.0-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:f773b39780323a0499b53ebd91a28ad11cde6705605d98d999dfa08624caf064"}, - {file = "pydantic_core-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a297c0d6c61963c5c3726840677b798ca5b7dfc71bc9c02b9a4af11d23236008"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:546064c55264156b973b5e65e5fafbe5e62390902ce3cf6b4005765505e8ff56"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36ba9e728588588f0196deaf6751b9222492331b5552f865a8ff120869d372e0"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_24_armv7l.whl", hash = "sha256:57a53a75010c635b3ad6499e7721eaa3b450e03f6862afe2dbef9c8f66e46ec8"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_24_ppc64le.whl", hash = "sha256:4b262bbc13022f2097c48a21adcc360a81d83dc1d854c11b94953cd46d7d3c07"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_24_s390x.whl", hash = "sha256:01947ad728f426fa07fcb26457ebf90ce29320259938414bc0edd1476e75addb"}, - {file = "pydantic_core-2.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b2799c2eaf182769889761d4fb4d78b82bc47dae833799fedbf69fc7de306faa"}, - {file = "pydantic_core-2.4.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a08fd490ba36d1fbb2cd5dcdcfb9f3892deb93bd53456724389135712b5fc735"}, - {file = "pydantic_core-2.4.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1e8a7c62d15a5c4b307271e4252d76ebb981d6251c6ecea4daf203ef0179ea4f"}, - {file = "pydantic_core-2.4.0-cp312-none-win32.whl", hash = "sha256:9206c14a67c38de7b916e486ae280017cf394fa4b1aa95cfe88621a4e1d79725"}, - {file = "pydantic_core-2.4.0-cp312-none-win_amd64.whl", hash = "sha256:884235507549a6b2d3c4113fb1877ae263109e787d9e0eb25c35982ab28d0399"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:4cbe929efa77a806e8f1a97793f2dc3ea3475ae21a9ed0f37c21320fe93f6f50"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:9137289de8fe845c246a8c3482dd0cb40338846ba683756d8f489a4bd8fddcae"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5d8e764b5646623e57575f624f8ebb8f7a9f7fd1fae682ef87869ca5fec8dcf"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8fba0aff4c407d0274e43697e785bcac155ad962be57518d1c711f45e72da70f"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_24_armv7l.whl", hash = "sha256:30527d173e826f2f7651f91c821e337073df1555e3b5a0b7b1e2c39e26e50678"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:bd7d1dde70ff3e09e4bc7a1cbb91a7a538add291bfd5b3e70ef1e7b45192440f"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_24_s390x.whl", hash = "sha256:72f1216ca8cef7b8adacd4c4c6b89c3b0c4f97503197f5284c80f36d6e4edd30"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b013c7861a7c7bfcec48fd709513fea6f9f31727e7a0a93ca0dd12e056740717"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:478f5f6d7e32bd4a04d102160efb2d389432ecf095fe87c555c0a6fc4adfc1a4"}, - {file = "pydantic_core-2.4.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d9610b47b5fe4aacbbba6a9cb5f12cbe864eec99dbfed5710bd32ef5dd8a5d5b"}, - {file = "pydantic_core-2.4.0-cp37-none-win32.whl", hash = "sha256:ff246c0111076c8022f9ba325c294f2cb5983403506989253e04dbae565e019b"}, - {file = "pydantic_core-2.4.0-cp37-none-win_amd64.whl", hash = "sha256:d0c2b713464a8e263a243ae7980d81ce2de5ac59a9f798a282e44350b42dc516"}, - {file = "pydantic_core-2.4.0-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:12ef6838245569fd60a179fade81ca4b90ae2fa0ef355d616f519f7bb27582db"}, - {file = "pydantic_core-2.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49db206eb8fdc4b4f30e6e3e410584146d813c151928f94ec0db06c4f2595538"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a507d7fa44688bbac76af6521e488b3da93de155b9cba6f2c9b7833ce243d59"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffe18407a4d000c568182ce5388bbbedeb099896904e43fc14eee76cfae6dec5"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_24_armv7l.whl", hash = "sha256:fa8e48001b39d54d97d7b380a0669fa99fc0feeb972e35a2d677ba59164a9a22"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:394f12a2671ff8c4dfa2e85be6c08be0651ad85bc1e6aa9c77c21671baaf28cd"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_24_s390x.whl", hash = "sha256:2f9ea0355f90db2a76af530245fa42f04d98f752a1236ed7c6809ec484560d5b"}, - {file = "pydantic_core-2.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:61d4e713f467abcdd59b47665d488bb898ad3dd47ce7446522a50e0cbd8e8279"}, - {file = "pydantic_core-2.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:453862ab268f6326b01f067ed89cb3a527d34dc46f6f4eeec46a15bbc706d0da"}, - {file = "pydantic_core-2.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:56a85fa0dab1567bd0cac10f0c3837b03e8a0d939e6a8061a3a420acd97e9421"}, - {file = "pydantic_core-2.4.0-cp38-none-win32.whl", hash = "sha256:0d726108c1c0380b88b6dd4db559f0280e0ceda9e077f46ff90bc85cd4d03e77"}, - {file = "pydantic_core-2.4.0-cp38-none-win_amd64.whl", hash = "sha256:047580388644c473b934d27849f8ed8dbe45df0adb72104e78b543e13bf69762"}, - {file = "pydantic_core-2.4.0-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:867d3eea954bea807cabba83cfc939c889a18576d66d197c60025b15269d7cc0"}, - {file = "pydantic_core-2.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:664402ef0c238a7f8a46efb101789d5f2275600fb18114446efec83cfadb5b66"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:64e8012ad60a5f0da09ed48725e6e923d1be25f2f091a640af6079f874663813"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac2b680de398f293b68183317432b3d67ab3faeba216aec18de0c395cb5e3060"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_24_armv7l.whl", hash = "sha256:8efc1be43b036c2b6bcfb1451df24ee0ddcf69c31351003daf2699ed93f5687b"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:d93aedbc4614cc21b9ab0d0c4ccd7143354c1f7cffbbe96ae5216ad21d1b21b5"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_24_s390x.whl", hash = "sha256:af788b64e13d52fc3600a68b16d31fa8d8573e3ff2fc9a38f8a60b8d94d1f012"}, - {file = "pydantic_core-2.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97c6349c81cee2e69ef59eba6e6c08c5936e6b01c2d50b9e4ac152217845ae09"}, - {file = "pydantic_core-2.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc086ddb6dc654a15deeed1d1f2bcb1cb924ebd70df9dca738af19f64229b06c"}, - {file = "pydantic_core-2.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e953353180bec330c3b830891d260b6f8e576e2d18db3c78d314e56bb2276066"}, - {file = "pydantic_core-2.4.0-cp39-none-win32.whl", hash = "sha256:6feb4b64d11d5420e517910d60a907d08d846cacaf4e029668725cd21d16743c"}, - {file = "pydantic_core-2.4.0-cp39-none-win_amd64.whl", hash = "sha256:153a61ac4030fa019b70b31fb7986461119230d3ba0ab661c757cfea652f4332"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:3fcf529382b282a30b466bd7af05be28e22aa620e016135ac414f14e1ee6b9e1"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2edef05b63d82568b877002dc4cb5cc18f8929b59077120192df1e03e0c633f8"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da055a1b0bfa8041bb2ff586b2cb0353ed03944a3472186a02cc44a557a0e661"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:77dadc764cf7c5405e04866181c5bd94a447372a9763e473abb63d1dfe9b7387"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a4ea23b07f29487a7bef2a869f68c7ee0e05424d81375ce3d3de829314c6b5ec"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:382f0baa044d674ad59455a5eff83d7965572b745cc72df35c52c2ce8c731d37"}, - {file = "pydantic_core-2.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:08f89697625e453421401c7f661b9d1eb4c9e4c0a12fd256eeb55b06994ac6af"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:43a405ce520b45941df9ff55d0cd09762017756a7b413bbad3a6e8178e64a2c2"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:584a7a818c84767af16ce8bda5d4f7fedb37d3d231fc89928a192f567e4ef685"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:04922fea7b13cd480586fa106345fe06e43220b8327358873c22d8dfa7a711c7"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17156abac20a9feed10feec867fddd91a80819a485b0107fe61f09f2117fe5f3"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e562cc63b04636cde361fd47569162f1daa94c759220ff202a8129902229114"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:90f3785146f701e053bb6b9e8f53acce2c919aca91df88bd4975be0cb926eb41"}, - {file = "pydantic_core-2.4.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:e40b1e97edd3dc127aa53d8a5e539a3d0c227d71574d3f9ac1af02d58218a122"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:b27f3e67f6e031f6620655741b7d0d6bebea8b25d415924b3e8bfef2dd7bd841"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be86c2eb12fb0f846262ace9d8f032dc6978b8cb26a058920ecb723dbcb87d05"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4665f7ed345012a8d2eddf4203ef145f5f56a291d010382d235b94e91813f88a"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:79262be5a292d1df060f29b9a7cdd66934801f987a817632d7552534a172709a"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:5fd905a69ac74eaba5041e21a1e8b1a479dab2b41c93bdcc4c1cede3c12a8d86"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:2ad538b7e07343001934417cdc8584623b4d8823c5b8b258e75ec8d327cec969"}, - {file = "pydantic_core-2.4.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:dd2429f7635ad4857b5881503f9c310be7761dc681c467a9d27787b674d1250a"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:efff8b6761a1f6e45cebd1b7a6406eb2723d2d5710ff0d1b624fe11313693989"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32a1e0352558cd7ccc014ffe818c7d87b15ec6145875e2cc5fa4bb7351a1033d"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a027f41c5008571314861744d83aff75a34cf3a07022e0be32b214a5bc93f7f1"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1927f0e15d190f11f0b8344373731e28fd774c6d676d8a6cfadc95c77214a48b"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:7aa82d483d5fb867d4fb10a138ffd57b0f1644e99f2f4f336e48790ada9ada5e"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b85778308bf945e9b33ac604e6793df9b07933108d20bdf53811bc7c2798a4af"}, - {file = "pydantic_core-2.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3ded19dcaefe2f6706d81e0db787b59095f4ad0fbadce1edffdf092294c8a23f"}, - {file = "pydantic_core-2.4.0.tar.gz", hash = "sha256:ec3473c9789cc00c7260d840c3db2c16dbfc816ca70ec87a00cddfa3e1a1cdd5"}, + {file = "pydantic_core-2.6.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:f55001a689111a297c0006c46c0589cfd559261baaa9a37bc35eff05b8cae1a6"}, + {file = "pydantic_core-2.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bb6273068e9450c5c91f58dd277fbd406b896ffa30f0ef312edc5519d07f16ae"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:043212f21c75cb6ee3a92fffbc747410e32b08e1a419ce16a9da98a16d660a7c"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:db0c12f1e9d3bf658634621f3423486803d749fef77a64cfb4252f9d619e1817"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:81424dc05c4342a19fb64323bb9d4468e7407b745c00377ccc4d3dd96d5e02fe"}, + {file = 
"pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c8f3aebaf92f088b1dafd7101d1ccca0459ae0f5b26017411b9969667d289a9"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd9f14454b4bc89c705ce17951f9c783db82efd2b44a424487c593e2269eef61"}, + {file = "pydantic_core-2.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2effc71653247e76c5b95d15c58d4ca3f591f42f714eb3b32df9d6ec613794a5"}, + {file = "pydantic_core-2.6.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:56672429f8a89d2a0f4402d912f0dad68c2d05f7c278d3152c6fb4a76c2a429a"}, + {file = "pydantic_core-2.6.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d0bf1c2545ab253732229c7fe8294d98eb08f99aa25a388267e1bc4d2d7e0a34"}, + {file = "pydantic_core-2.6.1-cp310-none-win32.whl", hash = "sha256:c5be947ad41a7602f941dc834d03e64dd1c7fae65fa85cb4f1004a95c5d50df1"}, + {file = "pydantic_core-2.6.1-cp310-none-win_amd64.whl", hash = "sha256:3d14ae98a8d251402ef8ed017039d2fc3e29fb155f909cd3816ba259fd30fb48"}, + {file = "pydantic_core-2.6.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:4a3c20808d3ced90e29439f72a563eadf21d29560935cc818b2dab80b92c114a"}, + {file = "pydantic_core-2.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:da240bbd8191edc6009e7793d5d4d67c55f56225c4788f068d6286c20e5a2038"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de1a3e56e34264d5216c67d2a48185216ada8f5f35a7f4c96a3971847c0de897"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9b623e09239ed333d14c02c9fcd1a7bb350b95eca8383f6e9b0d8e373d5a14b5"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a12520a6d502a25f6e47319874e47056b290f1b3c2ed9391444ce81c8cc5b83"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1141f18414aee8865c7917ae1432e419c1983272f53625152493692ff3d6783"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7888b3ee7566865cff3e9edab5d6cdf2e7cf793df17fe53d5e7be3e57eae45ec"}, + {file = "pydantic_core-2.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3bdf293b6304bc451678b7016c2505b7d97aa85ff13dac4420027b1b69e15d3d"}, + {file = "pydantic_core-2.6.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7ef56a05bb60336d5e795bf166d6712b2362e6478522c77e8336cb0da8909913"}, + {file = "pydantic_core-2.6.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3210eb73707e3487c16ef25cfd1663660f4e7d647a181d6c2fb18bc6167985fb"}, + {file = "pydantic_core-2.6.1-cp311-none-win32.whl", hash = "sha256:707e3005e8c129bdac117285b71717c13b9ed81a81eae0b1642f4ddc60028e63"}, + {file = "pydantic_core-2.6.1-cp311-none-win_amd64.whl", hash = "sha256:2b8ccec2189d8a8b83929f79e5bc00c0656f6c2ba4345125c0c82d1b77e15a26"}, + {file = "pydantic_core-2.6.1-cp311-none-win_arm64.whl", hash = "sha256:c1e44b77442fb5b1b6fccea30e3359b14d0a2e5896801243defe54482a591500"}, + {file = "pydantic_core-2.6.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:c82fb25f965f6777032fc2f2856c86149f7709c8f7fd0c020a8631b8211f2bab"}, + {file = "pydantic_core-2.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:494b211b12b8fedd184dbba609f6ed582e23561db57c1996fd6773989dbaef9b"}, + {file = 
"pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1281c940f47e5c89b594ef7580045647df1f9ad687edd503bcc0485be94576f4"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2d41701c88d8b678c16c10562949f2d28aceacd767cbe51dac9c8c41e6e609fb"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a839c95d5cc91eed053d8dafde4e200c4bc82f56fb1cf7bbfaeb03e2d907929"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c22e4fbfb5823d0fcb2c20ed164b39c3588554f9635f70765e8c9cff0fef67ad"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2fed4ad60ccf2698bd04e95dfc3bd84149ced9605a29fd27d624701e1da300c"}, + {file = "pydantic_core-2.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33b9343aa464d60c31937b361abde08d3af9943f3eb09d3216211b6236bd40c4"}, + {file = "pydantic_core-2.6.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:56e4953cd911293d6d755e2a97c651826aca76201db8f1ee298939e703721390"}, + {file = "pydantic_core-2.6.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cd163109047ab41ef1ea34258b35beb3ccac90af2012927ee8ab6ff122fef671"}, + {file = "pydantic_core-2.6.1-cp312-none-win32.whl", hash = "sha256:f5b51ec04743c94288c46e3759769611ab7c5ce0f941113363da96d20d345fb6"}, + {file = "pydantic_core-2.6.1-cp312-none-win_amd64.whl", hash = "sha256:ca5606bd82e255b1d704a4334e5ebf05ae966b69686fae02dcd31c057bdcb113"}, + {file = "pydantic_core-2.6.1-cp312-none-win_arm64.whl", hash = "sha256:dfc8f534a21b60b00f87e5a4fc36b8b8945160a6cc9e7b6e67db541c766c9597"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:b1aed20778092f8334c8eaf91550fa2805221d5e9b40ebdd1f46ee7efc159a48"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:64ff7a4b7ee2a56735af28da76c5dacbba6995801080f739d14610f4aa3de35d"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2d8faedb138c704957642fdf154c94f1b3d2a15cbd2472e45665f80463e85ee"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:55aac69d7339a63e37164f0a629c3034becc6746d68d126118a3ee4493514bed"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfdb1617af455a551be4cc0471f0bf3bfb1e882db71afad0e587c821326bb749"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aadc84f5bd7b1421b5a6b389ceff46062dd4a58c44cfb75990e9ca2d9d8270df"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1a01dce87507b9a8f1b71933ade85c573a22c9bd4649590e28d8a497afb68bd"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cd6f05f3e237ed6b3949464e7679e55843645fe0fe8d3b33277c321386836f6a"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:760f8a0aeb43ceeff1e536859e071a72e91075d4d37d51470812c4f49e682702"}, + {file = "pydantic_core-2.6.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a1ad48e77935d7dbbc2d75aeb638abbfbd0df0cfacf774dbe98d52271468f00c"}, + {file = "pydantic_core-2.6.1-cp37-none-win32.whl", hash = "sha256:153a5dd24c09ab7544beda967366afbaae8350b327a4ebd5807ed45ec791baa0"}, + {file = 
"pydantic_core-2.6.1-cp37-none-win_amd64.whl", hash = "sha256:cc7fc3e81b4ea6bce7e0e1d9797f496e957c5e66adf483f89afdce2d81d19986"}, + {file = "pydantic_core-2.6.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:5482d692ae37857695feccb179022728b275b7bfcc1c85bcdf7b556e76bffcd8"}, + {file = "pydantic_core-2.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:45d248c3c5c5c23a8d048cfdebc8151ae7b32a6dc6d68fbca995521e54692207"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6dd6c9f47e26779bf1f7da4d6ccd60f66973e63b0a143438f1e20bae296c3fde"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:55701608e60418a423db2486b5c64d790f86eb78a11b9077efb6302c50e62564"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:420a76a62dd20a6ef08445abf7cf04dcd8a845a5bb15932c2e88a8e518c70d43"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f253d20314e53ba0fb2b95541b6ed23f44fbcd927fe7674de341545c3327c3d"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5127b811c6a26deb85f5b17a06c26c28ce204e51e0a963b75bdf8612b22546d"}, + {file = "pydantic_core-2.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:51ffa985b874ca7d0dc199bb75c67b77907379291c91532a9e2d981f7b681527"}, + {file = "pydantic_core-2.6.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4902300e763a2fcc49ae14366493ef1fdbd3c7128b9acf37aef505f671aa681f"}, + {file = "pydantic_core-2.6.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e1c69334bb843c9bff98f52a1fa6c06420081a561fcecb03c6b9376960bd7de2"}, + {file = "pydantic_core-2.6.1-cp38-none-win32.whl", hash = "sha256:e84812b1ca989b2e9f4913d7b75ae0eece2a90154de35b4c5411ad640bfd387c"}, + {file = "pydantic_core-2.6.1-cp38-none-win_amd64.whl", hash = "sha256:775098e3629a959dfec8444667a53e0916839e9fbf6b55e07d6e2aadde006400"}, + {file = "pydantic_core-2.6.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:a32ed5a794918a61bf77b967c197eb78f31ad4e3145860193dc381bde040717e"}, + {file = "pydantic_core-2.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:66eda8ac48ac33e9e5c6541c8e30c702924b70a6f2e9732b74230d9b2dd35fb6"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb5131d75d69b0547ef9a8f46f7b94857411c9badcdd5092de61a3b4943f08c7"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20e850f3242d7836a5e15453f798d8569b9754350c8e184ba32d102c515dd507"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f4327fa6a1ac3da62b27d43bb0f27657ed4e601b141ecbfcf8523814b6c33b6"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c7b89b2875b967ad5c3c980bf72773851554f80c2529796e815a10c99295d872"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78eadd8d7d5cd8c3616e363c394d721437c339feaa4c28404e2eda79add69781"}, + {file = "pydantic_core-2.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17ab25bb24e98b61d120b7248c2b49ea56ce754a050d6b348be42015fcb7aa25"}, + {file = "pydantic_core-2.6.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6ea8dd2854fe6cee5ea0d60304ee7877dffe487cf118f221e85029269dd1235d"}, + {file = 
"pydantic_core-2.6.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9bf3ba6b4878ee692f6e24230801f682807fd97356bc2064f630fc0a2ad2ead6"}, + {file = "pydantic_core-2.6.1-cp39-none-win32.whl", hash = "sha256:b974d65692333931b4c7f730e7a3135ff854a1e5384bc260de3327ea364c835a"}, + {file = "pydantic_core-2.6.1-cp39-none-win_amd64.whl", hash = "sha256:f34f26d8a5f1a45366189ec30a57f43b21e2172d0d3b62822638dd885cc8eaab"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:f7ec4c6edafa3f0eb1aa461e31ea263736cc541b2459dddfbda7085b30844801"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:3679b9a1f41eb1b699e9556f91281d78c416cdc59ae90d5733fbe2017f1effe9"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3ff36f945342086ee917d4219dd0e59660a2dfcdb86a07696c2791f5d59c07d"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:734864605d722a6f8db3b9c96371710f7cb591fbfca40cfeaedf5b67df282438"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7188359b95a2b1aef5744a2ee6af2d9cfc733dd823f8840f4c896129477a172b"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:382d40843ae759d43ef65b67dec713390f9417135c1dd730afbf03cf2f450f45"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:4525b8498d362e4e324e3e175239b364768f52bd3563ac4ef9750160f5789de8"}, + {file = "pydantic_core-2.6.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e55514a022c768cccf07a675d20d07b847980dcd9250f6b516a86bab5612fc01"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:34734d486d059f0f6f5bfa9ba4a41449f666e2abbde002e9fa8b050bc50e3347"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a809498dceb0cd1cd1e57a2bfdc70ea82f424776e0196f4d63c4b6fcdaeb5aab"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:588a5ffd8bbf1b2230611ed1b45221adcf05b981037b2f853b5f20465849b5c1"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:26b81017aeae0d96f776fbce34a3a763d26ac575d8ad3f1202bdfdd2b935954b"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:7ddaa2c3c66682f0ff4ebc8c85ef2d8305f32deba79416464c47c93d94ca3740"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d6971131de66d1a37293f2e032206b6984b0dec44f568b453dfe89a84a2de0cc"}, + {file = "pydantic_core-2.6.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:200704f6824f8014bdccb1ce57cbd328666e6de4ecd77f0b8ab472cdea9c49ce"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:6916b27072c957947919fb32551f08486562bb8616f2e3db9e4e9c1d83d36886"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:136de286abf53f326b90389aaaca8a8050c2570adfc74afe06ab1c35d5d242bf"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60a238bb4ab09a81a6b25c9a0bb12756cfab2d9f3a7a471f857a179f83da0df6"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:2034d9b83a59b3b74b9dbf97ddb99de86c08863c1c33aabf80bc95791c7d50c3"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7c3a2b4d1636446dc71da1e949d2cf9ac1ee691ca63a640b77fce0360b4b75be"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:09e4ebd11a0b333b1fca75c1004c76dc9719f3aaf83ae38c42358754d8a76148"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:a4536d132a8bbd05bf368fb802a264cb9828f6c85e4029a6a3670bc98ba97323"}, + {file = "pydantic_core-2.6.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6221c97d6d58f2370650cfe3d81408901a1951c99960e1df9f6f9f8482d73d08"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4223e8bdad41d846a84cda400cd538e1cdc63d98eb4d41951396bfdb88fd8ce9"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c07cdb2e02733e5f26b9b004a1a8b99814d175f8953fa9f59e4293de2b8e9787"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8714e958d01342d08e520ffec6c1acf66cdec83ce51302f9a1a6efb2f784d0b6"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f03541c25a77fb5445055e070b69d292c9818a9195ffbfd3962c0ad0da983e8"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:364c13ef48c9e2f8c2ea8ee0da5ea23db5e218f99e796cbf360a2a7cab511439"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:27ba58bbfd1b2b9da45bfe524e680e2bc747a1ca9738ee5aa18d8cbdcc08e5e6"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:92321582e59da185b76b2eca4488ea95e41800672e57107509d32ebf8ad550f8"}, + {file = "pydantic_core-2.6.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2da1d21a4f2675d5b8a749674993a65c0537e2066e7ab7b1a4a54ef0b3ac8efd"}, + {file = "pydantic_core-2.6.1.tar.gz", hash = "sha256:5b4efa68bcfa6f2b93624c6660b6cf4b7b4336d4225afb314254a0ed9c9f4153"}, ] [package.dependencies] @@ -2170,13 +2218,13 @@ full = ["numpy"] [[package]] name = "readme-renderer" -version = "40.0" +version = "41.0" description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" optional = false python-versions = ">=3.8" files = [ - {file = "readme_renderer-40.0-py3-none-any.whl", hash = "sha256:e18feb2a1e7706f2865b81ebb460056d93fb29d69daa10b223c00faa7bd9a00a"}, - {file = "readme_renderer-40.0.tar.gz", hash = "sha256:9f77b519d96d03d7d7dce44977ba543090a14397c4f60de5b6eb5b8048110aa4"}, + {file = "readme_renderer-41.0-py3-none-any.whl", hash = "sha256:a38243d5b6741b700a850026e62da4bd739edc7422071e95fd5c4bb60171df86"}, + {file = "readme_renderer-41.0.tar.gz", hash = "sha256:4f4b11e5893f5a5d725f592c5a343e0dc74f5f273cb3dcf8c42d9703a27073f7"}, ] [package.dependencies] @@ -2607,13 +2655,13 @@ files = [ [[package]] name = "shellingham" -version = "1.5.0.post1" +version = "1.5.3" description = "Tool to Detect Surrounding Shell" optional = false python-versions = ">=3.7" files = [ - {file = "shellingham-1.5.0.post1-py2.py3-none-any.whl", hash = "sha256:368bf8c00754fd4f55afb7bbb86e272df77e4dc76ac29dbcbb81a59e9fc15744"}, - {file = "shellingham-1.5.0.post1.tar.gz", hash = "sha256:823bc5fb5c34d60f285b624e7264f4dda254bc803a3774a147bf99c0e3004a28"}, + {file = "shellingham-1.5.3-py2.py3-none-any.whl", hash = 
"sha256:419c6a164770c9c7cfcaeddfacb3d31ac7a8db0b0f3e9c1287679359734107e9"}, + {file = "shellingham-1.5.3.tar.gz", hash = "sha256:cb4a6fec583535bc6da17b647dd2330cf7ef30239e05d547d99ae3705fd0f7f8"}, ] [[package]] diff --git a/requirements.txt b/requirements.txt index 7801854..18c91b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ certifi==2023.7.22 ; python_full_version >= "3.8.1" and python_version < "3.12" cffi==1.15.1 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") charset-normalizer==3.2.0 ; python_full_version >= "3.8.1" and python_version < "3.12" cleo==2.0.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -click==8.1.6 ; python_full_version >= "3.8.1" and python_version < "3.12" +click==8.1.7 ; python_full_version >= "3.8.1" and python_version < "3.12" cmake==3.27.2 ; python_full_version >= "3.8.1" and python_version < "3.12" colorama==0.4.6 ; python_full_version >= "3.8.1" and python_version < "3.12" and (os_name == "nt" or platform_system == "Windows") crashtest==0.4.1 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -37,7 +37,7 @@ more-itertools==10.1.0 ; python_full_version >= "3.8.1" and python_version < "3. msgpack==1.0.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ninja==1.11.1 ; python_full_version >= "3.8.1" and python_version < "3.12" numpy==1.24.4 ; python_full_version >= "3.8.1" and python_version < "3.12" -orjson==3.9.4 ; python_full_version >= "3.8.1" and python_version < "3.12" +orjson==3.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" packaging==23.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pexpect==4.8.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pkginfo==1.9.6 ; python_full_version >= "3.8.1" and python_version < "3.12" @@ -46,13 +46,13 @@ platformdirs==3.10.0 ; python_full_version >= "3.8.1" and python_version < "3.12 poetry-core==1.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry-plugin-export==1.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" poetry==1.5.1 ; python_full_version >= "3.8.1" and python_version < "3.12" -protobuf==4.24.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +protobuf==4.24.1 ; python_full_version >= "3.8.1" and python_version < "3.12" psutil==5.9.5 ; python_full_version >= "3.8.1" and python_version < "3.12" ptyprocess==0.7.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pycparser==2.21 ; python_full_version >= "3.8.1" and python_version < "3.12" and (sys_platform == "darwin" or sys_platform == "linux") -pydantic-core==2.4.0 ; python_full_version >= "3.8.1" and python_version < "3.12" +pydantic-core==2.6.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pydantic-settings==2.0.3 ; python_full_version >= "3.8.1" and python_version < "3.12" -pydantic==2.1.1 ; python_full_version >= "3.8.1" and python_version < "3.12" +pydantic==2.2.1 ; python_full_version >= "3.8.1" and python_version < "3.12" pyproject-hooks==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" python-dotenv==1.0.0 ; python_full_version >= "3.8.1" and python_version < "3.12" pywin32-ctypes==0.2.2 ; python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "win32" @@ -66,7 +66,7 @@ rpds-py==0.9.2 ; python_full_version >= "3.8.1" and python_version < "3.12" safetensors==0.3.2 ; python_full_version >= "3.8.1" and python_version < "3.12" secretstorage==3.3.3 ; 
python_full_version >= "3.8.1" and python_version < "3.12" and sys_platform == "linux" sentencepiece==0.1.99 ; python_full_version >= "3.8.1" and python_version < "3.12" -shellingham==1.5.0.post1 ; python_full_version >= "3.8.1" and python_version < "3.12" +shellingham==1.5.3 ; python_full_version >= "3.8.1" and python_version < "3.12" six==1.16.0 ; python_full_version >= "3.8.1" and python_version < "3.12" sniffio==1.3.0 ; python_full_version >= "3.8.1" and python_version < "3.12" sse-starlette==1.6.5 ; python_full_version >= "3.8.1" and python_version < "3.12" From 28c6925665eaff7f5dab8971a9bb7269b62eb71e Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 01:09:39 +0900 Subject: [PATCH 08/18] Docker image update --- Dockerfile | 55 ++++++++------------ Dockerfile.compressed | 36 +++++++++++++ docker-compose.persistent.yml | 30 ++--------- docker-compose.yml | 29 ++--------- llama_api/server/app_settings.py | 86 +++++++++++++++++++------------- llama_api/utils/dependency.py | 14 ++++-- main.py | 36 ++----------- readme.md | 18 ++++--- 8 files changed, 139 insertions(+), 165 deletions(-) create mode 100644 Dockerfile.compressed diff --git a/Dockerfile b/Dockerfile index 7757736..23d842d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,48 +1,33 @@ -### Dockerfile for Python 3.11.4 & CUDA 12.1.1 & Ubuntu 22.04 -### Approximately 5 ~ 10 minutes to build - # Select the required CUDA version. -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 -ENV PYTHON_VERSION="3.11.4" -ENV PYTHON_VERSION_SHORT="3.11" +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder -# Copy the necessary files. -COPY llama_api /app/llama_api -COPY pyproject.toml /app/pyproject.toml -COPY requirements.txt /app/requirements.txt -COPY main.py /app/main.py -COPY model_definitions.py /app/model_definitions.py +ENV PYTHON_VERSION="3.11.4" \ + PYTHON_VERSION_SHORT="3.11" \ + DEBIAN_FRONTEND=noninteractive \ + CUDA_DOCKER_ARCH=all # Install the necessary applications, and then install Python. -# Then, install the necessary Python packages(Dependencies). RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - zlib1g-dev \ - libncurses5-dev \ - libgdbm-dev \ - libnss3-dev \ - libssl-dev \ - libreadline-dev \ - libffi-dev \ - wget \ - git \ - libsqlite3-dev\ + git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \ && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \ && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \ && cd /tmp/Python-${PYTHON_VERSION} \ - && ./configure \ - && make \ - && make install \ + && ./configure && make && make install \ + && python3 -m pip install --upgrade pip --no-cache-dir \ + && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \ && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ - && python3 -m pip install --upgrade pip \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean \ - && rm -rf /tmp/* \ - && cd /app \ - && python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda - # Need to skip complie because GPU access to host is not supported when building image. 
+ && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \ + && nvcc --version + +# Copy the necessary files. +COPY llama_api /app/llama_api +COPY pyproject.toml requirements.txt main.py model_definitions.py /app/ + +# Install the necessary Python packages (dependencies). +RUN cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir # Set the working directory and start the server. +STOPSIGNAL SIGINT WORKDIR /app -ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] \ No newline at end of file +ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/Dockerfile.compressed b/Dockerfile.compressed new file mode 100644 index 0000000..2eac700 --- /dev/null +++ b/Dockerfile.compressed @@ -0,0 +1,36 @@ +### Compressed version of the Dockerfile. +### It is compressed by using a single RUN instruction to reduce the number of layers. +### However, it takes longer to build than the original Dockerfile +### because it cannot reuse the layer cache. + +# Select the required CUDA version. +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder + +ENV PYTHON_VERSION="3.11.4" \ + PYTHON_VERSION_SHORT="3.11" \ + DEBIAN_FRONTEND=noninteractive \ + CUDA_DOCKER_ARCH=all + +# Copy the necessary files. +COPY llama_api /app/llama_api +COPY pyproject.toml requirements.txt main.py model_definitions.py /app/ + +# Install the necessary applications, and then install Python. +RUN apt-get update && apt-get install -y --no-install-recommends \ + git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \ + && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \ + && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \ + && cd /tmp/Python-${PYTHON_VERSION} \ + && ./configure && make && make install \ + && python3 -m pip install --upgrade pip --no-cache-dir \ + && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \ + && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \ + && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \ + && nvcc --version \ + && cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir + +# Set the working directory and start the server. +STOPSIGNAL SIGINT +WORKDIR /app +ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index f018d07..e09df69 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,7 +5,9 @@ volumes: services: llama-api: - image: cosogi/llama-api:230816 + image: c0sogi/llama-api:latest + cap_add: + - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 @@ -14,7 +16,6 @@ services: volumes: - llama-api-models:/app/models - ./model_definitions.py:/app/model_definitions.py - - ./main.py:/app/main.py ports: - 8000:8000 deploy: @@ -22,27 +23,4 @@ services: reservations: devices: - driver: nvidia - capabilities: [gpu] - - -# services: -# llama-api: -# build: -# context: .
-# dockerfile: Dockerfile -# entrypoint: ["python3", "-m", "main", "--port", "8000"] -# environment: -# - LLAMA_API_MAX_WORKERS=1 -# - LLAMA_API_API_KEY= -# volumes: -# - llama-api-models:/app/models -# - ./model_definitions.py:/app/model_definitions.py -# - ./main.py:/app/main.py -# ports: -# - 8000:8000 -# deploy: -# resources: -# reservations: -# devices: -# - driver: nvidia -# capabilities: [gpu] \ No newline at end of file + capabilities: [gpu] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 3c910ea..f2ae587 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,9 @@ version: '3' services: llama-api: - image: cosogi/llama-api:230816 + image: c0sogi/llama-api:latest + cap_add: + - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 @@ -22,27 +24,4 @@ services: reservations: devices: - driver: nvidia - capabilities: [gpu] - -# services: -# llama-api: -# build: -# context: . -# dockerfile: Dockerfile -# entrypoint: ["python3", "-m", "main", "--port", "8000"] -# environment: -# - MAX_WORKERS=1 -# volumes: -# - ./models:/app/models -# - ./llama_api:/app/llama_api -# - ./model_definitions.py:/app/model_definitions.py -# - ./main.py:/app/main.py -# - ./requirements.txt:/app/requirements.txt -# ports: -# - 8000:8000 -# deploy: -# resources: -# reservations: -# devices: -# - driver: nvidia -# capabilities: [gpu] \ No newline at end of file + capabilities: [gpu] \ No newline at end of file diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 4939932..1938c4a 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -73,12 +73,14 @@ def initialize_before_launch( skip_pytorch_install: bool = False, skip_tensorflow_install: bool = False, skip_compile: bool = False, + no_cache: bool = False, ) -> None: """Initialize the app""" + args = ["--no-cache-dir"] if no_cache else [] for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) if environ.get("LLAMA_API_XFORMERS") == "1": - install_package("xformers") + install_package("xformers", args=args) if install_packages: # Install all dependencies if not skip_compile: @@ -88,17 +90,17 @@ def initialize_before_launch( if not poetry.exists(): # Install poetry logger.warning(f"⚠️ Poetry not found: {poetry}") - install_package("poetry", force=True) + install_package("poetry", force=True, args=args) if not skip_pytorch_install: # Install pytorch - install_pytorch(force_cuda=force_cuda) + install_pytorch(force_cuda=force_cuda, args=args) if not skip_tensorflow_install: # Install tensorflow - install_tensorflow() + install_tensorflow(args=args) # Install all dependencies of our project and other repositories project_paths = [Path(".")] + list(Path("repositories").glob("*")) - install_all_dependencies(project_paths=project_paths) + install_all_dependencies(project_paths=project_paths, args=args) # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") @@ -151,6 +153,7 @@ def run( skip_pytorch_install: bool = False, skip_tensorflow_install: bool = False, skip_compile: bool = False, + no_cache: bool = False, environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( @@ -159,6 +162,7 @@ def run( skip_pytorch_install=skip_pytorch_install, skip_tensorflow_install=skip_tensorflow_install, skip_compile=skip_compile, + no_cache=no_cache, ) from uvicorn import Config as UvicornConfig @@ -177,43 +181,53 @@ def run( ).run() +parser = 
argparse.ArgumentParser() +parser.add_argument( + "-i", + "--install-pkgs", + action="store_true", + help="Install all required packages before running the server", +) +parser.add_argument( + "-fc", + "--force-cuda", + action="store_true", + help=( + "Force CUDA version of pytorch to be used " + "when installing pytorch. e.g. torch==2.0.1+cu118" + ), +) +parser.add_argument( + "-st", + "--skip-torch-install", + action="store_true", + help="Skip installing pytorch, if `install-pkgs` is set", +) +parser.add_argument( + "-stf", + "--skip-tf-install", + action="store_true", + help="Skip installing tensorflow, if `install-pkgs` is set", +) +parser.add_argument( + "-sc", + "--skip-compile", + action="store_true", + help="Skip compiling the shared library of LLaMA C++ code", +) +parser.add_argument( + "-nc", + "--no-cache-dir", + action="store_true", + help="Disable caching of pip installs, if `install-pkgs` is set", +) if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--install-pkgs", - action="store_true", - help="Install all required packages before running the server", - ) - parser.add_argument( - "--force-cuda", - action="store_true", - help=( - "Force CUDA version of pytorch to be used" - "when installing pytorch. e.g. torch==2.0.1+cu118" - ), - ) - parser.add_argument( - "--skip-torch-install", - action="store_true", - help="Skip installing pytorch, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-tf-install", - action="store_true", - help="Skip installing tensorflow, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-compile", - action="store_true", - help="Skip compiling the shared library of LLaMA C++ code", - ) - args = parser.parse_args() - initialize_before_launch( install_packages=args.install_pkgs, force_cuda=args.force_cuda, skip_pytorch_install=args.skip_torch_install, skip_tensorflow_install=args.skip_tf_install, skip_compile=args.skip_compile, + no_cache=args.no_cache_dir, ) diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 9fc4dd4..2ad2be2 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -224,12 +224,14 @@ def import_repository( sys.path.remove(str(disk_path)) -def install_package(package: str, *args, force: bool = False) -> bool: +def install_package( + package: str, force: bool = False, args: Optional[List[str]] = None +) -> bool: """Install a package with pip.""" if not force and is_package_available(package): return True return run_command( - [sys.executable, "-m", "pip", "install", package, *args], + [sys.executable, "-m", "pip", "install", package, *(args or [])], action="install", name=package, ) @@ -250,6 +252,7 @@ def install_pytorch( cuda_version: Optional[str] = Config.cuda_version, source: Optional[str] = Config.torch_source, force_cuda: bool = False, + args: Optional[List[str]] = None, ) -> bool: """Try to install Pytorch. If CUDA is available, install the CUDA version of torch. @@ -304,12 +307,14 @@ def install_pytorch( pip_install.append(f"torch{torch_version}") # Install torch + pip_install += args or [] return run_command(pip_install, action="install", name="PyTorch") def install_tensorflow( tensorflow_version: str = Config.tensorflow_version, source: Optional[str] = None, + args: Optional[List[str]] = None, ) -> bool: """Try to install TensorFlow.
@@ -332,6 +337,8 @@ def install_tensorflow( # If a source is specified, add it to the pip install command if source: pip_install += ["-f", source] + if args: + pip_install += args # Install TensorFlow return run_command(pip_install, action="install", name="TensorFlow") @@ -339,6 +346,7 @@ def install_tensorflow( def install_all_dependencies( project_paths: Optional[Union[List[Path], List[str]]] = None, + args: Optional[List[str]] = None, ) -> Optional[bool]: """Install every dependencies.""" pip_install = [sys.executable, "-m", "pip", "install", "-r"] @@ -356,7 +364,7 @@ def install_all_dependencies( ) continue result &= run_command( - pip_install + [requirements_path.as_posix()], + pip_install + [requirements_path.as_posix()] + (args or []), action="install", name="dependencies", ) diff --git a/main.py b/main.py index 15877df..49b1850 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,7 @@ -import argparse -from llama_api.server.app_settings import run +from llama_api.server.app_settings import run, parser if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument( "-p", "--port", @@ -18,36 +16,6 @@ default=1, help="Maximum number of process workers to run; default is 1", ) - parser.add_argument( - "-i", - "--install-pkgs", - action="store_true", - help="Install all required packages before running the server", - ) - parser.add_argument( - "-c", - "--force-cuda", - action="store_true", - help=( - "Force CUDA version of pytorch to be used" - "when installing pytorch. e.g. torch==2.0.1+cu118" - ), - ) - parser.add_argument( - "--skip-torch-install", - action="store_true", - help="Skip installing pytorch, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-tf-install", - action="store_true", - help="Skip installing tensorflow, if `install-pkgs` is set", - ) - parser.add_argument( - "--skip-compile", - action="store_true", - help="Skip compiling the shared library of LLaMA C++ code", - ) parser.add_argument( "-k", "--api-key", @@ -62,6 +30,7 @@ help="Apply xformers' memory-efficient optimizations", ) parser.add_argument( + "-ne", "--no-embed", action="store_true", help="Disable embeddings endpoint", @@ -75,6 +44,7 @@ skip_pytorch_install=args.skip_torch_install, skip_tensorflow_install=args.skip_tf_install, skip_compile=args.skip_compile, + no_cache=args.no_cache_dir, environs={ "LLAMA_API_MAX_WORKERS": str(args.max_workers), "LLAMA_API_XFORMERS": "1" if args.xformers else "", diff --git a/readme.md b/readme.md index 5a54926..75a9c36 100644 --- a/readme.md +++ b/readme.md @@ -27,22 +27,26 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-i] [-c] [--skip-torch-install] [--skip-tf-install] [--skip-compile] [-k API_KEY] [-x] [--no-embed] +usage: main.py [-h] [-i] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] options: -h, --help show this help message and exit + -i, --install-pkgs Install all required packages before running the server + -fc, --force-cuda Force CUDA version of pytorch to be used when installing pytorch. e.g.
torch==2.0.1+cu118 + -st, --skip-torch-install + Skip installing pytorch, if `install-pkgs` is set + -stf, --skip-tf-install + Skip installing tensorflow, if `install-pkgs` is set + -sc, --skip-compile Skip compiling the shared library of LLaMA C++ code + -nc, --no-cache-dir + Disable caching of pip installs, if `install-pkgs` is set -p PORT, --port PORT Port to run the server on; default is 8000 -w MAX_WORKERS, --max-workers MAX_WORKERS Maximum number of process workers to run; default is 1 - -i, --install-pkgs Install all required packages before running the server - -c, --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 - --skip-torch-install Skip installing pytorch, if `install-pkgs` is set - --skip-tf-install Skip installing tensorflow, if `install-pkgs` is set - --skip-compile Skip compiling the shared library of LLaMA C++ code -k API_KEY, --api-key API_KEY API key to use for the server -x, --xformers Apply xformers' memory-efficient optimizations - --no-embed Disable embeddings endpoint + -ne, --no-embed Disable embeddings endpoint ``` ### Unique features From 99ab6c94f2c800b9d349b427b376071eee604390 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 01:21:53 +0900 Subject: [PATCH 09/18] Fix typo --- Dockerfile | 1 + Dockerfile.compressed | 1 + docker-compose.persistent.yml | 2 +- docker-compose.yml | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 23d842d..3974d50 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,7 @@ COPY pyproject.toml requirements.txt main.py model_definitions.py /app/ RUN cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir # Set the working directory and start the server. +ENV PORT=${PORT:-8000} STOPSIGNAL SIGINT WORKDIR /app ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/Dockerfile.compressed b/Dockerfile.compressed index 2eac700..7892d41 100644 --- a/Dockerfile.compressed +++ b/Dockerfile.compressed @@ -31,6 +31,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir # Set the working directory and start the server. 
+ENV PORT=${PORT:-8000} STOPSIGNAL SIGINT WORKDIR /app ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ] diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index e09df69..9a762c2 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -5,7 +5,7 @@ volumes: services: llama-api: - image: c0sogi/llama-api:latest + image: cosogi/llama-api:latest cap_add: - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] diff --git a/docker-compose.yml b/docker-compose.yml index f2ae587..8236e02 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3' services: llama-api: - image: c0sogi/llama-api:latest + image: cosogi/llama-api:latest cap_add: - IPC_LOCK entrypoint: ["python3", "-m", "main", "--port", "8000"] From 970b1c52ec09e1f6cd55c5b7bc2984f0778af320 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 11:20:41 +0900 Subject: [PATCH 10/18] Added docker option: NICENESS of process --- docker-compose.persistent.yml | 2 ++ docker-compose.yml | 2 ++ llama_api/modules/exllama.py | 23 +++++++++++------------ llama_api/server/routers/v1.py | 4 +++- llama_api/utils/errors.py | 4 ++++ 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/docker-compose.persistent.yml b/docker-compose.persistent.yml index 9a762c2..cbc9ec3 100644 --- a/docker-compose.persistent.yml +++ b/docker-compose.persistent.yml @@ -8,6 +8,8 @@ services: image: cosogi/llama-api:latest cap_add: - IPC_LOCK + - SYS_NICE + - SYS_RESOURCE entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 diff --git a/docker-compose.yml b/docker-compose.yml index 8236e02..a370ce8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,8 @@ services: image: cosogi/llama-api:latest cap_add: - IPC_LOCK + - SYS_NICE + - SYS_RESOURCE entrypoint: ["python3", "-m", "main", "--port", "8000"] environment: - FORCE_CUDA=1 diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index 9fafffa..e86e0af 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -199,6 +199,16 @@ def _generate_text( text_buffer = "" # type: str byte_array = array("B") # type: array[int] byte_pattern = compile(r"<0x([0-9a-fA-F]{2})>") + logit_processors = ( + [ + processor + for processor in self.get_logit_processors( + settings=settings, encoder=self.encode + ) + ] + if cfg_mask is None + else None + ) or None for _ in range(settings.max_tokens): # If the generator was interrupted, stop the generation @@ -216,18 +226,7 @@ def _generate_text( else _gen_single_token_without_cfg( generator=generator, input_ids=generator.sequence[0][initial_len:], - logit_processors=( - [ - processor - for processor in self.get_logit_processors( - settings=settings, - encoder=self.encode, - ) - ] - if cfg_mask is None - else None - ) - or None, + logit_processors=logit_processors, ) ) # type: int diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 7e579d0..2b3b56c 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -220,10 +220,12 @@ def log_request_and_response( CreateCompletionRequest, CreateEmbeddingRequest, ], - status: Union[CompletionStatus, EmbeddingStatus], + status: Optional[Union[CompletionStatus, EmbeddingStatus]], state: Literal["Completed", "Interrupted"], ) -> None: """Log the request and response of the completion or embedding""" + if status is None: + return elapsed_time = time() - status.started_at log_messages: List[str] = [f"elapsed 
time: {elapsed_time: .1f}s"] body_without_prompt = body.model_dump( diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index e647c79..3d1cef0 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -247,6 +247,10 @@ async def custom_route_handler(self, request: Request) -> Response: status_code, error_message, ) = self.error_message_wrapper(error=error, body=body) + client = request.client.host if request.client else "UNKNOWN" + logger.error( + f'"{client} → {request.url.path}": {error_message["message"]}' + ) return JSONResponse( {"error": error_message}, status_code=status_code, From 95cb3761d51a64fb665c05d19caa178a8c255c38 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 11:29:52 +0900 Subject: [PATCH 11/18] Added feature: Tunnel through cloudflare --- llama_api/server/app_settings.py | 12 ++++++++++++ llama_api/utils/dependency.py | 2 +- main.py | 7 +++++++ readme.md | 8 ++++---- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 1938c4a..2d95232 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -3,6 +3,8 @@ from contextlib import asynccontextmanager from os import environ, getpid from pathlib import Path +from random import randint +from threading import Timer from typing import Dict, Literal, Optional from ..shared.config import Config @@ -154,6 +156,7 @@ def run( skip_tensorflow_install: bool = False, skip_compile: bool = False, no_cache: bool = False, + tunnel: bool = False, environs: Optional[Dict[str, str]] = None, ) -> None: initialize_before_launch( @@ -170,6 +173,15 @@ def run( if environs: environ.update(environs) + if tunnel: + install_package("flask-cloudflared") + from flask_cloudflared import start_cloudflared + + thread = Timer( + 2, start_cloudflared, args=(port, randint(8100, 9000), None, None) + ) + thread.daemon = True + thread.start() UvicornServer( config=UvicornConfig( diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 2ad2be2..49fc279 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -228,7 +228,7 @@ def install_package( package: str, force: bool = False, args: Optional[List[str]] = None ) -> bool: """Install a package with pip.""" - if not force and is_package_available(package): + if not force and is_package_available(package.replace("-", "_")): return True return run_command( [sys.executable, "-m", "pip", "install", package, *(args or [])], diff --git a/main.py b/main.py index 49b1850..f23613b 100644 --- a/main.py +++ b/main.py @@ -35,6 +35,12 @@ action="store_true", help="Disable embeddings endpoint", ) + parser.add_argument( + "-t", + "--tunnel", + action="store_true", + help="Tunnel the server through cloudflared", + ) args = parser.parse_args() run( @@ -45,6 +51,7 @@ skip_tensorflow_install=args.skip_tf_install, skip_compile=args.skip_compile, no_cache=args.no_cache_dir, + tunnel=args.tunnel, environs={ "LLAMA_API_MAX_WORKERS": str(args.max_workers), "LLAMA_API_XFORMERS": "1" if args.xformers else "", diff --git a/readme.md b/readme.md index 1649447..f9646ee 100644 --- a/readme.md +++ b/readme.md @@ -32,19 +32,18 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-i--install-pkgs] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] +usage: main.py [-h] [-i] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] [-t] options: -h, --help show this help message and 
exit - -i, --install-pkgs Install all required packages before running the server + -i, --install-pkgs Install all required packages before running the server -fc, --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 -st, --skip-torch-install Skip installing pytorch, if `install-pkgs` is set -stf, --skip-tf-install Skip installing tensorflow, if `install-pkgs` is set -sc, --skip-compile Skip compiling the shared library of LLaMA C++ code - -nc, --no-cache-dir - Disable caching of pip installs, if `install-pkgs` is set + -nc, --no-cache-dir Disable caching of pip installs, if `install-pkgs` is set -p PORT, --port PORT Port to run the server on; default is 8000 -w MAX_WORKERS, --max-workers MAX_WORKERS Maximum number of process workers to run; default is 1 @@ -52,6 +51,7 @@ options: API key to use for the server -x, --xformers Apply xformers' memory-efficient optimizations -ne, --no-embed Disable embeddings endpoint + -t, --tunnel Tunnel the server through cloudflared ``` ### Unique features From 3a835e509f3e01e0033097b9dab820224c3d592b Mon Sep 17 00:00:00 2001 From: c0sogi Date: Sun, 20 Aug 2023 23:27:07 +0900 Subject: [PATCH 12/18] Refactored CLI args --- llama_api/modules/exllama.py | 7 +- llama_api/server/app_settings.py | 156 +++++++++++-------------- llama_api/server/routers/v1.py | 9 +- llama_api/shared/config.py | 195 ++++++++++++++++++++++++++++++- llama_api/utils/concurrency.py | 8 +- llama_api/utils/errors.py | 4 +- llama_api/utils/llama_cpp.py | 6 +- main.py | 63 +--------- readme.md | 25 ++-- tests/__init__.py | 0 tests/conftest.py | 40 ++++++- tests/test_cli.py | 57 +++++++++ 12 files changed, 396 insertions(+), 174 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_cli.py diff --git a/llama_api/modules/exllama.py b/llama_api/modules/exllama.py index e86e0af..548890e 100644 --- a/llama_api/modules/exllama.py +++ b/llama_api/modules/exllama.py @@ -6,7 +6,7 @@ from ..utils.logger import ApiLogger logger = ApiLogger(__name__) -if environ.get("LLAMA_API_XFORMERS") == "1": +if environ.get("XFORMERS") == "1": with logger.log_any_error( "xformers mode is enabled, but xformers is not installed", suppress_exception=True, @@ -140,6 +140,11 @@ def __del__(self) -> None: del self._generator self._generator = None logger.info("🗑️ ExllamaCompletionGenerator generator deleted") + if self._lora is not None: + getattr(self._lora, "__del__", lambda: None)() + del self._lora + self._lora = None + logger.info("🗑️ ExllamaCompletionGenerator lora deleted") if self._model is not None: self._model.free_unmanaged() del self._model diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 2d95232..26ab23b 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -1,18 +1,19 @@ -import argparse import platform from contextlib import asynccontextmanager from os import environ, getpid from pathlib import Path from random import randint +import sys from threading import Timer -from typing import Dict, Literal, Optional +from typing import Literal, Optional -from ..shared.config import Config +from ..shared.config import AppSettingsCliArgs, MainCliArgs, CliArg, Config from ..utils.dependency import ( get_installed_packages, get_poetry_executable, git_clone, + run_command, install_all_dependencies, install_package, install_pytorch, @@ -69,22 +70,57 @@ def set_priority( return False -def initialize_before_launch( - install_packages: bool = False, - force_cuda: bool = False, - 
skip_pytorch_install: bool = False, - skip_tensorflow_install: bool = False, - skip_compile: bool = False, - no_cache: bool = False, -) -> None: +def parse_cli_args_from_environ(prefix: str = "LLAMA_") -> None: + """Parse CLI arguments from environment variables""" + prefix = prefix.lower() + cli_args = { + cli_key: cli_arg + for cli_key, cli_arg in MainCliArgs.iterate_over_cli_args() + } # type: dict[str, CliArg] + prefix_length = len(prefix) + for key, value in environ.items(): + key = key.lower() + if not key.startswith(prefix): + continue + key = key[prefix_length:] + if key not in cli_args: + continue + cli_arg = cli_args[key] + if not isinstance(cli_arg, CliArg): + continue + cli_arg.value = cli_arg.type(value) + + +def initialize_before_launch() -> None: """Initialize the app""" - args = ["--no-cache-dir"] if no_cache else [] + args = MainCliArgs + install_packages = args.install_pkgs.value or False + upgrade_packages = args.upgrade_pkgs.value or False + force_cuda = args.force_cuda.value or False + skip_pytorch_install = args.skip_torch_install.value or False + skip_tensorflow_install = args.skip_tf_install.value or False + skip_compile = args.skip_compile.value or False + no_cache_dir = args.no_cache_dir.value or False + + # PIP arguments + pip_args = [] # type: list[str] + if no_cache_dir: + pip_args.append("--no-cache-dir") + if upgrade_packages: + pip_args.append("--upgrade") + # Upgrade pip + run_command( + [sys.executable, "-m", "pip", "install", "--upgrade", "pip"], + action="upgrade", + name="pip", + ) + + # Clone all repositories for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) - if environ.get("LLAMA_API_XFORMERS") == "1": - install_package("xformers", args=args) + + # Install packages if install_packages: - # Install all dependencies if not skip_compile: # Build the shared library of LLaMA C++ code build_shared_lib(logger=logger, force_cuda=force_cuda) @@ -92,17 +128,17 @@ def initialize_before_launch( if not poetry.exists(): # Install poetry logger.warning(f"⚠️ Poetry not found: {poetry}") - install_package("poetry", force=True, args=args) + install_package("poetry", force=True, args=pip_args) if not skip_pytorch_install: # Install pytorch - install_pytorch(force_cuda=force_cuda, args=args) + install_pytorch(force_cuda=force_cuda, args=pip_args) if not skip_tensorflow_install: # Install tensorflow - install_tensorflow(args=args) + install_tensorflow(args=pip_args) # Install all dependencies of our project and other repositories project_paths = [Path(".")] + list(Path("repositories").glob("*")) - install_all_dependencies(project_paths=project_paths, args=args) + install_all_dependencies(project_paths=project_paths, args=pip_args) # Get current packages installed logger.info(f"📦 Installed packages: {get_installed_packages()}") @@ -112,6 +148,8 @@ def initialize_before_launch( "If any packages are missing, " "use `--install-pkgs` option to install them." 
) + if MainCliArgs.xformers.value: + install_package("xformers", args=pip_args) @asynccontextmanager @@ -148,32 +186,17 @@ async def health(): return new_app -def run( - port: int, - install_packages: bool = False, - force_cuda: bool = False, - skip_pytorch_install: bool = False, - skip_tensorflow_install: bool = False, - skip_compile: bool = False, - no_cache: bool = False, - tunnel: bool = False, - environs: Optional[Dict[str, str]] = None, -) -> None: - initialize_before_launch( - install_packages=install_packages, - force_cuda=force_cuda, - skip_pytorch_install=skip_pytorch_install, - skip_tensorflow_install=skip_tensorflow_install, - skip_compile=skip_compile, - no_cache=no_cache, - ) +def run() -> None: + port = MainCliArgs.port.value + assert port is not None, "Port is not set" + if MainCliArgs.force_cuda.value: + environ["FORCE_CUDA"] = "1" + initialize_before_launch() from uvicorn import Config as UvicornConfig from uvicorn import Server as UvicornServer - if environs: - environ.update(environs) - if tunnel: + if MainCliArgs.tunnel.value: install_package("flask-cloudflared") from flask_cloudflared import start_cloudflared @@ -193,53 +216,6 @@ def run( ).run() -parser = argparse.ArgumentParser() -parser.add_argument( - "-i", - "--install-pkgs", - action="store_true", - help="Install all required packages before running the server", -) -parser.add_argument( - "-fc", - "--force-cuda", - action="store_true", - help=( - "Force CUDA version of pytorch to be used" - "when installing pytorch. e.g. torch==2.0.1+cu118" - ), -) -parser.add_argument( - "-st", - "--skip-torch-install", - action="store_true", - help="Skip installing pytorch, if `install-pkgs` is set", -) -parser.add_argument( - "-stf", - "--skip-tf-install", - action="store_true", - help="Skip installing tensorflow, if `install-pkgs` is set", -) -parser.add_argument( - "-sc", - "--skip-compile", - action="store_true", - help="Skip compiling the shared library of LLaMA C++ code", -) -parser.add_argument( - "-nc", - "--no-cache-dir", - action="store_true", - help="Disable caching of pip installs, if `install-pkgs` is set", -) if __name__ == "__main__": - args = parser.parse_args() - initialize_before_launch( - install_packages=args.install_pkgs, - force_cuda=args.force_cuda, - skip_pytorch_install=args.skip_torch_install, - skip_tensorflow_install=args.skip_tf_install, - skip_compile=args.skip_compile, - no_cache=args.no_cache_dir, - ) + AppSettingsCliArgs.load() + initialize_before_launch() diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index 2b3b56c..de0901b 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -6,7 +6,6 @@ from contextlib import asynccontextmanager from dataclasses import dataclass, field from functools import partial -from os import environ from queue import Queue from random import choice from threading import Event @@ -37,6 +36,8 @@ from orjson import OPT_INDENT_2, dumps from sse_starlette.sse import EventSourceResponse +from llama_api.shared.config import MainCliArgs + from ...mixins.completion import CompletionStatus from ...schemas.api import ( ChatCompletion, @@ -72,8 +73,8 @@ ) logger = ApiLogger(__name__) router = APIRouter(prefix="/v1", route_class=RouteErrorHandler) -max_workers = int(environ.get("LLAMA_API_MAX_WORKERS", 1)) -max_semaphores = int(environ.get("LLAMA_API_MAX_SEMAPHORES", 1)) +max_workers = int(MainCliArgs.max_workers.value or 1) +max_semaphores = int(MainCliArgs.max_semaphores.value or 1) T = TypeVar("T") @@ -365,7 +366,7 @@ 
async def create_completion(request: Request, body: CreateCompletionRequest): async def create_embedding( request: Request, body: CreateEmbeddingRequest ) -> Embedding: - if not environ.get("LLAMA_API_EMBEDDINGS"): + if MainCliArgs.no_embed.value: raise PermissionError("Embeddings endpoint is disabled") assert body.model is not None, "Model is required" async with get_wix_with_semaphore(request, body.model) as wix: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 6469e76..55e24de 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -1,5 +1,21 @@ +import argparse +from dataclasses import dataclass, field +import json +from os import environ from pathlib import Path -from typing import Dict, List, Literal, Optional, Tuple +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + List, + Literal, + Optional, + Tuple, + TypeVar, + Union, +) try: from typing_extensions import TypedDict @@ -9,6 +25,181 @@ from typing import TypedDict # When dependencies aren't installed yet +T = TypeVar("T", bound=Union[str, int, float, bool]) + + +@dataclass +class CliArg(Generic[T]): + type: Callable[[Any], T] + help: str = "" + short_option: Optional[str] = None + action: Optional[str] = None + default: Optional[T] = None + value: Optional[T] = field(init=False) # ensure it's set in __post_init__ + + def __post_init__(self): + self.value = self.default + + +class CliArgHelper: + @classmethod + def get_parser(cls) -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + for cli_key, cli_arg in cls.iterate_over_cli_args(): + args = [] # type: List[str] + if cli_arg.short_option: + args.append(f"-{cli_arg.short_option.replace('_', '-')}") + args.append(f"--{cli_key.replace('_', '-')}") + kwargs = {} + if cli_arg.help: + kwargs["help"] = cli_arg.help + if cli_arg.default is not None: + kwargs["default"] = cli_arg.default + if cli_arg.action: + kwargs["action"] = cli_arg.action + else: + kwargs["type"] = cli_arg.type + parser.add_argument(*args, **kwargs) + return parser + + @classmethod + def load(cls) -> None: + cls.load_from_namespace(cls.get_parser().parse_args()) + + @classmethod + def load_from_namespace( + cls, args: argparse.Namespace, environ_key: str = "LLAMA_API_ARGS" + ) -> None: + cli_args = { + cli_key: cli_arg + for cli_key, cli_arg in cls.iterate_over_cli_args() + } + for cli_key, cli_arg in cli_args.items(): + cli_arg_value = getattr(args, cli_key, None) + if cli_arg_value is not None: + cli_arg.value = cli_arg.type(cli_arg_value) + environ[environ_key] = json.dumps( + { + cli_key.upper(): cli_arg.value + for cli_key, cli_arg in cli_args.items() + } + ) + + @classmethod + def load_from_environ(cls, environ_key: str = "LLAMA_API_ARGS") -> None: + json_str = environ.get(environ_key) + assert ( + json_str is not None + ), f"Environment variable {environ_key} not found" + cli_args = { + cli_key: cli_arg + for cli_key, cli_arg in cls.iterate_over_cli_args() + } # type: Dict[str, CliArg] + cli_arg_values = json.loads(json_str) # type: Dict[str, Any] + for cli_key, cli_value in cli_arg_values.items(): + cli_key = cli_key.lower() + if cli_key in cli_args and cli_value is not None: + cli_arg = cli_args[cli_key] + cli_arg.value = cli_arg.type(cli_value) + + @classmethod + def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: + for _cls in cls.__mro__: + for attr_name, attr_value in vars(_cls).items(): + if isinstance(attr_value, CliArg): + yield attr_name, attr_value + + +class AppSettingsCliArgs(CliArgHelper): + 
install_pkgs: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="i", + help="Install all required packages before running the server", + ) + force_cuda: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="c", + help="Force CUDA version of pytorch to be used " + "when installing pytorch. e.g. torch==2.0.1+cu118", + ) + skip_torch_install: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-torch", + help="Skip installing pytorch, if `install-pkgs` is set", + ) + skip_tf_install: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-tf", + help="Skip installing tensorflow, if `install-pkgs` is set", + ) + skip_compile: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-compile", + help="Skip compiling the shared library of LLaMA C++ code", + ) + no_cache_dir: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="-no-cache", + help="Disable caching of pip installs, if `install-pkgs` is set", + ) + upgrade_pkgs: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="u", + help="Upgrade all packages before running the server", + ) + + +class MainCliArgs(AppSettingsCliArgs): + port: CliArg[int] = CliArg( + type=int, + short_option="p", + help="Port to run the server on; default is 8000", + default=8000, + ) + max_workers: CliArg[int] = CliArg( + type=int, + short_option="w", + help="Maximum number of process workers to run; default is 1", + default=1, + ) + max_semaphores: CliArg[int] = CliArg( + type=int, + short_option="s", + help="Maximum number of process semaphores to permit; default is 1", + default=1, + ) + api_key: CliArg[str] = CliArg( + type=str, + short_option="k", + help="API key to use for the server", + default=None, + ) + xformers: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="x", + help="Apply xformers' memory-efficient optimizations", + ) + no_embed: CliArg[bool] = CliArg( + type=bool, + action="store_true", + help="Disable embeddings endpoint", + ) + tunnel: CliArg[bool] = CliArg( + type=bool, + action="store_true", + short_option="t", + help="Tunnel the server through cloudflared", + ) + + class GitCloneArgs(TypedDict): git_path: str disk_path: str @@ -43,7 +234,7 @@ class Config: "exllama": GitCloneArgs( git_path="https://github.com/turboderp/exllama", disk_path="repositories/exllama", - options=["--recurse-submodules"], + options=["recurse-submodules"], ), "llama_cpp": GitCloneArgs( git_path="https://github.com/abetlen/llama-cpp-python", diff --git a/llama_api/utils/concurrency.py b/llama_api/utils/concurrency.py index 797a1be..0424202 100644 --- a/llama_api/utils/concurrency.py +++ b/llama_api/utils/concurrency.py @@ -10,6 +10,8 @@ from fastapi.concurrency import run_in_threadpool +from llama_api.shared.config import MainCliArgs + from ..server.app_settings import set_priority from ..utils.logger import ApiLogger from ..utils.process_pool import ProcessPool @@ -36,6 +38,8 @@ def init_process_pool(env_vars: Dict[str, str]) -> None: for key, value in env_vars.items(): environ[key] = value + MainCliArgs.load_from_environ() + def pool() -> ProcessPool: """Get the process pool, and initialize it if it's not initialized yet""" @@ -44,14 +48,14 @@ def pool() -> ProcessPool: if _pool is None: logger.info("Initializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), + max_workers=MainCliArgs.max_workers.value or 1, 
initializer=init_process_pool, initargs=(dict(environ),), ) elif not _pool.is_available: logger.critical("🚨 Process pool died. Reinitializing process pool...") _pool = ProcessPool( - max_workers=int(environ.get("LLAMA_API_MAX_WORKERS", 1)), + max_workers=MainCliArgs.max_workers.value or 1, initializer=init_process_pool, initargs=(dict(environ),), ) diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index 3d1cef0..8f367f2 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -1,5 +1,4 @@ from functools import cached_property -from os import environ from pathlib import Path from re import Match, Pattern, compile from typing import Callable, Coroutine, Dict, Optional, Tuple, Union @@ -14,6 +13,7 @@ CreateCompletionRequest, CreateEmbeddingRequest, ) +from ..shared.config import MainCliArgs from ..utils.logger import ApiLogger logger = ApiLogger(__name__) @@ -126,7 +126,7 @@ class RouteErrorHandler(APIRoute): ): ErrorResponseFormatters.model_not_found, } - api_key: Optional[str] = environ.get("LLAMA_API_API_KEY") or None + api_key: Optional[str] = MainCliArgs.api_key.value @cached_property def authorization(self) -> Optional[str]: diff --git a/llama_api/utils/llama_cpp.py b/llama_api/utils/llama_cpp.py index b1de480..fddd339 100644 --- a/llama_api/utils/llama_cpp.py +++ b/llama_api/utils/llama_cpp.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import List, Optional, Union +from ..shared.config import MainCliArgs from ..utils.dependency import install_package, run_command from ..utils.system import get_cuda_version @@ -193,9 +194,10 @@ def build_shared_lib( logger: Optional[Logger] = None, force_cuda: bool = False ) -> None: """Build the shared library for llama.cpp""" - global CMAKE_ARGS - if force_cuda or bool(environ.get("FORCE_CUDA", False)): + if force_cuda or bool( + environ.get("FORCE_CUDA", MainCliArgs.force_cuda.value) + ): assert get_cuda_version() is not None, "CUDA is not available" CMAKE_ARGS = CUBLAS_ARGS diff --git a/main.py b/main.py index f23613b..920896a 100644 --- a/main.py +++ b/main.py @@ -1,62 +1,7 @@ -from llama_api.server.app_settings import run, parser +from llama_api.server.app_settings import run +from llama_api.shared.config import MainCliArgs if __name__ == "__main__": - parser.add_argument( - "-p", - "--port", - type=int, - default=8000, - help="Port to run the server on; default is 8000", - ) - parser.add_argument( - "-w", - "--max-workers", - type=int, - default=1, - help="Maximum number of process workers to run; default is 1", - ) - parser.add_argument( - "-k", - "--api-key", - type=str, - default=None, - help="API key to use for the server", - ) - parser.add_argument( - "-x", - "--xformers", - action="store_true", - help="Apply xformers' memory-efficient optimizations", - ) - parser.add_argument( - "-ne", - "--no-embed", - action="store_true", - help="Disable embeddings endpoint", - ) - parser.add_argument( - "-t", - "--tunnel", - action="store_true", - help="Tunnel the server through cloudflared", - ) - - args = parser.parse_args() - run( - port=args.port, - install_packages=args.install_pkgs, - force_cuda=args.force_cuda, - skip_pytorch_install=args.skip_torch_install, - skip_tensorflow_install=args.skip_tf_install, - skip_compile=args.skip_compile, - no_cache=args.no_cache_dir, - tunnel=args.tunnel, - environs={ - "LLAMA_API_MAX_WORKERS": str(args.max_workers), - "LLAMA_API_XFORMERS": "1" if args.xformers else "", - "LLAMA_API_API_KEY": args.api_key or "", - "FORCE_CUDA": "1" if args.force_cuda else "", - 
"LLAMA_API_EMBEDDINGS": "1" if not args.no_embed else "", - }, - ) + MainCliArgs.load() + run() diff --git a/readme.md b/readme.md index f9646ee..3a74c87 100644 --- a/readme.md +++ b/readme.md @@ -32,26 +32,31 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-i] [-fc] [-st] [-stf] [-sc] [-nc] [-p PORT] [-w MAX_WORKERS] [-k API_KEY] [-x] [-ne] [-t] +usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-s MAX_SEMAPHORES] [-k API_KEY] [-x] [--no-embed] [-t] [-i] [-c] [--no-torch] [--no-tf] [--no-compile] [--no-cache] [-u] options: -h, --help show this help message and exit - -i, --install-pkgs Install all required packages before running the server - -fc, --force-cuda Force CUDA version of pytorch to be usedwhen installing pytorch. e.g. torch==2.0.1+cu118 - -st, --skip-torch-install - Skip installing pytorch, if `install-pkgs` is set - -stf, --skip-tf-install - Skip installing tensorflow, if `install-pkgs` is set - -sc, --skip-compile Skip compiling the shared library of LLaMA C++ code - -nc, --no-cache-dir Disable caching of pip installs, if `install-pkgs` is set -p PORT, --port PORT Port to run the server on; default is 8000 -w MAX_WORKERS, --max-workers MAX_WORKERS Maximum number of process workers to run; default is 1 + -s MAX_SEMAPHORES, --max-semaphores MAX_SEMAPHORES + Maximum number of process semaphores to permit; default is 1 -k API_KEY, --api-key API_KEY API key to use for the server -x, --xformers Apply xformers' memory-efficient optimizations - -ne, --no-embed Disable embeddings endpoint + --no-embed Disable embeddings endpoint -t, --tunnel Tunnel the server through cloudflared + -i, --install-pkgs Install all required packages before running the server + -c, --force-cuda Force CUDA version of pytorch to be used when installing pytorch. e.g. 
torch==2.0.1+cu118 + --no-torch, --skip-torch-install + Skip installing pytorch, if `install-pkgs` is set + --no-tf, --skip-tf-install + Skip installing tensorflow, if `install-pkgs` is set + --no-compile, --skip-compile + Skip compiling the shared library of LLaMA C++ code + --no-cache, --no-cache-dir + Disable caching of pip installs, if `install-pkgs` is set + -u, --upgrade-pkgs Upgrade all packages before running the server ``` ### Unique features diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py index e96096e..2839cfd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,16 @@ -from asyncio import gather +from asyncio import gather, iscoroutinefunction +from contextlib import ExitStack from datetime import datetime +from functools import wraps import importlib +from types import ModuleType import unittest from os import environ from pathlib import Path from re import compile, sub from typing import ( TYPE_CHECKING, + Any, AsyncIterator, Dict, Iterable, @@ -16,6 +20,7 @@ Tuple, Union, ) +from unittest.mock import MagicMock, patch from orjson import loads from llama_api.schemas.api import ( @@ -42,6 +47,33 @@ EndPoint = Literal["completions", "chat/completions"] +def patch_module(mocking_module: ModuleType): + def decorator(func): + @wraps(func) + async def async_wrapper(*args, **kwargs): + patches = [] + for name, attr in mocking_module.__dict__.items(): + # Mock all functions and classes + if callable(attr) or isinstance(attr, (type,)): + patches.append( + patch.object(mocking_module, name, MagicMock()) + ) + + with ExitStack() as stack: + for p in patches: + stack.enter_context(p) + + if iscoroutinefunction(func): + return await func(*args, **kwargs) + return func(*args, **kwargs) + + if iscoroutinefunction(func): + return async_wrapper + return func + + return decorator + + class TestLlamaAPI(unittest.TestCase): ggml_model: str = "orca-mini-3b.ggmlv3.q4_0.bin" ggml_path: Path = Config.project_root / Path(f"models/ggml/{ggml_model}") @@ -65,7 +97,7 @@ def setUpClass(cls): "fastapi.testclient" ).TestClient # type: Type[TestClient] cls.app = create_app_llama_cpp() - environ["LLAMA_API_MAX_WORKERS"] = "2" + environ["LLAMA_API_ARGS"] = '{"MAX_WORKERS": 1}' @classmethod def tearDownClass(cls): @@ -91,6 +123,7 @@ async def arequest_completion( self, model_names: Union[List[str], Tuple[str, ...]], endpoints: Union[EndPoint, Iterable[EndPoint]], + **kwargs: Any, ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: async with self.AsyncClient( app=self.app, base_url="http://localhost", timeout=None @@ -111,6 +144,7 @@ async def arequest_completion( else endpoints ), ), + **kwargs, ) async def get_models( @@ -133,6 +167,7 @@ async def submit_streaming_requests( self, client: "AsyncClient", model_and_endpoints: Iterable[Tuple[str, EndPoint]], + **kwargs: Any, ) -> Tuple[List[List[str]], List[datetime], List[datetime]]: async def send_request( model: str, endpoint: EndPoint @@ -146,6 +181,7 @@ async def send_request( {"messages": self.messages} if endpoint.startswith("chat") else {"prompt": self.prompt}, + kwargs, ), headers={"Content-Type": "application/json"}, ) as response: diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..64ccf5e --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,57 @@ +import json +from os import environ +import unittest +from llama_api.shared.config import AppSettingsCliArgs, MainCliArgs + + +class 
TestCLIArgs(unittest.TestCase): + def test_cli_args(self): + parser = MainCliArgs.get_parser() + + # Check that `--install-pkgs` is inherited from `MainCliArgs` + args = parser.parse_args(["--install-pkgs", "--port", "8080"]) + AppSettingsCliArgs.load_from_namespace(args) + self.assertFalse(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertFalse(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 8000) + + # Check that both `--force-cuda` and `--port` are inherited from `MainCliArgs` # noqa + args = parser.parse_args(["--port", "9000", "--force-cuda"]) + MainCliArgs.load_from_namespace(args) + self.assertTrue(AppSettingsCliArgs.force_cuda.value) + self.assertFalse(AppSettingsCliArgs.install_pkgs.value) + self.assertTrue(MainCliArgs.force_cuda.value) + self.assertFalse(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 9000) + + # Set `--install-pkgs` to `False` and check that it is applied + args.install_pkgs = True + AppSettingsCliArgs.load_from_namespace(args) + self.assertTrue(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertTrue(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 9000) + + environ["LLAMA_CLI_ARGS"] = json.dumps( + {"force_cuda": False, "port": 7000} + ) + AppSettingsCliArgs.load_from_environ("LLAMA_CLI_ARGS") + self.assertFalse(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertFalse(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 9000) + + MainCliArgs.load_from_environ("LLAMA_CLI_ARGS") + self.assertFalse(AppSettingsCliArgs.force_cuda.value) + self.assertTrue(AppSettingsCliArgs.install_pkgs.value) + self.assertFalse(MainCliArgs.force_cuda.value) + self.assertTrue(MainCliArgs.install_pkgs.value) + self.assertEqual(MainCliArgs.port.value, 7000) + + +if __name__ == "__main__": + unittest.main() From 8bc40191b759f57e948d5307557648e783426bd1 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 21 Aug 2023 00:40:50 +0900 Subject: [PATCH 13/18] Fixed CLI bugs --- llama_api/server/app_settings.py | 30 ++++--------- llama_api/shared/config.py | 76 ++++++++++++++++++++++---------- llama_api/utils/dependency.py | 15 +++++++ main.py | 2 - tests/test_cli.py | 16 ++++--- 5 files changed, 86 insertions(+), 53 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 26ab23b..11ce418 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -7,12 +7,13 @@ from threading import Timer from typing import Literal, Optional -from ..shared.config import AppSettingsCliArgs, MainCliArgs, CliArg, Config +from ..shared.config import AppSettingsCliArgs, MainCliArgs, Config from ..utils.dependency import ( get_installed_packages, get_poetry_executable, git_clone, + git_pull, run_command, install_all_dependencies, install_package, @@ -70,27 +71,6 @@ def set_priority( return False -def parse_cli_args_from_environ(prefix: str = "LLAMA_") -> None: - """Parse CLI arguments from environment variables""" - prefix = prefix.lower() - cli_args = { - cli_key: cli_arg - for cli_key, cli_arg in MainCliArgs.iterate_over_cli_args() - } # type: dict[str, CliArg] - prefix_length = len(prefix) - for key, value in environ.items(): - key = 
key.lower() - if not key.startswith(prefix): - continue - key = key[prefix_length:] - if key not in cli_args: - continue - cli_arg = cli_args[key] - if not isinstance(cli_arg, CliArg): - continue - cli_arg.value = cli_arg.type(value) - - def initialize_before_launch() -> None: """Initialize the app""" args = MainCliArgs @@ -101,6 +81,9 @@ def initialize_before_launch() -> None: skip_tensorflow_install = args.skip_tf_install.value or False skip_compile = args.skip_compile.value or False no_cache_dir = args.no_cache_dir.value or False + logger.info( + "Starting Application with CLI args:", environ["LLAMA_API_ARGS"] + ) # PIP arguments pip_args = [] # type: list[str] @@ -118,6 +101,8 @@ def initialize_before_launch() -> None: # Clone all repositories for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) + if upgrade_packages: + git_pull(git_clone_args["git_path"]) # Install packages if install_packages: @@ -187,6 +172,7 @@ async def health(): def run() -> None: + MainCliArgs.load() port = MainCliArgs.port.value assert port is not None, "Port is not set" if MainCliArgs.force_cuda.value: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 55e24de..cde26f2 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -43,28 +43,16 @@ def __post_init__(self): class CliArgHelper: @classmethod - def get_parser(cls) -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - for cli_key, cli_arg in cls.iterate_over_cli_args(): - args = [] # type: List[str] - if cli_arg.short_option: - args.append(f"-{cli_arg.short_option.replace('_', '-')}") - args.append(f"--{cli_key.replace('_', '-')}") - kwargs = {} - if cli_arg.help: - kwargs["help"] = cli_arg.help - if cli_arg.default is not None: - kwargs["default"] = cli_arg.default - if cli_arg.action: - kwargs["action"] = cli_arg.action - else: - kwargs["type"] = cli_arg.type - parser.add_argument(*args, **kwargs) - return parser - - @classmethod - def load(cls) -> None: - cls.load_from_namespace(cls.get_parser().parse_args()) + def load( + cls, + environ_key: str = "LLAMA_API_ARGS", + environ_key_prefix: str = "LLAMA_API_", + ) -> None: + """Load CLI arguments from environment variables and/or CLI arguments""" + cls.load_from_namespace(cls.parser.parse_args()) + cls.load_from_environ( + environ_key=environ_key, environ_key_prefix=environ_key_prefix + ) @classmethod def load_from_namespace( @@ -82,11 +70,16 @@ def load_from_namespace( { cli_key.upper(): cli_arg.value for cli_key, cli_arg in cli_args.items() + if cli_arg.value is not None } ) @classmethod - def load_from_environ(cls, environ_key: str = "LLAMA_API_ARGS") -> None: + def load_from_environ( + cls, + environ_key: str = "LLAMA_API_ARGS", + environ_key_prefix: str = "LLAMA_API_", + ) -> None: json_str = environ.get(environ_key) assert ( json_str is not None @@ -102,6 +95,20 @@ def load_from_environ(cls, environ_key: str = "LLAMA_API_ARGS") -> None: cli_arg = cli_args[cli_key] cli_arg.value = cli_arg.type(cli_value) + environ_key_prefix = environ_key_prefix.lower() + prefix_length = len(environ_key_prefix) + for key, value in environ.items(): + key = key.lower() + if not key.startswith(environ_key_prefix): + continue + key = key[prefix_length:] + if key not in cli_args: + continue + cli_arg = cli_args[key] + if not isinstance(cli_arg, CliArg): + continue + cli_arg.value = cli_arg.type(value) + @classmethod def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: for _cls in cls.__mro__: @@ -109,6 +116,29 @@ def 
iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: if isinstance(attr_value, CliArg): yield attr_name, attr_value + @classmethod + @property + def parser(cls) -> argparse.ArgumentParser: + """Parse CLI arguments from environment variables, + and return the parser""" + arg_parser = argparse.ArgumentParser() + for cli_key, cli_arg in cls.iterate_over_cli_args(): + args = [] # type: List[str] + args.append(f"--{cli_key.replace('_', '-')}") + if cli_arg.short_option: + args.append(f"-{cli_arg.short_option.replace('_', '-')}") + kwargs = {} + if cli_arg.help: + kwargs["help"] = cli_arg.help + if cli_arg.default is not None: + kwargs["default"] = cli_arg.default + if cli_arg.action: + kwargs["action"] = cli_arg.action + else: + kwargs["type"] = cli_arg.type + arg_parser.add_argument(*args, **kwargs) + return arg_parser + class AppSettingsCliArgs(CliArgHelper): install_pkgs: CliArg[bool] = CliArg( diff --git a/llama_api/utils/dependency.py b/llama_api/utils/dependency.py index 49fc279..ca6bcde 100644 --- a/llama_api/utils/dependency.py +++ b/llama_api/utils/dependency.py @@ -76,6 +76,21 @@ def git_clone( return None +def git_pull( + git_path: str, + options: Optional[List[str]] = None, +) -> Optional[bool]: + """Pull a git repository.""" + if Path(git_path).exists(): + return run_command( + ["git", "pull", git_path, *(options or [])], + action="pull", + name=f"{git_path}", + try_emoji="📥", + ) + return None + + def get_mac_major_version_string(): # platform.mac_ver() returns a tuple ('10.16', ('', '', ''), 'x86_64') # Split the version string on '.' and take the first two components diff --git a/main.py b/main.py index 920896a..39948b7 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,5 @@ from llama_api.server.app_settings import run -from llama_api.shared.config import MainCliArgs if __name__ == "__main__": - MainCliArgs.load() run() diff --git a/tests/test_cli.py b/tests/test_cli.py index 64ccf5e..621ef1e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,9 @@ class TestCLIArgs(unittest.TestCase): def test_cli_args(self): - parser = MainCliArgs.get_parser() + parser = MainCliArgs.parser + environ_key = "LLAMA_CLI_ARGS" + environ_key_prefix = "LLAMA_" # Check that `--install-pkgs` is inherited from `MainCliArgs` args = parser.parse_args(["--install-pkgs", "--port", "8080"]) @@ -35,23 +37,25 @@ def test_cli_args(self): self.assertTrue(MainCliArgs.install_pkgs.value) self.assertEqual(MainCliArgs.port.value, 9000) - environ["LLAMA_CLI_ARGS"] = json.dumps( - {"force_cuda": False, "port": 7000} - ) - AppSettingsCliArgs.load_from_environ("LLAMA_CLI_ARGS") + environ[environ_key] = json.dumps({"force_cuda": False, "port": 7000}) + AppSettingsCliArgs.load_from_environ(environ_key, environ_key_prefix) self.assertFalse(AppSettingsCliArgs.force_cuda.value) self.assertTrue(AppSettingsCliArgs.install_pkgs.value) self.assertFalse(MainCliArgs.force_cuda.value) self.assertTrue(MainCliArgs.install_pkgs.value) self.assertEqual(MainCliArgs.port.value, 9000) - MainCliArgs.load_from_environ("LLAMA_CLI_ARGS") + MainCliArgs.load_from_environ(environ_key, environ_key_prefix) self.assertFalse(AppSettingsCliArgs.force_cuda.value) self.assertTrue(AppSettingsCliArgs.install_pkgs.value) self.assertFalse(MainCliArgs.force_cuda.value) self.assertTrue(MainCliArgs.install_pkgs.value) self.assertEqual(MainCliArgs.port.value, 7000) + environ[f"{environ_key_prefix}MAX_SEMAPHORES"] = "100" + MainCliArgs.load_from_environ(environ_key, environ_key_prefix) + self.assertEqual(MainCliArgs.max_semaphores.value, 100) + 
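Note: the prefix-based override exercised at the end of this test (`LLAMA_MAX_SEMAPHORES=100`) boils down to scanning the environment for keys that start with the prefix and coercing their values with the declared argument type. A standalone sketch of that lookup, using illustrative names rather than the project's classes:

```python
# Sketch of prefix-based environment overrides (illustrative, not project code):
# "<PREFIX><ARG_NAME>" environment entries win over previously loaded values.
import os
from typing import Callable, Dict


def apply_env_overrides(
    values: Dict[str, object],
    types: Dict[str, Callable[[str], object]],
    prefix: str = "LLAMA_",
) -> Dict[str, object]:
    """Return a copy of `values` with matching environment overrides applied."""
    out = dict(values)
    plen = len(prefix)
    for key, raw in os.environ.items():
        if not key.lower().startswith(prefix.lower()):
            continue
        name = key.lower()[plen:]
        if name in types:
            out[name] = types[name](raw)
    return out


if __name__ == "__main__":
    os.environ["LLAMA_MAX_SEMAPHORES"] = "100"
    print(apply_env_overrides({"max_semaphores": 1}, {"max_semaphores": int}))
    # {'max_semaphores': 100}
```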
if __name__ == "__main__": unittest.main() From 6e5db5c483dd2743beaa0a88a1fae4f2b31a6945 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Mon, 21 Aug 2023 13:50:10 +0900 Subject: [PATCH 14/18] Added doc strings --- llama_api/server/app_settings.py | 10 ++-- llama_api/shared/config.py | 78 ++++++++++++++++++++++---------- llama_api/utils/errors.py | 2 +- readme.md | 29 ++++++------ 4 files changed, 76 insertions(+), 43 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index 11ce418..c3a2a78 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -75,21 +75,21 @@ def initialize_before_launch() -> None: """Initialize the app""" args = MainCliArgs install_packages = args.install_pkgs.value or False - upgrade_packages = args.upgrade_pkgs.value or False + upgrade = args.upgrade.value or False force_cuda = args.force_cuda.value or False skip_pytorch_install = args.skip_torch_install.value or False skip_tensorflow_install = args.skip_tf_install.value or False skip_compile = args.skip_compile.value or False no_cache_dir = args.no_cache_dir.value or False - logger.info( - "Starting Application with CLI args:", environ["LLAMA_API_ARGS"] + print( + "Starting Application with CLI args:" + str(environ["LLAMA_API_ARGS"]) ) # PIP arguments pip_args = [] # type: list[str] if no_cache_dir: pip_args.append("--no-cache-dir") - if upgrade_packages: + if upgrade: pip_args.append("--upgrade") # Upgrade pip run_command( @@ -101,7 +101,7 @@ def initialize_before_launch() -> None: # Clone all repositories for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) - if upgrade_packages: + if upgrade: git_pull(git_clone_args["git_path"]) # Install packages diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index cde26f2..b211ed8 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -22,10 +22,19 @@ except ImportError: + print("Failed to import typing_extensions, using TypedDict from typing") from typing import TypedDict # When dependencies aren't installed yet T = TypeVar("T", bound=Union[str, int, float, bool]) +DEFAULT_ENVIRON_KEY = "LLAMA_API_ARGS" +DEFAULT_ENVIRON_KEY_PREFIX = "LLAMA_API_" + + +class GitCloneArgs(TypedDict): + git_path: str + disk_path: str + options: Optional[List[str]] @dataclass @@ -42,13 +51,16 @@ def __post_init__(self): class CliArgHelper: + """Helper class for loading CLI arguments from environment variables + or a namespace of CLI arguments""" + @classmethod def load( cls, - environ_key: str = "LLAMA_API_ARGS", - environ_key_prefix: str = "LLAMA_API_", + environ_key: str = DEFAULT_ENVIRON_KEY, + environ_key_prefix: str = DEFAULT_ENVIRON_KEY_PREFIX, ) -> None: - """Load CLI arguments from environment variables and/or CLI arguments""" + """Load CLI arguments from environment variables and CLI arguments""" cls.load_from_namespace(cls.parser.parse_args()) cls.load_from_environ( environ_key=environ_key, environ_key_prefix=environ_key_prefix @@ -56,38 +68,58 @@ def load( @classmethod def load_from_namespace( - cls, args: argparse.Namespace, environ_key: str = "LLAMA_API_ARGS" + cls, + args: argparse.Namespace, + environ_key: Optional[str] = DEFAULT_ENVIRON_KEY, ) -> None: + """Load CLI arguments from a namespace, + and set an environment variable with the CLI arguments as JSON""" + # Get all defined CLI arguments within the class cli_args = { cli_key: cli_arg for cli_key, cli_arg in cls.iterate_over_cli_args() } + + # Parse the CLI arguments and set the value of 
the CLI argument + # if it's not None, otherwise keep the default value for cli_key, cli_arg in cli_args.items(): cli_arg_value = getattr(args, cli_key, None) if cli_arg_value is not None: cli_arg.value = cli_arg.type(cli_arg_value) - environ[environ_key] = json.dumps( - { - cli_key.upper(): cli_arg.value - for cli_key, cli_arg in cli_args.items() - if cli_arg.value is not None - } - ) + + # Set an environment variable with the CLI arguments as JSON, + # if an environment variable key is provided + if environ_key is not None: + environ[environ_key] = json.dumps( + { + cli_key.upper(): cli_arg.value + for cli_key, cli_arg in cli_args.items() + } + ) @classmethod def load_from_environ( cls, - environ_key: str = "LLAMA_API_ARGS", - environ_key_prefix: str = "LLAMA_API_", + environ_key: str = DEFAULT_ENVIRON_KEY, + environ_key_prefix: Optional[str] = DEFAULT_ENVIRON_KEY_PREFIX, ) -> None: + """Load JSON CLI arguments from an environment variable. + If an environment variable key prefix is provided, + load CLI arguments from environment variables which start with + the prefix.""" json_str = environ.get(environ_key) assert ( json_str is not None ), f"Environment variable {environ_key} not found" + # Get all defined CLI arguments within the class cli_args = { cli_key: cli_arg for cli_key, cli_arg in cls.iterate_over_cli_args() } # type: Dict[str, CliArg] + + # Parse the CLI arguments from the JSON string + # and set the value of the CLI argument if it's not None, + # otherwise keep the default value cli_arg_values = json.loads(json_str) # type: Dict[str, Any] for cli_key, cli_value in cli_arg_values.items(): cli_key = cli_key.lower() @@ -95,6 +127,10 @@ def load_from_environ( cli_arg = cli_args[cli_key] cli_arg.value = cli_arg.type(cli_value) + # Parse the CLI arguments from environment variables, + # which start with the prefix + if environ_key_prefix is None: + return environ_key_prefix = environ_key_prefix.lower() prefix_length = len(environ_key_prefix) for key, value in environ.items(): @@ -111,6 +147,9 @@ def load_from_environ( @classmethod def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: + """Get all CLI arguments defined in the class, + including inherited classes. 
Yields a tuple of + (attribute name, CliArg)""" for _cls in cls.__mro__: for attr_name, attr_value in vars(_cls).items(): if isinstance(attr_value, CliArg): @@ -119,8 +158,7 @@ def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: @classmethod @property def parser(cls) -> argparse.ArgumentParser: - """Parse CLI arguments from environment variables, - and return the parser""" + """Return an argument parser with all CLI arguments""" arg_parser = argparse.ArgumentParser() for cli_key, cli_arg in cls.iterate_over_cli_args(): args = [] # type: List[str] @@ -178,11 +216,11 @@ class AppSettingsCliArgs(CliArgHelper): short_option="-no-cache", help="Disable caching of pip installs, if `install-pkgs` is set", ) - upgrade_pkgs: CliArg[bool] = CliArg( + upgrade: CliArg[bool] = CliArg( type=bool, action="store_true", short_option="u", - help="Upgrade all packages before running the server", + help="Upgrade all packages and repositories before running the server", ) @@ -230,12 +268,6 @@ class MainCliArgs(AppSettingsCliArgs): ) -class GitCloneArgs(TypedDict): - git_path: str - disk_path: str - options: Optional[List[str]] - - class Config: """Configuration for the project""" diff --git a/llama_api/utils/errors.py b/llama_api/utils/errors.py index 8f367f2..0f1356f 100644 --- a/llama_api/utils/errors.py +++ b/llama_api/utils/errors.py @@ -126,7 +126,7 @@ class RouteErrorHandler(APIRoute): ): ErrorResponseFormatters.model_not_found, } - api_key: Optional[str] = MainCliArgs.api_key.value + api_key: Optional[str] = MainCliArgs.api_key.value or None @cached_property def authorization(self) -> Optional[str]: diff --git a/readme.md b/readme.md index 3a74c87..65b29df 100644 --- a/readme.md +++ b/readme.md @@ -32,31 +32,32 @@ python -m main ``` Options: ```b -usage: main.py [-h] [-p PORT] [-w MAX_WORKERS] [-s MAX_SEMAPHORES] [-k API_KEY] [-x] [--no-embed] [-t] [-i] [-c] [--no-torch] [--no-tf] [--no-compile] [--no-cache] [-u] +usage: main.py [-h] [--port PORT] [--max-workers MAX_WORKERS] [--max-semaphores MAX_SEMAPHORES] [--api-key API_KEY] [--xformers] [--no-embed] [--tunnel] [--install-pkgs] [--force-cuda] [--skip-torch-install] [--skip-tf-install] [--skip-compile] + [--no-cache-dir] [--upgrade] options: -h, --help show this help message and exit - -p PORT, --port PORT Port to run the server on; default is 8000 - -w MAX_WORKERS, --max-workers MAX_WORKERS + --port PORT, -p PORT Port to run the server on; default is 8000 + --max-workers MAX_WORKERS, -w MAX_WORKERS Maximum number of process workers to run; default is 1 - -s MAX_SEMAPHORES, --max-semaphores MAX_SEMAPHORES + --max-semaphores MAX_SEMAPHORES, -s MAX_SEMAPHORES Maximum number of process semaphores to permit; default is 1 - -k API_KEY, --api-key API_KEY + --api-key API_KEY, -k API_KEY API key to use for the server - -x, --xformers Apply xformers' memory-efficient optimizations + --xformers, -x Apply xformers' memory-efficient optimizations --no-embed Disable embeddings endpoint - -t, --tunnel Tunnel the server through cloudflared - -i, --install-pkgs Install all required packages before running the server - -c, --force-cuda Force CUDA version of pytorch to be used when installing pytorch. e.g. torch==2.0.1+cu118 - --no-torch, --skip-torch-install + --tunnel, -t Tunnel the server through cloudflared + --install-pkgs, -i Install all required packages before running the server + --force-cuda, -c Force CUDA version of pytorch to be used when installing pytorch. e.g. 
torch==2.0.1+cu118 + --skip-torch-install, --no-torch Skip installing pytorch, if `install-pkgs` is set - --no-tf, --skip-tf-install + --skip-tf-install, --no-tf Skip installing tensorflow, if `install-pkgs` is set - --no-compile, --skip-compile + --skip-compile, --no-compile Skip compiling the shared library of LLaMA C++ code - --no-cache, --no-cache-dir + --no-cache-dir, --no-cache Disable caching of pip installs, if `install-pkgs` is set - -u, --upgrade-pkgs Upgrade all packages before running the server + --upgrade, -u Upgrade all packages and repositories before running the server ``` ### Unique features From 419f138c9336e5aca823d40a9aab0b463befd9e6 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 01:19:00 +0900 Subject: [PATCH 15/18] Refactored `get_event_publisher` --- llama_api/server/app_settings.py | 4 +- llama_api/server/routers/v1.py | 131 +++++++++++++------------------ llama_api/utils/completions.py | 23 +++++- 3 files changed, 79 insertions(+), 79 deletions(-) diff --git a/llama_api/server/app_settings.py b/llama_api/server/app_settings.py index c3a2a78..5212a3b 100644 --- a/llama_api/server/app_settings.py +++ b/llama_api/server/app_settings.py @@ -102,7 +102,9 @@ def initialize_before_launch() -> None: for git_clone_args in Config.repositories.values(): git_clone(**git_clone_args) if upgrade: - git_pull(git_clone_args["git_path"]) + git_pull( + git_clone_args["git_path"], options=["--recurse-submodules"] + ) # Install packages if install_packages: diff --git a/llama_api/server/routers/v1.py b/llama_api/server/routers/v1.py index de0901b..54c2d0f 100644 --- a/llama_api/server/routers/v1.py +++ b/llama_api/server/routers/v1.py @@ -8,7 +8,6 @@ from functools import partial from queue import Queue from random import choice -from threading import Event from time import time from typing import ( Any, @@ -28,9 +27,7 @@ Semaphore, create_memory_object_stream, get_cancelled_exc_class, - move_on_after, ) -from anyio.streams.memory import MemoryObjectSendStream from fastapi import APIRouter, Request from fastapi.concurrency import iterate_in_threadpool, run_in_threadpool from orjson import OPT_INDENT_2, dumps @@ -41,9 +38,7 @@ from ...mixins.completion import CompletionStatus from ...schemas.api import ( ChatCompletion, - ChatCompletionChunk, Completion, - CompletionChunk, CreateChatCompletionRequest, CreateCompletionRequest, CreateEmbeddingRequest, @@ -119,17 +114,29 @@ async def get_wix_with_semaphore( ) -> AsyncGenerator[int, None]: """Get the worker index (wix) for the key and acquire the semaphore""" global wix_metas + + # Get the worker index (wix) with the lowest rank + # If the rank is -2, then the worker is processing the same model + # If the rank is -1, then the worker is not processing any model + # If the rank is greater than or equal to 0, then the worker is processing + # a different model worker_ranks = [ get_worker_rank(wix_meta, request_key) for wix_meta in wix_metas ] min_rank = min(worker_ranks) + + # Choose a random worker index (wix) with the lowest rank candidates = [i for i, rank in enumerate(worker_ranks) if rank == min_rank] if not candidates: raise LookupError("No available wix") wix_meta = wix_metas[choice(candidates)] + + # Acquire the semaphore for the worker index (wix) async with wix_meta.semaphore: + # If client is already gone, then ignore the request if await request.is_disconnected(): return + # Reserve the worker, it is now processing the request wix_meta.processed_key = request_key yield wix_meta.wix @@ -145,60 +152,6 @@ def 
validate_item_type(item: Any, type: Type[T]) -> T: return item -def get_text_from_completion( - completion: Union[Completion, ChatCompletion] -) -> str: - """Get the generated text from a completion""" - if "text" in completion["choices"][0]: - return completion["choices"][0]["text"] - return completion["choices"][0]["message"]["content"] - - -def get_text_from_chunk( - chunk: Union[CompletionChunk, ChatCompletionChunk] -) -> str: - """Get the generated text from a completion chunk""" - if "text" in chunk["choices"][0]: - return chunk["choices"][0]["text"] - return chunk["choices"][0]["delta"].get("content", "") - - -async def get_event_publisher( - request: Request, - body: Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - ], - inner_send_chan: MemoryObjectSendStream[bytes], - task: "Task[CompletionStatus]", - interrupt_signal: Event, - iterator: Iterator[Union[ChatCompletionChunk, CompletionChunk]], -) -> None: - """Publish Server-Sent-Events (SSE) to the client""" - is_interrupted = False # type: bool - async with inner_send_chan: - try: - async for chunk in iterate_in_threadpool(iterator): - await inner_send_chan.send(b"data: " + dumps(chunk) + b"\n\n") - if await request.is_disconnected(): - raise get_cancelled_exc_class()() - await inner_send_chan.send(b"data: [DONE]\n\n") - except get_cancelled_exc_class(): - is_interrupted = True - with move_on_after(1, shield=True): - raise - finally: - # Cancel the producer task and set event, - # so the completion task can be stopped - interrupt_signal.set() - state = "Interrupted" if is_interrupted else "Completed" - try: - status = await wait_for(task, timeout=3) - log_request_and_response(body, status, state) - finally: - task.cancel() - - def get_streaming_iterator( queue: Queue, first_response: Optional[Dict] = None, @@ -225,8 +178,11 @@ def log_request_and_response( state: Literal["Completed", "Interrupted"], ) -> None: """Log the request and response of the completion or embedding""" + # If the status is None, then the request has been interrupted if status is None: return + + # Measure the elapsed time, and get information about the request elapsed_time = time() - status.started_at log_messages: List[str] = [f"elapsed time: {elapsed_time: .1f}s"] body_without_prompt = body.model_dump( @@ -240,18 +196,20 @@ def log_request_and_response( if isinstance(status, EmbeddingStatus) and isinstance( body, CreateEmbeddingRequest ): + # Embedding usage is the number of characters in the input + # and the number of chunks in the embedding embed_usage = { "input_chars": len(body.input), "embedding_chunks": len(status.embedding["data"]) if status.embedding else 0, - } + } # type: Dict[str, int] log_messages.append(f"embedding chunks: {embed_usage}") embed_log = { "request": body_without_prompt, "input": body.input, "embedding": status.embedding, - } + } # type: Dict[str, Any] logger.info( f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})" ) @@ -264,6 +222,7 @@ def log_request_and_response( tokens_per_second = tokens / elapsed_time log_messages.append(f"tokens: {tokens}({tokens_per_second: .1f}tok/s)") if isinstance(body, CreateChatCompletionRequest): + # Log the chat completion status chat_log = { "request": body_without_prompt, "chat": [ @@ -276,15 +235,16 @@ def log_request_and_response( "content": status.generated_text, } ], - } + } # type: Dict[str, Any] elif isinstance(body, CreateCompletionRequest): + # Log the text completion status chat_log = { "request": body_without_prompt, "prompt": { "user": body.prompt, 
"assistant": status.generated_text, }, - } + } # type: Dict[str, Any] else: return logger.info(f"🦙 [{state} for {body.model}]: ({' | '.join(log_messages)})") @@ -316,22 +276,39 @@ async def create_chat_completion_or_completion( ) if body.stream: send_chan, recv_chan = create_memory_object_stream(10) + chunk_iterator = get_streaming_iterator( + queue=queue, + first_response=validate_item_type( + await run_in_threadpool(queue.get), type=dict + ), + ) + + async def get_event_publisher() -> None: + # Publish Server-Sent-Events (SSE) to the client + is_interrupted = False # type: bool + send = send_chan.send + try: + async for chunk in iterate_in_threadpool(chunk_iterator): + await send(b"data: " + dumps(chunk) + b"\n\n") + await send(b"data: [DONE]\n\n") + except get_cancelled_exc_class(): + is_interrupted = True + finally: + send_chan.close() + # Cancel the producer task and set event, + # so the completion task can be stopped + interrupt_signal.set() + state = "Interrupted" if is_interrupted else "Completed" + try: + status = await wait_for(task, timeout=3) + log_request_and_response(body, status, state) + finally: + task.cancel() + return EventSourceResponse( recv_chan, - data_sender_callable=partial( - get_event_publisher, - request=request, - body=body, - inner_send_chan=send_chan, - task=task, - interrupt_signal=interrupt_signal, - iterator=get_streaming_iterator( # type: ignore - queue=queue, - first_response=validate_item_type( - await run_in_threadpool(queue.get), type=dict - ), - ), - ), + data_sender_callable=get_event_publisher, + ping=5, ) else: # Cancel the producer task and set event, diff --git a/llama_api/utils/completions.py b/llama_api/utils/completions.py index 6b696f3..4459aae 100644 --- a/llama_api/utils/completions.py +++ b/llama_api/utils/completions.py @@ -1,5 +1,5 @@ from time import time -from typing import Iterator, Literal, Optional +from typing import Iterator, Literal, Optional, Union from uuid import uuid4 from ..schemas.api import ( @@ -327,3 +327,24 @@ def convert_text_completion_chunks_to_chat( ) ], ) + + +# ==== GET TEXT FROM COMPLETION ==== # + + +def get_text_from_completion( + completion: Union[Completion, ChatCompletion] +) -> str: + """Get the generated text from a completion""" + if "text" in completion["choices"][0]: + return completion["choices"][0]["text"] + return completion["choices"][0]["message"]["content"] + + +def get_text_from_chunk( + chunk: Union[CompletionChunk, ChatCompletionChunk] +) -> str: + """Get the generated text from a completion chunk""" + if "text" in chunk["choices"][0]: + return chunk["choices"][0]["text"] + return chunk["choices"][0]["delta"].get("content", "") From 427d553782b9b63b36757bcb04e4ed5db24eda07 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 01:19:32 +0900 Subject: [PATCH 16/18] Added `required` option to `FunctionCallMixin` --- llama_api/mixins/function_call.py | 151 ++++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 40 deletions(-) diff --git a/llama_api/mixins/function_call.py b/llama_api/mixins/function_call.py index 7b7ae6e..5f7da00 100644 --- a/llama_api/mixins/function_call.py +++ b/llama_api/mixins/function_call.py @@ -34,7 +34,7 @@ # whitespace is constrained to a single space char # to prevent model "running away" in # whitespace. Also maybe improves generation quality? -SPACE_RULE: str = '" "?' +SPACE_RULE: str = "([ \t\n])?" 
PRIMITIVE_RULES: Dict[str, str] = { "boolean": '("true" | "false") space', @@ -60,7 +60,14 @@ "boolean", "number", "integer", "string", "null", "object", "array" ] SchemaKey = Literal[ - "type", "oneOf", "anyOf", "const", "enum", "properties", "items" + "type", + "oneOf", + "anyOf", + "const", + "enum", + "properties", + "items", + "required", ] @@ -104,24 +111,43 @@ def invoke_function_call_streaming( "function call is not implemented for this model" ) - @staticmethod + @classmethod + def from_json_schema( + cls, + schema: Union[Dict[SchemaKey, Any], str], + prop_order: Optional[Dict[str, int]] = None, + ) -> str: + """Parse a JSON schema into a BNF grammar""" + if isinstance(schema, str): + schema = json.loads(schema) + assert isinstance(schema, dict), "schema must be valid JSON" + self = cls() + self._prop_order = prop_order or {} + self._rules = {"space": SPACE_RULE} + self._visit(schema, "") + return self._format_grammar() + + @classmethod @overload def from_function_calls( + cls, function_calls: FunctionCall, prop_order: Optional[Dict[str, int]] = None, ) -> str: ... - @staticmethod + @classmethod @overload def from_function_calls( + cls, function_calls: Iterable[FunctionCall], prop_order: Optional[Dict[str, int]] = None, ) -> List[str]: ... - @staticmethod + @classmethod def from_function_calls( + cls, function_calls: Union[FunctionCall, Iterable[FunctionCall]], prop_order: Optional[Dict[str, int]] = None, ) -> Union[str, List[str]]: @@ -135,7 +161,7 @@ def from_function_calls( bnfs = [] # type: List[str] for function_call in function_calls: - self = FunctionCallMixin() + self = cls() self._prop_order = prop_order or {} self._rules = {"space": SPACE_RULE} parameters = function_call.to_dict().get("parameters") @@ -144,24 +170,27 @@ def from_function_calls( bnfs.append(self._format_grammar()) return bnfs if return_as_list else bnfs[0] - @staticmethod + @classmethod @overload def from_functions( + cls, functions: Callable, prop_order: Optional[Dict[str, int]] = None, ) -> str: ... - @staticmethod + @classmethod @overload def from_functions( + cls, functions: Iterable[Callable], prop_order: Optional[Dict[str, int]] = None, ) -> List[str]: ... 
- @staticmethod + @classmethod def from_functions( + cls, functions: Union[Callable, Iterable[Callable]], prop_order: Optional[Dict[str, int]] = None, ) -> Union[str, List[str]]: @@ -258,7 +287,7 @@ def _visit(self, schema: Dict[SchemaKey, Any], name: str) -> str: if "oneOf" in schema or "anyOf" in schema: # This is a union type - rule: str = " | ".join( + rule = " | ".join( ( self._visit(alt_schema, f'{name}{"-" if name else ""}{i}') for i, alt_schema in enumerate( @@ -282,24 +311,47 @@ def _visit(self, schema: Dict[SchemaKey, Any], name: str) -> str: return self._add_rule(rule_name, rule) elif schema_type == "object" and "properties" in schema: - # TODO: `required` keyword + required_properties = set( + schema.get("required", schema["properties"].keys()) + ) + if not required_properties: + raise ValueError( + "Object schema must have at least one required property if `required` is specified" + ) prop_order = self._prop_order prop_pairs = sorted( schema["properties"].items(), - # sort by position in prop_order (if specified) then by key key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), ) - rule = '"{" space' - for i, (prop_name, prop_schema) in enumerate(prop_pairs): + rule_parts = [] # type: List[str] + optional_rule_parts = [] # type: List[str] + first_property = True # type: bool + + for prop_name, prop_schema in prop_pairs: prop_rule_name = self._visit( prop_schema, f'{name}{"-" if name else ""}{prop_name}' ) - if i > 0: - rule += ' "," space' - rule += rf' {self._format_literal(prop_name)} space ":" space {prop_rule_name}' - rule += ' "}" space' + prop_str = rf'{self._format_literal(prop_name)} space ":" space {prop_rule_name}' + + if prop_name in required_properties: + if not first_property: + prop_str = rf'"," space {prop_str}' + rule_parts.append(prop_str) + first_property = False + else: + optional_rule_parts.append(prop_str) + + for i, optional_str in enumerate(optional_rule_parts): + if i == 0 and not rule_parts: + # if no required properties + combined_str = rf"({optional_str})?" + else: + combined_str = rf'("," space {optional_str})?' 
+ rule_parts.append(combined_str) + # Combine rules + rule = '"{" space ' + " ".join(rule_parts) + ' "}" space' return self._add_rule(rule_name, rule) elif schema_type == "array" and "items" in schema: @@ -326,7 +378,7 @@ def _format_grammar(self): if __name__ == "__main__": - # from llama_cpp import LlamaGrammar, Llama + from repositories.llama_cpp.llama_cpp import LlamaGrammar, Llama # Define a python function and parse it into a grammar def get_current_weather( @@ -340,32 +392,51 @@ def get_current_weather( ["fahrenheit", "celsius"], ], source: Annotated[ - str, + Optional[str], "The source of the weather information", ["openweathermap", "weatherapi"], ] = "openweathermap", ): """Get the current weather in a given location""" - model_path = "C:/Users/sdml/Desktop/orca-mini-3b.ggmlv3.q4_0.bin" + model_path = r"models\ggml\orca-mini-3b.ggmlv3.q4_0.bin" grammar: str = FunctionCallMixin.from_functions(get_current_weather) + # print(f"Grammar:\n{grammar}") + + json_schema = { + "type": "object", + "properties": { + "location": {"type": "string"}, + "unit": { + "type": "string", + "enum": ["fahrenheit", "celsius"], + }, + "source": { + "type": "string", + "enum": ["openweathermap", "weatherapi"], + }, + }, + "required": ["location", "unit"], + } # type: Dict[SchemaKey, Any] + grammar = FunctionCallMixin.from_json_schema(json_schema) print(f"Grammar:\n{grammar}") - # llama_grammar = LlamaGrammar.from_string(grammar, verbose=False) - # llm = Llama(model_path) - # llm.grammar = llama_grammar - # for city in ( - # "London", - # "Paris", - # "New York", - # "Berlin", - # "Tokyo", - # "Sydney", - # "Moscow", - # "Beijing", - # "Cairo", - # "Rome", - # ): - # print(llm(prompt=f"### User: What is the weather in {city} today? ### Assistant:")["choices"][0]["text"]) # type: ignore - - # # Output: - # # { "location": "London", "source": "openweathermap","unit" : "celsius"} + + llama_grammar = LlamaGrammar.from_string(grammar, verbose=False) + llm = Llama(model_path) + for city in ( + "London", + "Paris", + "New York", + "Berlin", + "Tokyo", + "Sydney", + "Moscow", + "Beijing", + "Cairo", + "Rome", + ): + output = llm(prompt=f"### User: What is the weather in {city} today? 
### Assistant:", grammar=llama_grammar)["choices"][0]["text"] # type: ignore + print(json.loads(output)) + + # Output: + # { "location": "London", "source": "openweathermap","unit" : "celsius"} From b1018430ad805ca054dd92f3e3bd6e4fea11b847 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 21:12:50 +0900 Subject: [PATCH 17/18] ci fail resolve - 1 --- llama_api/shared/config.py | 5 ++--- tests/test_cli.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index b211ed8..7b2e791 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -61,7 +61,7 @@ def load( environ_key_prefix: str = DEFAULT_ENVIRON_KEY_PREFIX, ) -> None: """Load CLI arguments from environment variables and CLI arguments""" - cls.load_from_namespace(cls.parser.parse_args()) + cls.load_from_namespace(cls.get_parser().parse_args()) cls.load_from_environ( environ_key=environ_key, environ_key_prefix=environ_key_prefix ) @@ -156,8 +156,7 @@ def iterate_over_cli_args(cls) -> Iterable[Tuple[str, CliArg]]: yield attr_name, attr_value @classmethod - @property - def parser(cls) -> argparse.ArgumentParser: + def get_parser(cls) -> argparse.ArgumentParser: """Return an argument parser with all CLI arguments""" arg_parser = argparse.ArgumentParser() for cli_key, cli_arg in cls.iterate_over_cli_args(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 621ef1e..df9ef72 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ class TestCLIArgs(unittest.TestCase): def test_cli_args(self): - parser = MainCliArgs.parser + parser = MainCliArgs.get_parser() environ_key = "LLAMA_CLI_ARGS" environ_key_prefix = "LLAMA_" From 7ea7b6cfce1a16edcce7ac45ae8f6bc1688fba61 Mon Sep 17 00:00:00 2001 From: c0sogi Date: Tue, 22 Aug 2023 21:20:59 +0900 Subject: [PATCH 18/18] ci fail resolve - 2 --- llama_api/mixins/completion.py | 2 +- llama_api/shared/config.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_api/mixins/completion.py b/llama_api/mixins/completion.py index 26f03f9..f5404bc 100644 --- a/llama_api/mixins/completion.py +++ b/llama_api/mixins/completion.py @@ -24,7 +24,7 @@ class CompletionStatus: class CompletionMixin: """A mixin for modules that support completion generation.""" - _completion_status: Optional[defaultdict[str, CompletionStatus]] = None + _completion_status: Optional["defaultdict[str, CompletionStatus]"] = None @property def completion_status(self) -> Dict[str, CompletionStatus]: diff --git a/llama_api/shared/config.py b/llama_api/shared/config.py index 7b2e791..3a9c2e2 100644 --- a/llama_api/shared/config.py +++ b/llama_api/shared/config.py @@ -295,11 +295,11 @@ class Config: "exllama": GitCloneArgs( git_path="https://github.com/turboderp/exllama", disk_path="repositories/exllama", - options=["recurse-submodules"], + options=None, ), "llama_cpp": GitCloneArgs( git_path="https://github.com/abetlen/llama-cpp-python", disk_path="repositories/llama_cpp", - options=None, + options=["--recurse-submodules"], ), }