diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 7638170a9..f3407c92b 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,7 @@ import sys import ctypes import logging +import logging.config import llama_cpp @@ -17,8 +18,38 @@ 5: logging.DEBUG, } -logger = logging.getLogger("llama-cpp-python") +UVICORN_LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": {"format": "%(asctime)s [%(levelname)s] %(message)s"}, + }, + "handlers": { + "default": { + "level": "INFO", + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", # Default is stderr + }, + }, + "loggers": { + "uvicorn.error": { + "level": "DEBUG", + "handlers": ["default"], + }, + "uvicorn.access": { + "level": "DEBUG", + "handlers": ["default"], + }, + }, +} +# Set up llama-cpp-python logger matching the format of uvicorn logger +logger = logging.getLogger("llama-cpp-python") +handler = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) @llama_cpp.llama_log_callback def llama_log_callback( @@ -27,7 +58,13 @@ def llama_log_callback( user_data: ctypes.c_void_p, ): if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: - print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) + _text = text.decode("utf-8") + if _text.endswith("\n"): + _text = _text[:-1] + + # Skip if the message only contains "." + if not _text == ".": + logger.log(GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level], _text) llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4d10eaaeb..476d75d54 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -66,7 +66,7 @@ _LlamaSamplingContext, # type: ignore _normalize_embedding, # type: ignore ) -from ._logger import set_verbose +from ._logger import set_verbose, logger from ._utils import suppress_stdout_stderr @@ -403,7 +403,7 @@ def __init__( ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + logger.info(f'System info: {llama_cpp.llama_print_system_info().decode("utf-8")}') self.chat_format = chat_format self.chat_handler = chat_handler @@ -434,10 +434,10 @@ def __init__( except Exception as e: self.metadata = {} if self.verbose: - print(f"Failed to load metadata: {e}", file=sys.stderr) + logger.error(f"Failed to load metadata: {e}") if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) + logger.info(f"Model metadata: {self.metadata}") eos_token_id = self.token_eos() bos_token_id = self.token_bos() @@ -452,7 +452,7 @@ def __init__( template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"] if self.verbose and template_choices: - print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr) + logger.info(f"Available chat formats from metadata: {', '.join(template_choices.keys())}") for name, template in template_choices.items(): self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( @@ -474,19 +474,19 @@ def __init__( if chat_format is not None: self.chat_format = chat_format if self.verbose: - print(f"Guessed chat format: {chat_format}", file=sys.stderr) + logger.info(f"Guessed chat format: {chat_format}") else: if self.verbose: - print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr) - print(f"Using chat eos_token: {eos_token}", file=sys.stderr) - print(f"Using chat bos_token: {bos_token}", file=sys.stderr) + logger.info(f"Using gguf chat template: {template_choices['chat_template.default']}") + logger.info(f"Using chat eos_token: {eos_token}") + logger.info(f"Using chat bos_token: {bos_token}") self.chat_format = "chat_template.default" if self.chat_format is None and self.chat_handler is None: self.chat_format = "llama-2" if self.verbose: - print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr) + logger.info(f"Using fallback chat format: {self.chat_format}") @property def ctx(self) -> llama_cpp.llama_context_p: @@ -728,7 +728,8 @@ def generate( break if longest_prefix > 0: if self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) + # print("Llama.generate: prefix-match hit", file=sys.stderr) + logger.info("Llama.generate: prefix-match hit") reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a6f1f4e9c..6ced7bc10 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -37,6 +37,7 @@ ConfigFileSettings, ) from llama_cpp.server.cli import add_args_from_model, parse_model_from_args +from llama_cpp._logger import logger, UVICORN_LOGGING_CONFIG def main(): @@ -75,7 +76,7 @@ def main(): server_settings = parse_model_from_args(ServerSettings, args) model_settings = [parse_model_from_args(ModelSettings, args)] except Exception as e: - print(e, file=sys.stderr) + logger.error(e) parser.print_help() sys.exit(1) assert server_settings is not None @@ -90,6 +91,7 @@ def main(): port=int(os.getenv("PORT", server_settings.port)), ssl_keyfile=server_settings.ssl_keyfile, ssl_certfile=server_settings.ssl_certfile, + log_config=UVICORN_LOGGING_CONFIG ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4a00bf0e2..ed90fa155 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -51,6 +51,7 @@ from llama_cpp.server.errors import RouteErrorHandler from llama_cpp._utils import monitor_task_queue from llama_cpp.llama_metrics import MetricsExporter +from llama_cpp._logger import logger router = APIRouter(route_class=RouteErrorHandler) @@ -211,9 +212,9 @@ async def get_event_publisher( raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: - print("disconnected") + logger.warning(f"Disconnected from client {request.client}") with anyio.move_on_after(1, shield=True): - print(f"Disconnected from client (via refresh/close) {request.client}") + logger.error(f"Disconnected from client (via refresh/close) {request.client}") raise e finally: if on_complete: diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index fbf9fd80d..826e7ed94 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -21,6 +21,7 @@ CreateEmbeddingRequest, CreateChatCompletionRequest, ) +from llama_cpp._logger import logger class ErrorResponse(TypedDict): @@ -134,7 +135,7 @@ def error_message_wrapper( ] = None, ) -> Tuple[int, ErrorResponse]: """Wraps error message in OpenAI style error response""" - print(f"Exception: {str(error)}", file=sys.stderr) + logger.error(f"Exception: {str(error)}") traceback.print_exc(file=sys.stderr) if body is not None and isinstance( body, diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ad39c1004..c35fd37dd 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,6 +9,7 @@ import llama_cpp.llama_tokenizer as llama_tokenizer from llama_cpp.server.settings import ModelSettings +from llama_cpp._logger import logger class LlamaProxy: @@ -272,11 +273,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: if settings.cache: if settings.cache_type == "disk": if settings.verbose: - print(f"Using disk cache with size {settings.cache_size}") + logger.info(f"Using disk cache with size {settings.cache_size}") cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: - print(f"Using ram cache with size {settings.cache_size}") + logger.info(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) _model.set_cache(cache) return _model