Uses logger with formatted output across function calls #10

Merged · 2 commits · Jul 3, 2024
41 changes: 39 additions & 2 deletions llama_cpp/_logger.py
@@ -1,6 +1,7 @@
import sys
import ctypes
import logging
import logging.config

import llama_cpp

@@ -17,8 +18,38 @@
5: logging.DEBUG,
}

logger = logging.getLogger("llama-cpp-python")
UVICORN_LOGGING_CONFIG = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"standard": {"format": "%(asctime)s [%(levelname)s] %(message)s"},
},
"handlers": {
"default": {
"level": "INFO",
"formatter": "standard",
"class": "logging.StreamHandler",
"stream": "ext://sys.stdout", # Default is stderr
},
},
"loggers": {
"uvicorn.error": {
"level": "DEBUG",
"handlers": ["default"],
},
"uvicorn.access": {
"level": "DEBUG",
"handlers": ["default"],
},
},
}

# Set up llama-cpp-python logger matching the format of uvicorn logger
logger = logging.getLogger("llama-cpp-python")
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

@llama_cpp.llama_log_callback
def llama_log_callback(
@@ -27,7 +58,13 @@ def llama_log_callback(
user_data: ctypes.c_void_p,
):
if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
_text = text.decode("utf-8")
if _text.endswith("\n"):
_text = _text[:-1]

# Skip if the message only contains "."
if not _text == ".":
logger.log(GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level], _text)


llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
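Not part of the diff: a minimal sketch of how the reworked module-level logger is meant to be driven from application code. The behaviour of set_verbose (imported below in llama.py) is an assumption here; only logger and UVICORN_LOGGING_CONFIG are defined in this file.

import logging
from llama_cpp._logger import logger, set_verbose

set_verbose(True)             # assumed to lower the logger level so INFO/DEBUG records pass
logger.info("loading model")  # the handler above writes to stderr, e.g.
                              # 2024-07-03 12:00:00,000 - [INFO] - loading model

# The level can also be adjusted through the standard logging API:
logging.getLogger("llama-cpp-python").setLevel(logging.WARNING)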
23 changes: 12 additions & 11 deletions llama_cpp/llama.py
@@ -66,7 +66,7 @@
_LlamaSamplingContext, # type: ignore
_normalize_embedding, # type: ignore
)
from ._logger import set_verbose
from ._logger import set_verbose, logger
from ._utils import suppress_stdout_stderr


@@ -403,7 +403,7 @@ def __init__(
)

if self.verbose:
print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
logger.info(f'System info: {llama_cpp.llama_print_system_info().decode("utf-8")}')

self.chat_format = chat_format
self.chat_handler = chat_handler
@@ -434,10 +434,10 @@ def __init__(
except Exception as e:
self.metadata = {}
if self.verbose:
print(f"Failed to load metadata: {e}", file=sys.stderr)
logger.error(f"Failed to load metadata: {e}")

if self.verbose:
print(f"Model metadata: {self.metadata}", file=sys.stderr)
logger.info(f"Model metadata: {self.metadata}")

eos_token_id = self.token_eos()
bos_token_id = self.token_bos()
@@ -452,7 +452,7 @@ def __init__(
template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]

if self.verbose and template_choices:
print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
logger.info(f"Available chat formats from metadata: {', '.join(template_choices.keys())}")

for name, template in template_choices.items():
self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
@@ -474,19 +474,19 @@
if chat_format is not None:
self.chat_format = chat_format
if self.verbose:
print(f"Guessed chat format: {chat_format}", file=sys.stderr)
logger.info(f"Guessed chat format: {chat_format}")
else:
if self.verbose:
print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
logger.info(f"Using gguf chat template: {template_choices['chat_template.default']}")
logger.info(f"Using chat eos_token: {eos_token}")
logger.info(f"Using chat bos_token: {bos_token}")

self.chat_format = "chat_template.default"

if self.chat_format is None and self.chat_handler is None:
self.chat_format = "llama-2"
if self.verbose:
print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
logger.info(f"Using fallback chat format: {self.chat_format}")

@property
def ctx(self) -> llama_cpp.llama_context_p:
@@ -728,7 +728,8 @@ def generate(
break
if longest_prefix > 0:
if self.verbose:
print("Llama.generate: prefix-match hit", file=sys.stderr)
# print("Llama.generate: prefix-match hit", file=sys.stderr)
logger.info("Llama.generate: prefix-match hit")
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
4 changes: 3 additions & 1 deletion llama_cpp/server/__main__.py
@@ -37,6 +37,7 @@
ConfigFileSettings,
)
from llama_cpp.server.cli import add_args_from_model, parse_model_from_args
from llama_cpp._logger import logger, UVICORN_LOGGING_CONFIG


def main():
@@ -75,7 +76,7 @@ def main():
server_settings = parse_model_from_args(ServerSettings, args)
model_settings = [parse_model_from_args(ModelSettings, args)]
except Exception as e:
print(e, file=sys.stderr)
logger.error(e)
parser.print_help()
sys.exit(1)
assert server_settings is not None
@@ -90,6 +91,7 @@
port=int(os.getenv("PORT", server_settings.port)),
ssl_keyfile=server_settings.ssl_keyfile,
ssl_certfile=server_settings.ssl_certfile,
log_config=UVICORN_LOGGING_CONFIG
)


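For context, a stripped-down sketch of how the exported UVICORN_LOGGING_CONFIG is consumed: uvicorn.run accepts a dictConfig-style dict via log_config, so uvicorn.error and uvicorn.access pick up the stdout handler defined in _logger.py. The FastAPI app here is a placeholder; the real server builds its app from the parsed ServerSettings/ModelSettings.

import uvicorn
from fastapi import FastAPI
from llama_cpp._logger import UVICORN_LOGGING_CONFIG

app = FastAPI()  # placeholder; llama_cpp.server constructs its own app from the settings

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000, log_config=UVICORN_LOGGING_CONFIG)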
5 changes: 3 additions & 2 deletions llama_cpp/server/app.py
@@ -51,6 +51,7 @@
from llama_cpp.server.errors import RouteErrorHandler
from llama_cpp._utils import monitor_task_queue
from llama_cpp.llama_metrics import MetricsExporter
from llama_cpp._logger import logger


router = APIRouter(route_class=RouteErrorHandler)
@@ -211,9 +212,9 @@ async def get_event_publisher(
raise anyio.get_cancelled_exc_class()()
await inner_send_chan.send(dict(data="[DONE]"))
except anyio.get_cancelled_exc_class() as e:
print("disconnected")
logger.warning(f"Disconnected from client {request.client}")
with anyio.move_on_after(1, shield=True):
print(f"Disconnected from client (via refresh/close) {request.client}")
logger.error(f"Disconnected from client (via refresh/close) {request.client}")
raise e
finally:
if on_complete:
3 changes: 2 additions & 1 deletion llama_cpp/server/errors.py
@@ -21,6 +21,7 @@
CreateEmbeddingRequest,
CreateChatCompletionRequest,
)
from llama_cpp._logger import logger


class ErrorResponse(TypedDict):
@@ -134,7 +135,7 @@ def error_message_wrapper(
] = None,
) -> Tuple[int, ErrorResponse]:
"""Wraps error message in OpenAI style error response"""
print(f"Exception: {str(error)}", file=sys.stderr)
logger.error(f"Exception: {str(error)}")
traceback.print_exc(file=sys.stderr)
if body is not None and isinstance(
body,
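Design note (not in the patch): error_message_wrapper now logs the exception message through the shared logger but still prints the traceback to raw stderr via traceback.print_exc. A hypothetical alternative would be to attach the traceback to the log record itself, so it is rendered by the configured handler and formatter as well:

import logging

logger = logging.getLogger("llama-cpp-python")

try:
    raise ValueError("boom")  # stand-in for a request-handling failure
except ValueError as error:
    # exc_info=error makes the handler print the traceback alongside the message
    logger.error(f"Exception: {error}", exc_info=error)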
5 changes: 3 additions & 2 deletions llama_cpp/server/model.py
@@ -9,6 +9,7 @@
import llama_cpp.llama_tokenizer as llama_tokenizer

from llama_cpp.server.settings import ModelSettings
from llama_cpp._logger import logger


class LlamaProxy:
@@ -272,11 +273,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
if settings.cache:
if settings.cache_type == "disk":
if settings.verbose:
print(f"Using disk cache with size {settings.cache_size}")
logger.info(f"Using disk cache with size {settings.cache_size}")
cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
else:
if settings.verbose:
print(f"Using ram cache with size {settings.cache_size}")
logger.info(f"Using ram cache with size {settings.cache_size}")
cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
_model.set_cache(cache)
return _model