Merge pull request #2 from ZenHubHQ/llm-monitoring
Use 'ai_service' parameter to differentiate Prometheus metrics
juanroesel authored May 15, 2024
2 parents 8cd638c + 67ae74a commit c538a13
Showing 11 changed files with 184 additions and 41 deletions.
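
Note: at a high level, this change threads an ai_service string from the server request body through create_chat_completion / create_completion down to Llama._create_completion, where it becomes the "service" label on the exported Prometheus metrics. A minimal usage sketch against this fork's Python API -- the model path is illustrative (taken from dev.Dockerfile) and "copilot" mirrors the example added to the server docs below:

from llama_cpp import Llama

# Illustrative local setup -- the model file is just the one referenced in dev.Dockerfile.
llm = Llama(model_path="./models/mistral-7b-openorca.Q5_K_M.gguf")

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    ai_service="copilot",  # becomes the "service" label on the exported Prometheus metrics
)
print(response["choices"][0]["message"]["content"])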
1 change: 1 addition & 0 deletions .dockerignore
@@ -2,6 +2,7 @@ _skbuild/

.envrc

# LLMs - comment if you'd like to bake the model into the image
models/

# Byte-compiled / optimized / DLL files
44 changes: 44 additions & 0 deletions dev.Dockerfile
@@ -0,0 +1,44 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3.11.8

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
libopenblas-dev \
build-essential \
git

RUN mkdir /app
WORKDIR /app
COPY . /app

RUN python3 -m pip install --upgrade pip

RUN make deps && make build && make clean

# Set environment variable for the host
ENV GH_TOKEN=$GH_TOKEN
ENV HOST=0.0.0.0
ENV PORT=8000
ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf

# # Install dependencies
# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client

# # Install llama-cpp-python (build with METAL)
# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/docker/simple/run.sh"]
# CMD python3 -m llama_cpp.server --n_gpu_layers -1
15 changes: 15 additions & 0 deletions dev.docker-compose
@@ -0,0 +1,15 @@
version: '3'
services:
  dev-llama-cpp-python:
    build:
      context: .
      dockerfile: dev.Dockerfile
    ports:
      - 8000:8000
    volumes:
      - ./llama_cpp:/app/llama_cpp
    networks:
      - zh-service-network
networks:
  zh-service-network:
    external: true
3 changes: 2 additions & 1 deletion docker/simple/run.sh
@@ -1,4 +1,5 @@
#!/bin/bash

make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1
26 changes: 2 additions & 24 deletions llama_cpp/_utils.py
@@ -3,7 +3,7 @@
import psutil
import subprocess

from typing import Any, Dict, List
from typing import Any, Dict, List, Tuple, Union

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -112,7 +112,7 @@ def get_gpu_info_by_pid(pid) -> float:
pass
return 0.0

def get_gpu_general_info() -> tuple[float, float, float]:
def get_gpu_general_info() -> Tuple[float, float, float]:
"""
GPU general info (if GPU is available)
"""
@@ -123,25 +123,3 @@ def get_gpu_general_info():
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0, 0.0, 0.0

def infer_service_from_prompt(prompt: str | List[str]):
"""
Infer the service for which a completion request is sent based on the prompt.
"""
LABEL_SUGGESTIONS_TASK = "Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided."
ACCEPTANCE_CRITERIA_TASK = "Your task is to write the acceptance criteria for a GitHub issue."
SPRINT_REVIEW_TASK = "You are helping me prepare a sprint review."

if isinstance(prompt, list):
prompt = " ".join(prompt)

if LABEL_SUGGESTIONS_TASK in prompt:
return "label-suggestions"

elif ACCEPTANCE_CRITERIA_TASK in prompt:
return "acceptance-criteria"

elif SPRINT_REVIEW_TASK in prompt:
return "sprint-review"

return "not-specified"
77 changes: 73 additions & 4 deletions llama_cpp/llama.py
@@ -41,7 +41,6 @@
from llama_cpp.llama_metrics import Metrics, MetricsExporter

from llama_cpp._utils import (
infer_service_from_prompt,
get_cpu_usage,
get_ram_usage,
get_gpu_info_by_pid,
@@ -70,6 +69,7 @@ class Llama:
"""High-level Python wrapper for a llama.cpp model."""

__backend_initialized = False
__prometheus_metrics = MetricsExporter()

def __init__(
self,
@@ -464,7 +464,7 @@ def __init__(
print(f"Using fallback chat format: {chat_format}", file=sys.stderr)

# Prometheus metrics
self.metrics = MetricsExporter()
self.metrics = self.__prometheus_metrics

@property
def ctx(self) -> llama_cpp.llama_context_p:
@@ -960,6 +960,7 @@ def _create_completion(
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
ai_service: Optional[str] = None
) -> Union[
Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
]:
@@ -974,8 +975,10 @@ def _create_completion(
_ttft_start = time.time()
_pid = os.getpid()
_tpot_metrics = []
if not ai_service:
raise ValueError("ai_service must be provided")
_labels = {
"service": infer_service_from_prompt(prompt), # Infer the service for which the completion is being generated
"service": ai_service if ai_service is not None else "not-specified",
"request_type": "chat/completions",
}
# Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process
@@ -1278,6 +1281,14 @@ def logit_bias_processor(

token_end_position = 0
for token in remaining_tokens:
# Record TTFT metric (once)
if idx == 0:
_metrics_dict["time_to_first_token"] = time.time() - _ttft_start
# Record TPOT metric
else:
_tpot_metrics.append(time.time() - _tpot_start)
_tpot_start = time.time() # reset

token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))

logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1371,6 +1382,53 @@ def logit_bias_processor(
print("Llama._create_completion: cache save", file=sys.stderr)
self.cache[prompt_tokens + completion_tokens] = self.save_state()
print("Llama._create_completion: cache saved", file=sys.stderr)

## PROMETHEUS METRICS IN STREAMING MODE ##
# Record TTFT metric -- Setting to None if no tokens were generated
if not _metrics_dict.get("time_to_first_token"):
_metrics_dict["time_to_first_token"] = None

# Record TPOT metrics (per generated token)
_metrics_dict["time_per_output_token"] = _tpot_metrics

# Record metrics from the C++ backend (converted to seconds)
_timings = llama_cpp.llama_get_timings(self._ctx.ctx)
_metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
_metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
_metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
_metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
_metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
_metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
_metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
_metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)

# Record prefill and generation token metrics
_metrics_dict["prefill_tokens"] = len(prompt_tokens)
_metrics_dict["generation_tokens"] = len(completion_tokens)

# Record system info
_gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
_metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: Returning always 0.0 -> check
_metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
_metrics_dict["gpu_utilization"] = _gpu_utilization
_metrics_dict["gpu_ram_usage"] = _gpu_memory_used
_metrics_dict["gpu_ram_free"] = _gpu_memory_free
_metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
_metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
_metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
_metrics_dict["system_info"] = {
"model": model_name,
"n_params": str(llama_cpp.llama_model_n_params(self.model)),
"n_embd": str(self.n_embd()),
"n_ctx": str(self.n_ctx()),
"n_vocab": str(self.n_vocab()),
"n_threads": str(self.n_threads)
}

# Log metrics to Prometheus
_all_metrics = Metrics(**_metrics_dict)
self.metrics.log_metrics(_all_metrics, labels=_labels)

return

if self.cache:
@@ -1446,6 +1504,11 @@ def logit_bias_processor(
"top_logprobs": top_logprobs,
}

## PROMETHEUS METRICS IN CHAT COMPLETION MODE ##
# Record TTFT metric -- Setting to None if no tokens were generated
if not _metrics_dict.get("time_to_first_token"):
_metrics_dict["time_to_first_token"] = None

# Record TPOT metrics (per generated token)
_metrics_dict["time_per_output_token"] = _tpot_metrics

@@ -1484,7 +1547,6 @@ def logit_bias_processor(
}

# Log metrics to Prometheus
#print(_metrics_dict, file=sys.stderr)
_all_metrics = Metrics(**_metrics_dict)
self.metrics.log_metrics(_all_metrics, labels=_labels)

@@ -1493,6 +1555,7 @@ def logit_bias_processor(
"object": "text_completion",
"created": created,
"model": model_name,
"service": ai_service,
"choices": [
{
"text": text_str,
@@ -1535,6 +1598,7 @@ def create_completion(
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
ai_service: Optional[str] = None
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt.
@@ -1598,6 +1662,7 @@ def create_completion(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)
if stream:
chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
@@ -1632,6 +1697,7 @@ def __call__(
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
ai_service: Optional[str] = None
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt.
@@ -1695,6 +1761,7 @@ def __call__(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)

def create_chat_completion(
@@ -1727,6 +1794,7 @@ def create_chat_completion(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
ai_service: Optional[str] = None
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
@@ -1796,6 +1864,7 @@ def create_chat_completion(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)

def create_chat_completion_openai_v1(
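
Note: two details in this file are easy to miss. First, MetricsExporter() is now created once as a class attribute (__prometheus_metrics) and shared by every Llama instance; constructing an exporter per instance would try to register the same collectors with prometheus_client's default registry a second time, which the library rejects. Second, _create_completion now fails fast when ai_service is missing. A simplified sketch of the shared-exporter pattern -- this is not the repository's MetricsExporter, just the idea, with an illustrative metric name:

from prometheus_client import Counter

class SharedExporter:
    # Created once at class-definition time, so every instance reuses the same collector.
    _requests = Counter(
        name="llama_cpp_python:requests_total",
        documentation="Completion requests by service",
        labelnames=["service"],
    )

    def log_request(self, service: str) -> None:
        self._requests.labels(service=service).inc()

# Both instances write to the same underlying time series.
SharedExporter().log_request("copilot")
SharedExporter().log_request("copilot")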
5 changes: 5 additions & 0 deletions llama_cpp/llama_chat_format.py
@@ -87,6 +87,7 @@ def __call__(
grammar: Optional[llama.LlamaGrammar] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
ai_service: Optional[str] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
@@ -535,6 +536,7 @@ def chat_completion_handler(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
ai_service: Optional[str] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
@@ -625,6 +627,7 @@ def chat_completion_handler(
stopping_criteria=stopping_criteria,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)
if tool is not None:
tool_name = tool["function"]["name"]
@@ -1715,6 +1718,7 @@ def functionary_v1_v2_chat_handler(
model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None,
ai_service: Optional[str] = None,
**kwargs, # type: ignore
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
@@ -1931,6 +1935,7 @@ def prepare_messages_for_inference(
model=model,
logits_processor=logits_processor,
grammar=grammar,
ai_service=ai_service
)
if stream is False:
completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
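
Note: the handler changes in this file all follow one pattern: accept ai_service as an optional keyword and pass it through to llama.create_completion so the label survives the chat-formatting layer. The forwarding in isolation looks roughly like this -- a hedged sketch, not a complete chat handler; real handlers also convert the completion into a chat-completion response:

from typing import Any, Dict, List, Optional

def forward_ai_service(llama: Any, messages: List[Dict[str, str]],
                       ai_service: Optional[str] = None, **kwargs):
    # Deliberately naive prompt formatting, for illustration only.
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
    return llama.create_completion(prompt=prompt, ai_service=ai_service, **kwargs)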
7 changes: 4 additions & 3 deletions llama_cpp/llama_metrics.py
@@ -45,7 +45,7 @@ class MetricsExporter:
def __init__(self):
self.labels = LABELS
# One-time metrics
self._histrogram_load_time = Histogram(
self._histogram_load_time = Histogram(
name="llama_cpp_python:load_t_seconds",
documentation="Histogram of load time in seconds",
labelnames=self.labels,
@@ -194,9 +194,10 @@ def log_metrics(self, metrics: Metrics, labels: Dict[str, str]):
"""
Log the metrics using the Prometheus client.
"""
self._histrogram_load_time.labels(**labels).observe(metrics.load_time)
self._histogram_load_time.labels(**labels).observe(metrics.load_time)
self._histogram_sample_time.labels(**labels).observe(metrics.sample_time)
self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)
if metrics.time_to_first_token:
self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)
for _tpot in metrics.time_per_output_token:
self._histogram_time_per_output_token.labels(**labels).observe(_tpot)
self._histogram_prompt_eval_time.labels(**labels).observe(metrics.prompt_eval_time)
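
Note: the renamed histogram (_histrogram_load_time -> _histogram_load_time) and the None-guard around time_to_first_token are small fixes; the labels are what carry ai_service into Prometheus. With prometheus_client, a labelled histogram splits into one time series per label combination, which is how per-service latency becomes queryable. A standalone sketch with illustrative names and values:

from prometheus_client import Histogram

ttft = Histogram(
    name="llama_cpp_python:time_to_first_token_seconds",  # illustrative name
    documentation="Histogram of time to first token in seconds",
    labelnames=["service", "request_type"],
)

# Each distinct "service" value produces its own series.
ttft.labels(service="copilot", request_type="chat/completions").observe(0.21)
ttft.labels(service="label-suggestions", request_type="chat/completions").observe(0.35)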
6 changes: 6 additions & 0 deletions llama_cpp/server/app.py
@@ -386,6 +386,7 @@ async def create_chat_completion(
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
],
"ai_service": "copilot"
},
},
"json_mode": {
@@ -454,6 +455,11 @@ async def create_chat_completion(
"user",
}
kwargs = body.model_dump(exclude=exclude)

# Adds the ai_service value from the request body to the kwargs
# to be passed downstream to the llama_cpp.ChatCompletion object
kwargs["ai_service"] = body.ai_service

llama = llama_proxy(body.model)
if body.logit_bias is not None:
kwargs["logit_bias"] = (
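
Note: on the server side the field travels in the JSON request body and is copied from the parsed Pydantic model into the kwargs handed to the llama object. A hedged client-side sketch -- it assumes the server from dev.docker-compose is listening on port 8000, and the requests call is illustrative:

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        "ai_service": "copilot",  # surfaces as the "service" label on the exported metrics
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])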