Use 'ai_service' parameter to differentiate Prometheus metrics #2

Merged (10 commits) on May 15, 2024
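This PR threads an explicit ai_service identifier from the client request body down to the Prometheus metrics exporter, replacing the prompt-based infer_service_from_prompt heuristic removed from llama_cpp/_utils.py. As a rough client-side sketch (the /v1/chat/completions route and port 8000 are assumptions based on the default llama_cpp.server setup, not something introduced by this diff), a tagged request could look like this:

# Hedged sketch of a request carrying the new "ai_service" field; the endpoint
# path and port are assumptions, only the body shape comes from this PR.
import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    "ai_service": "copilot",  # surfaces as the "service" label on the exported metrics
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json()["choices"][0]["message"]["content"])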
1 change: 1 addition & 0 deletions .dockerignore
@@ -2,6 +2,7 @@ _skbuild/

.envrc

# LLMs - comment this line out if you'd like to bake the model into the image
models/

# Byte-compiled / optimized / DLL files
44 changes: 44 additions & 0 deletions dev.Dockerfile
@@ -0,0 +1,44 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3.11.8

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE

# Update and upgrade existing packages, then install build dependencies
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
libopenblas-dev \
build-essential \
git

RUN mkdir /app
WORKDIR /app
COPY . /app

RUN python3 -m pip install --upgrade pip

RUN make deps && make build && make clean

# Set environment variables for the server
ENV GH_TOKEN=$GH_TOKEN
ENV HOST=0.0.0.0
ENV PORT=8000
ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf

# # Install dependencies
# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client

# # Install llama-cpp-python (build with METAL)
# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/docker/simple/run.sh"]
# CMD python3 -m llama_cpp.server --n_gpu_layers -1
15 changes: 15 additions & 0 deletions dev.docker-compose
@@ -0,0 +1,15 @@
version: '3'
services:
dev-llama-cpp-python:
build:
context: .
dockerfile: dev.Dockerfile
ports:
- 8000:8000
volumes:
- ./llama_cpp:/app/llama_cpp
networks:
- zh-service-network
networks:
zh-service-network:
external: true
3 changes: 2 additions & 1 deletion docker/simple/run.sh
@@ -1,4 +1,5 @@
#!/bin/bash

make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1
26 changes: 2 additions & 24 deletions llama_cpp/_utils.py
@@ -3,7 +3,7 @@
import psutil
import subprocess

from typing import Any, Dict, List
from typing import Any, Dict, List, Tuple, Union

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -112,7 +112,7 @@ def get_gpu_info_by_pid(pid) -> float:
pass
return 0.0

def get_gpu_general_info() -> tuple[float, float, float]:
def get_gpu_general_info() -> Tuple[float, float, float]:
"""
GPU general info (if GPU is available)
"""
@@ -123,25 +123,3 @@ def get_gpu_general_info() -> tuple[float, float, float]:
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0, 0.0, 0.0

def infer_service_from_prompt(prompt: str | List[str]):
"""
Infer the service for which a completion request is sent based on the prompt.
"""
LABEL_SUGGESTIONS_TASK = "Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided."
ACCEPTANCE_CRITERIA_TASK = "Your task is to write the acceptance criteria for a GitHub issue."
SPRINT_REVIEW_TASK = "You are helping me prepare a sprint review."

if isinstance(prompt, list):
prompt = " ".join(prompt)

if LABEL_SUGGESTIONS_TASK in prompt:
return "label-suggestions"

elif ACCEPTANCE_CRITERIA_TASK in prompt:
return "acceptance-criteria"

elif SPRINT_REVIEW_TASK in prompt:
return "sprint-review"

return "not-specified"
22 changes: 17 additions & 5 deletions llama_cpp/llama.py
@@ -41,7 +41,6 @@
from llama_cpp.llama_metrics import Metrics, MetricsExporter

from llama_cpp._utils import (
infer_service_from_prompt,
get_cpu_usage,
get_ram_usage,
get_gpu_info_by_pid,
@@ -70,6 +69,7 @@ class Llama:
"""High-level Python wrapper for a llama.cpp model."""

__backend_initialized = False
__prometheus_metrics = MetricsExporter()

def __init__(
self,
@@ -464,7 +464,7 @@ def __init__(
print(f"Using fallback chat format: {chat_format}", file=sys.stderr)

# Prometheus metrics
self.metrics = MetricsExporter()
self.metrics = self.__prometheus_metrics

@property
def ctx(self) -> llama_cpp.llama_context_p:
@@ -960,6 +960,7 @@ def _create_completion(
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
ai_service: Optional[str] = None
) -> Union[
Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
]:
@@ -974,8 +975,10 @@
_ttft_start = time.time()
_pid = os.getpid()
_tpot_metrics = []
if not ai_service:
raise ValueError("ai_service must be provided")
_labels = {
"service": infer_service_from_prompt(prompt), # Infer the service for which the completion is being generated
"service": ai_service if ai_service is not None else "not-specified",
"request_type": "chat/completions",
}
# Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process
@@ -1445,7 +1448,10 @@ def logit_bias_processor(
"token_logprobs": token_logprobs,
"top_logprobs": top_logprobs,
}

# Record TTFT metric -- Setting to None if no tokens were generated
if not _metrics_dict.get("time_to_first_token"):
_metrics_dict["time_to_first_token"] = None

# Record TPOT metrics (per generated token)
_metrics_dict["time_per_output_token"] = _tpot_metrics

@@ -1484,7 +1490,6 @@ def logit_bias_processor(
}

# Log metrics to Prometheus
#print(_metrics_dict, file=sys.stderr)
_all_metrics = Metrics(**_metrics_dict)
self.metrics.log_metrics(_all_metrics, labels=_labels)

@@ -1493,6 +1498,7 @@
"object": "text_completion",
"created": created,
"model": model_name,
"service": ai_service,
"choices": [
{
"text": text_str,
@@ -1535,6 +1541,7 @@ def create_completion(
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
ai_service: Optional[str] = None
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt.

@@ -1598,6 +1605,7 @@ def create_completion(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)
if stream:
chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
@@ -1632,6 +1640,7 @@ def __call__(
logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
ai_service: Optional[str] = None
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt.

@@ -1695,6 +1704,7 @@ def __call__(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)

def create_chat_completion(
@@ -1727,6 +1737,7 @@ def create_chat_completion(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
ai_service: Optional[str] = None
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
@@ -1796,6 +1807,7 @@ def create_chat_completion(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)

def create_chat_completion_openai_v1(
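Taken together, the llama.py changes add ai_service as a keyword argument on create_completion, __call__, and create_chat_completion, and _create_completion now raises a ValueError when it is missing. A short usage sketch of the library-level API; the model path is taken from the Dockerfile above and the prompt is only illustrative:

# Hedged sketch of calling the updated API; the model path and prompt are
# illustrative assumptions, and ai_service becomes the "service" metric label.
from llama_cpp import Llama

llm = Llama(model_path="./models/mistral-7b-openorca.Q5_K_M.gguf")
completion = llm.create_completion(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=32,
    ai_service="label-suggestions",
)
print(completion["choices"][0]["text"])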
3 changes: 3 additions & 0 deletions llama_cpp/llama_chat_format.py
@@ -87,6 +87,7 @@ def __call__(
grammar: Optional[llama.LlamaGrammar] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
ai_service: Optional[str] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
@@ -535,6 +536,7 @@ def chat_completion_handler(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
ai_service: Optional[str] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
@@ -625,6 +627,7 @@ def chat_completion_handler(
stopping_criteria=stopping_criteria,
grammar=grammar,
logit_bias=logit_bias,
ai_service=ai_service
)
if tool is not None:
tool_name = tool["function"]["name"]
3 changes: 2 additions & 1 deletion llama_cpp/llama_metrics.py
@@ -196,7 +196,8 @@ def log_metrics(self, metrics: Metrics, labels: Dict[str, str]):
"""
self._histrogram_load_time.labels(**labels).observe(metrics.load_time)
self._histogram_sample_time.labels(**labels).observe(metrics.sample_time)
self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)
if metrics.time_to_first_token:
self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)
for _tpot in metrics.time_per_output_token:
self._histogram_time_per_output_token.labels(**labels).observe(_tpot)
self._histogram_prompt_eval_time.labels(**labels).observe(metrics.prompt_eval_time)
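The exporter now skips the time-to-first-token observation when no token was generated. For context, a labelled histogram of this shape could be declared with prometheus_client roughly as below; this is a sketch rather than the project's actual MetricsExporter, and the metric name is an assumption, while the "service" and "request_type" label names come from the diff:

# Hedged sketch: a Prometheus histogram labelled per service and request type.
from prometheus_client import Histogram

time_to_first_token = Histogram(
    "llama_time_to_first_token_seconds",  # illustrative metric name
    "Time to first generated token, in seconds",
    labelnames=["service", "request_type"],
)

labels = {"service": "copilot", "request_type": "chat/completions"}
ttft = 0.42  # seconds; may be None when nothing was generated
if ttft:  # mirrors the truthiness guard added to log_metrics above
    time_to_first_token.labels(**labels).observe(ttft)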
6 changes: 5 additions & 1 deletion llama_cpp/server/app.py
@@ -386,6 +386,7 @@ async def create_chat_completion(
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
],
"ai_service": "copilot"
},
},
"json_mode": {
@@ -454,6 +455,8 @@ async def create_chat_completion(
"user",
}
kwargs = body.model_dump(exclude=exclude)
# Adds the ai_service value from the request body to the kwargs
kwargs["ai_service"] = body.ai_service
llama = llama_proxy(body.model)
if body.logit_bias is not None:
kwargs["logit_bias"] = (
@@ -471,7 +474,8 @@

if isinstance(iterator_or_completion, Iterator):
# EAFP: It's easier to ask for forgiveness than permission
first_response = await run_in_threadpool(next, iterator_or_completion)
# NOTE: Including kwargs so it can also pass the "ai_service" argument to the iterator
first_response = await run_in_threadpool(next, iterator_or_completion, **kwargs)

# If no exception was raised from first_response, we can assume that
# the iterator is valid and we can use it to stream the response.
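With kwargs now carrying ai_service into the handler, each calling service's requests land in their own labelled series. To inspect the differentiated series one could filter the exporter output by the service label; the sketch below assumes the deployment exposes a Prometheus /metrics endpoint, which is not part of this diff:

# Hedged sketch: filtering scrape output by the "service" label. The /metrics
# URL is an assumption about the deployment, not something added in this PR.
import requests

metrics_text = requests.get("http://localhost:8000/metrics", timeout=10).text
for line in metrics_text.splitlines():
    if 'service="copilot"' in line:
        print(line)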
3 changes: 3 additions & 0 deletions llama_cpp/server/types.py
@@ -259,6 +259,9 @@ class CreateChatCompletionRequest(BaseModel):
}
}

# AI service added as a request body parameter by the client
ai_service: Optional[str] = None


class ModelData(TypedDict):
id: str
21 changes: 12 additions & 9 deletions tests/test_llama.py
@@ -153,7 +153,9 @@ def mock_kv_cache_seq_add(

def test_llama_patch(mock_llama):
n_ctx = 128
ai_service = "label-suggestions"
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)

n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
assert n_vocab == 32000

@@ -163,32 +165,32 @@ def test_llama_patch(mock_llama):

## Test basic completion from bos until eos
mock_llama(llama, all_text)
completion = llama.create_completion("", max_tokens=36)
completion = llama.create_completion("", max_tokens=36, ai_service=ai_service)
assert completion["choices"][0]["text"] == all_text
assert completion["choices"][0]["finish_reason"] == "stop"

## Test basic completion until eos
mock_llama(llama, all_text)
completion = llama.create_completion(text, max_tokens=20)
completion = llama.create_completion(text, max_tokens=20, ai_service=ai_service)
assert completion["choices"][0]["text"] == output_text
assert completion["choices"][0]["finish_reason"] == "stop"

## Test streaming completion until eos
mock_llama(llama, all_text)
chunks = list(llama.create_completion(text, max_tokens=20, stream=True))
chunks = list(llama.create_completion(text, max_tokens=20, stream=True, ai_service=ai_service))
assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text
assert chunks[-1]["choices"][0]["finish_reason"] == "stop"

## Test basic completion until stop sequence
mock_llama(llama, all_text)
completion = llama.create_completion(text, max_tokens=20, stop=["lazy"])
completion = llama.create_completion(text, max_tokens=20, stop=["lazy"], ai_service=ai_service)
assert completion["choices"][0]["text"] == " jumps over the "
assert completion["choices"][0]["finish_reason"] == "stop"

## Test streaming completion until stop sequence
mock_llama(llama, all_text)
chunks = list(
llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"])
llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"], ai_service=ai_service)
)
assert (
"".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the "
@@ -197,13 +199,13 @@ def test_llama_patch(mock_llama):

## Test basic completion until length
mock_llama(llama, all_text)
completion = llama.create_completion(text, max_tokens=2)
completion = llama.create_completion(text, max_tokens=2, ai_service=ai_service)
assert completion["choices"][0]["text"] == " jumps"
assert completion["choices"][0]["finish_reason"] == "length"

## Test streaming completion until length
mock_llama(llama, all_text)
chunks = list(llama.create_completion(text, max_tokens=2, stream=True))
chunks = list(llama.create_completion(text, max_tokens=2, stream=True, ai_service=ai_service))
assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps"
assert chunks[-1]["choices"][0]["finish_reason"] == "length"

@@ -230,15 +232,16 @@ def test_utf8(mock_llama):
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, logits_all=True)

output_text = "😀"
ai_service = "label-suggestions"

## Test basic completion with utf8 multibyte
mock_llama(llama, output_text)
completion = llama.create_completion("", max_tokens=4)
completion = llama.create_completion("", max_tokens=4, ai_service=ai_service)
assert completion["choices"][0]["text"] == output_text

## Test basic completion with incomplete utf8 multibyte
mock_llama(llama, output_text)
completion = llama.create_completion("", max_tokens=1)
completion = llama.create_completion("", max_tokens=1, ai_service=ai_service)
assert completion["choices"][0]["text"] == ""

