Merge pull request #4 from ZenHubHQ/llm-monitoring
Merging llm-monitoring into synced main
juanroesel authored May 21, 2024
2 parents 3dbfec7 + e413c65 commit ef98a23
Showing 12 changed files with 544 additions and 28 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -2,6 +2,7 @@ _skbuild/

.envrc

# LLMs - comment if you'd like to bake the model into the image
models/

# Byte-compiled / optimized / DLL files
44 changes: 44 additions & 0 deletions dev.Dockerfile
@@ -0,0 +1,44 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3.11.8

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
libopenblas-dev \
build-essential \
git

RUN mkdir /app
WORKDIR /app
COPY . /app

RUN python3 -m pip install --upgrade pip

RUN make deps && make build && make clean

# Set environment variables for the server host, port, and model path.
# GH_TOKEN is only used by the commented-out install of the private fork below
# and must be supplied as a build arg for that step to work.
ARG GH_TOKEN
ENV GH_TOKEN=$GH_TOKEN
ENV HOST=0.0.0.0
ENV PORT=8000
ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf

# # Install dependencies
# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client

# # Install llama-cpp-python (build with METAL)
# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/docker/simple/run.sh"]
# CMD python3 -m llama_cpp.server --n_gpu_layers -1
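
A minimal sketch of building and running this dev image on its own; the llama-cpp-python-dev tag and the models/ bind mount are illustrative assumptions:

docker build -f dev.Dockerfile -t llama-cpp-python-dev .
docker run --rm -p 8000:8000 -v "$(pwd)/models:/app/models" llama-cpp-python-dev

Because .dockerignore excludes models/, the model file is not baked into the image, hence the bind mount (or comment out that .dockerignore entry, as its comment suggests, to bake the model in).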
15 changes: 15 additions & 0 deletions dev.docker-compose
@@ -0,0 +1,15 @@
version: '3'
services:
dev-llama-cpp-python:
build:
context: .
dockerfile: dev.Dockerfile
ports:
- 8000:8000
volumes:
- ./llama_cpp:/app/llama_cpp
networks:
- zh-service-network
networks:
zh-service-network:
external: true
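
The compose file marks zh-service-network as external, so the network must already exist when the stack starts. A minimal usage sketch, assuming Docker Compose v2:

docker network create zh-service-network    # once, if it does not exist yet
docker compose -f dev.docker-compose up --build

The ./llama_cpp bind mount makes local edits to the Python package visible inside the container without rebuilding the image.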
3 changes: 2 additions & 1 deletion docker/simple/run.sh
@@ -1,4 +1,5 @@
#!/bin/bash

make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1
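
For comparison, a rough sketch of the equivalent invocation outside the container, assuming the package has been built locally (make deps && make build) and the model file sits under ./models/ as in dev.Dockerfile:

python3 -m llama_cpp.server \
  --model ./models/mistral-7b-openorca.Q5_K_M.gguf \
  --host 0.0.0.0 --port 8000 \
  --n_gpu_layers -1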
50 changes: 49 additions & 1 deletion llama_cpp/_utils.py
@@ -1,7 +1,9 @@
import os
import sys
import psutil
import subprocess

from typing import Any, Dict
from typing import Any, Dict, List, Tuple, Union

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -75,3 +77,49 @@ class Singleton(object, metaclass=MetaSingleton):

def __init__(self):
super(Singleton, self).__init__()


# Helpers for snapshotting CPU, RAM, and GPU usage before and after function execution.
# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
def get_cpu_usage(pid) -> float:
    """
    CPU usage, as a percentage, of the process with the given PID.

    A short sampling interval is passed because a freshly created psutil.Process
    returns a meaningless 0.0 from cpu_percent() when called without one.
    """
    process = psutil.Process(pid)
    return process.cpu_percent(interval=0.1)

def get_ram_usage(pid) -> float:
"""
    Resident set size (RSS), in MiB, of the process with the given PID.
"""
process = psutil.Process(pid)
ram_info = process.memory_info()
ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB
return ram_usage

def get_gpu_info_by_pid(pid) -> float:
"""
    GPU memory usage, in MiB, of the process with the given PID (0.0 if no GPU or nvidia-smi is unavailable).
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]).decode("utf-8")
gpu_info = gpu_info.strip().split("\n")
for info in gpu_info:
gpu_pid, gpu_ram_usage = info.split(", ")
if int(gpu_pid) == pid:
return float(gpu_ram_usage.split()[0])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0

def get_gpu_general_info() -> Tuple[float, float, float]:
"""
    Overall GPU utilization (%), memory used (MiB), and memory free (MiB) for the first GPU, or (0.0, 0.0, 0.0) if nvidia-smi is unavailable.
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]).decode("utf-8")
gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ")
return tuple(float(tup.split()[0]) for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0, 0.0, 0.0
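
A minimal sketch of how these helpers might be combined to snapshot resource usage around a call; run_inference is a hypothetical stand-in for the work being measured, everything else is imported from llama_cpp._utils as added above:

import os
import time

from llama_cpp._utils import (
    get_cpu_usage,
    get_ram_usage,
    get_gpu_info_by_pid,
    get_gpu_general_info,
)

def run_inference():
    # Hypothetical placeholder for the call being profiled
    time.sleep(1.0)

pid = os.getpid()

ram_before = get_ram_usage(pid)            # MiB
run_inference()
ram_after = get_ram_usage(pid)             # MiB

cpu_pct = get_cpu_usage(pid)               # percent; meaningful only when sampled over an interval or polled repeatedly
gpu_proc_mib = get_gpu_info_by_pid(pid)    # 0.0 when nvidia-smi is unavailable
gpu_util, gpu_used, gpu_free = get_gpu_general_info()

print(
    f"CPU {cpu_pct:.1f}% | RAM delta {ram_after - ram_before:+.1f} MiB | "
    f"GPU (proc) {gpu_proc_mib:.0f} MiB | GPU util {gpu_util:.0f}% "
    f"({gpu_used:.0f} MiB used / {gpu_free:.0f} MiB free)"
)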
