Merge pull request #4 from ZenHubHQ/llm-monitoring
Merging llm-monitoring into synced main
juanroesel authored May 21, 2024
2 parents 3dbfec7 + e413c65 commit ef98a23
Showing 12 changed files with 544 additions and 28 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -2,6 +2,7 @@ _skbuild/

.envrc

# LLMs - comment if you'd like to bake the model into the image
models/

# Byte-compiled / optimized / DLL files
44 changes: 44 additions & 0 deletions dev.Dockerfile
@@ -0,0 +1,44 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3.11.8

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
libopenblas-dev \
build-essential \
git

RUN mkdir /app
WORKDIR /app
COPY . /app

RUN python3 -m pip install --upgrade pip

RUN make deps && make build && make clean

# Set environment variables for the server host, port, and model path.
# GH_TOKEN is only used by the commented-out install of the private fork below
# and must be supplied as a build arg for that step to work.
ARG GH_TOKEN
ENV GH_TOKEN=$GH_TOKEN
ENV HOST=0.0.0.0
ENV PORT=8000
ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf

# # Install dependencies
# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client

# # Install llama-cpp-python (build with METAL)
# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/docker/simple/run.sh"]
# CMD python3 -m llama_cpp.server --n_gpu_layers -1
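
A minimal sketch of building and running this dev image on its own; the llama-cpp-python-dev tag and the models/ bind mount are illustrative assumptions:

docker build -f dev.Dockerfile -t llama-cpp-python-dev .
docker run --rm -p 8000:8000 -v "$(pwd)/models:/app/models" llama-cpp-python-dev

Because .dockerignore excludes models/, the model file is not baked into the image, hence the bind mount (or comment out that .dockerignore entry, as its comment suggests, to bake the model in).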
15 changes: 15 additions & 0 deletions dev.docker-compose
@@ -0,0 +1,15 @@
version: '3'
services:
dev-llama-cpp-python:
build:
context: .
dockerfile: dev.Dockerfile
ports:
- 8000:8000
volumes:
- ./llama_cpp:/app/llama_cpp
networks:
- zh-service-network
networks:
zh-service-network:
external: true
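
The compose file marks zh-service-network as external, so the network must already exist when the stack starts. A minimal usage sketch, assuming Docker Compose v2:

docker network create zh-service-network    # once, if it does not exist yet
docker compose -f dev.docker-compose up --build

The ./llama_cpp bind mount makes local edits to the Python package visible inside the container without rebuilding the image.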
3 changes: 2 additions & 1 deletion docker/simple/run.sh
@@ -1,4 +1,5 @@
#!/bin/bash

make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1
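
For comparison, a rough sketch of the equivalent invocation outside the container, assuming the package has been built locally (make deps && make build) and the model file sits under ./models/ as in dev.Dockerfile:

python3 -m llama_cpp.server \
  --model ./models/mistral-7b-openorca.Q5_K_M.gguf \
  --host 0.0.0.0 --port 8000 \
  --n_gpu_layers -1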
50 changes: 49 additions & 1 deletion llama_cpp/_utils.py
@@ -1,7 +1,9 @@
import os
import sys
import psutil
import subprocess

from typing import Any, Dict
from typing import Any, Dict, List, Tuple, Union

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -75,3 +77,49 @@ class Singleton(object, metaclass=MetaSingleton):

def __init__(self):
super(Singleton, self).__init__()


# Helpers for snapshotting CPU, RAM, and GPU usage before and after function execution.
# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
def get_cpu_usage(pid) -> float:
    """
    CPU usage, as a percentage, of the process with the given PID.

    A short sampling interval is passed because a freshly created psutil.Process
    returns a meaningless 0.0 from cpu_percent() when called without one.
    """
    process = psutil.Process(pid)
    return process.cpu_percent(interval=0.1)

def get_ram_usage(pid) -> float:
"""
    Resident set size (RSS), in MiB, of the process with the given PID.
"""
process = psutil.Process(pid)
ram_info = process.memory_info()
ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB
return ram_usage

def get_gpu_info_by_pid(pid) -> float:
"""
    GPU memory usage, in MiB, of the process with the given PID (0.0 if no GPU or nvidia-smi is unavailable).
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]).decode("utf-8")
gpu_info = gpu_info.strip().split("\n")
for info in gpu_info:
gpu_pid, gpu_ram_usage = info.split(", ")
if int(gpu_pid) == pid:
return float(gpu_ram_usage.split()[0])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0

def get_gpu_general_info() -> Tuple[float, float, float]:
"""
    Overall GPU utilization (%), memory used (MiB), and memory free (MiB) for the first GPU, or (0.0, 0.0, 0.0) if nvidia-smi is unavailable.
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]).decode("utf-8")
gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ")
return tuple(float(tup.split()[0]) for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0, 0.0, 0.0
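
A minimal sketch of how these helpers might be combined to snapshot resource usage around a call; run_inference is a hypothetical stand-in for the work being measured, everything else is imported from llama_cpp._utils as added above:

import os
import time

from llama_cpp._utils import (
    get_cpu_usage,
    get_ram_usage,
    get_gpu_info_by_pid,
    get_gpu_general_info,
)

def run_inference():
    # Hypothetical placeholder for the call being profiled
    time.sleep(1.0)

pid = os.getpid()

ram_before = get_ram_usage(pid)            # MiB
run_inference()
ram_after = get_ram_usage(pid)             # MiB

cpu_pct = get_cpu_usage(pid)               # percent; meaningful only when sampled over an interval or polled repeatedly
gpu_proc_mib = get_gpu_info_by_pid(pid)    # 0.0 when nvidia-smi is unavailable
gpu_util, gpu_used, gpu_free = get_gpu_general_info()

print(
    f"CPU {cpu_pct:.1f}% | RAM delta {ram_after - ram_before:+.1f} MiB | "
    f"GPU (proc) {gpu_proc_mib:.0f} MiB | GPU util {gpu_util:.0f}% "
    f"({gpu_used:.0f} MiB used / {gpu_free:.0f} MiB free)"
)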
