Skip to content

Commit

Permalink
Merge pull request #5 from c0sogi/dev
Browse files Browse the repository at this point in the history
Dev update (23.8.22.)
  • Loading branch information
c0sogi authored Aug 22, 2023
2 parents c6f36c6 + 7ea7b6c commit 61385d4
Show file tree
Hide file tree
Showing 31 changed files with 2,280 additions and 1,305 deletions.
56 changes: 21 additions & 35 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
### Dockerfile for Python 3.11.4 & CUDA 11.8.0 & Ubuntu 22.04
### Approximately 5 ~ 10 minutes to build

# Select the required CUDA version.
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS builder

ENV PYTHON_VERSION="3.11.4" \
    PYTHON_VERSION_SHORT="3.11" \
    CUDA_DOCKER_ARCH=all

# Install the necessary OS packages, then build and install Python from source.
# DEBIAN_FRONTEND is set per-RUN (not via ENV) so it does not leak into the
# runtime environment of the final image.
# The apt-list and /tmp cleanup happens in the same layer so the removed files
# never persist in the image.
# The OpenCL vendor file is created so the NVIDIA OpenCL ICD is discoverable
# (needed by CLBlast); `nvcc --version` is a sanity check that CUDA is present.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
    git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \
    && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \
    && cd /tmp/Python-${PYTHON_VERSION} \
    && ./configure && make && make install \
    && python3 -m pip install --upgrade pip --no-cache-dir \
    && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \
    && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
    && nvcc --version

# Copy the necessary files AFTER the heavy OS/Python layer so that source-code
# changes do not invalidate the cached toolchain layer above.
COPY llama_api /app/llama_api
COPY pyproject.toml requirements.txt main.py model_definitions.py /app/

# Install the necessary Python packages (dependencies).
WORKDIR /app
RUN python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir

# Start the server.
# PORT defaults to 8000 at build time; override at runtime with `docker run -e PORT=...`.
ENV PORT=${PORT:-8000}
STOPSIGNAL SIGINT
# Exec-form ENTRYPOINT performs no shell processing, so a literal "${PORT}"
# would be passed unexpanded. Run through /bin/sh with `exec` so the variable
# is expanded and python3 still replaces the shell as PID 1 (signals work).
ENTRYPOINT ["/bin/sh", "-c", "exec python3 -m main --port \"${PORT}\""]
37 changes: 37 additions & 0 deletions Dockerfile.compressed
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
### Compressed version of the Dockerfile.
### It is compressed by doing only one RUN to reduce the number of layers.
### However, it takes a long time to build the image compared to the original
### Dockerfile, because a single layer cannot be partially reused from cache.

# Select the required CUDA version.
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS builder

ENV PYTHON_VERSION="3.11.4" \
    PYTHON_VERSION_SHORT="3.11" \
    CUDA_DOCKER_ARCH=all

# Copy the necessary files.
COPY llama_api /app/llama_api
COPY pyproject.toml requirements.txt main.py model_definitions.py /app/

# Single layer: install OS packages, build Python from source, register the
# NVIDIA OpenCL ICD (needed by CLBlast), verify CUDA with `nvcc --version`,
# and install the project's Python dependencies.
# DEBIAN_FRONTEND is set per-RUN (not via ENV) so it does not leak into the
# runtime environment; apt lists and /tmp are cleaned in the same layer.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
    git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \
    && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \
    && cd /tmp/Python-${PYTHON_VERSION} \
    && ./configure && make && make install \
    && python3 -m pip install --upgrade pip --no-cache-dir \
    && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \
    && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
    && nvcc --version \
    && cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir

# Set the working directory and start the server.
# PORT defaults to 8000 at build time; override at runtime with `docker run -e PORT=...`.
ENV PORT=${PORT:-8000}
STOPSIGNAL SIGINT
WORKDIR /app
# Exec-form ENTRYPOINT performs no shell processing, so a literal "${PORT}"
# would be passed unexpanded. Run through /bin/sh with `exec` so the variable
# is expanded and python3 still replaces the shell as PID 1 (signals work).
ENTRYPOINT ["/bin/sh", "-c", "exec python3 -m main --port \"${PORT}\""]
32 changes: 6 additions & 26 deletions docker-compose.persistent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ volumes:

services:
llama-api:
image: cosogi/llama-api:230816
image: cosogi/llama-api:latest
cap_add:
- IPC_LOCK
- SYS_NICE
- SYS_RESOURCE
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
Expand All @@ -14,35 +18,11 @@ services:
volumes:
- llama-api-models:/app/models
- ./model_definitions.py:/app/model_definitions.py
- ./main.py:/app/main.py
ports:
- 8000:8000
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]


# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - LLAMA_API_MAX_WORKERS=1
# - LLAMA_API_API_KEY=
# volumes:
# - llama-api-models:/app/models
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
capabilities: [gpu]
31 changes: 6 additions & 25 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ version: '3'

services:
llama-api:
image: cosogi/llama-api:230816
image: cosogi/llama-api:latest
cap_add:
- IPC_LOCK
- SYS_NICE
- SYS_RESOURCE
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
Expand All @@ -22,27 +26,4 @@ services:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]

# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - MAX_WORKERS=1
# volumes:
# - ./models:/app/models
# - ./llama_api:/app/llama_api
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# - ./requirements.txt:/app/requirements.txt
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
capabilities: [gpu]
48 changes: 48 additions & 0 deletions llama_api/mixins/completion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from collections import defaultdict
from dataclasses import dataclass, field
from time import time
from typing import Dict, Literal, Optional

from ..schemas.api import CompletionLogprobs, TextGenerationSettings


@dataclass
class CompletionStatus:
    """Mutable bookkeeping record for one in-flight completion request.

    Instances are created lazily by ``CompletionMixin.completion_status``
    (a ``defaultdict`` keyed by completion id), so every field is
    ``init=False`` and starts at its declared default.
    """

    # Timestamp (seconds since the epoch) captured when this status object
    # is first created, i.e. when the completion starts being tracked.
    started_at: float = field(default_factory=time, init=False)

    # These fields are set by `accept_settings` method.
    input_text: str = field(default="", init=False)
    input_tokens: int = field(default=0, init=False)

    # These fields are set by `generate_text` method.
    generated_text: str = field(default="", init=False)
    generated_tokens: int = field(default=0, init=False)
    # Per-token log probabilities, when the caller requested them;
    # None when logprobs were not requested or not yet produced.
    logprobs: Optional[CompletionLogprobs] = field(default=None, init=False)


class CompletionMixin:
    """A mixin for modules that support completion generation."""

    _completion_status: Optional["defaultdict[str, CompletionStatus]"] = None

    @property
    def completion_status(self) -> Dict[str, CompletionStatus]:
        """Lazily-created per-completion status table.

        key: completion_id
        value: CompletionStatus
        """
        status_map = self._completion_status
        if status_map is None:
            # First access: create the backing defaultdict so unknown
            # completion ids get a fresh CompletionStatus automatically.
            status_map = defaultdict(CompletionStatus)
            self._completion_status = status_map
        return status_map

    def get_finish_reason(
        self,
        settings: TextGenerationSettings,
    ) -> Literal["length", "stop"]:
        """Get the finish reason for the completion."""
        generated = self.completion_status[
            settings.completion_id
        ].generated_tokens
        # Hitting the token budget means we were cut off ("length");
        # anything else is a natural stop.
        if generated >= settings.max_tokens:
            return "length"
        return "stop"
Loading

0 comments on commit 61385d4

Please sign in to comment.