Skip to content

Commit

Permalink
Merge pull request #5 from c0sogi/dev
Browse files Browse the repository at this point in the history
Dev update (23.8.22.)
  • Loading branch information
c0sogi authored Aug 22, 2023
2 parents c6f36c6 + 7ea7b6c commit 61385d4
Show file tree
Hide file tree
Showing 31 changed files with 2,280 additions and 1,305 deletions.
56 changes: 21 additions & 35 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
### Dockerfile for Python 3.11.4 & CUDA 11.8.0 & Ubuntu 22.04
### Approximately 5 ~ 10 minutes to build

# Select the required CUDA version.
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS builder

ENV PYTHON_VERSION="3.11.4" \
    PYTHON_VERSION_SHORT="3.11" \
    CUDA_DOCKER_ARCH=all

# Install the necessary OS packages, then build and install Python from source.
# DEBIAN_FRONTEND is set per-RUN (not via ENV) so it does not leak into the
# runtime environment of the final image.
# The apt-list and /tmp cleanup happens in the same layer so the removed files
# never persist in the image.
# The OpenCL vendor file is created so the NVIDIA OpenCL ICD is discoverable
# (needed by CLBlast); `nvcc --version` is a sanity check that CUDA is present.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
    git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \
    && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \
    && cd /tmp/Python-${PYTHON_VERSION} \
    && ./configure && make && make install \
    && python3 -m pip install --upgrade pip --no-cache-dir \
    && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \
    && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
    && nvcc --version

# Copy the necessary files AFTER the heavy OS/Python layer so that source-code
# changes do not invalidate the cached toolchain layer above.
COPY llama_api /app/llama_api
COPY pyproject.toml requirements.txt main.py model_definitions.py /app/

# Install the necessary Python packages (dependencies).
WORKDIR /app
RUN python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir

# Start the server.
# PORT defaults to 8000 at build time; override at runtime with `docker run -e PORT=...`.
ENV PORT=${PORT:-8000}
STOPSIGNAL SIGINT
# Exec-form ENTRYPOINT performs no shell processing, so a literal "${PORT}"
# would be passed unexpanded. Run through /bin/sh with `exec` so the variable
# is expanded and python3 still replaces the shell as PID 1 (signals work).
ENTRYPOINT ["/bin/sh", "-c", "exec python3 -m main --port \"${PORT}\""]
37 changes: 37 additions & 0 deletions Dockerfile.compressed
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
### Compressed version of the Dockerfile.
### It is compressed by doing only one RUN to reduce the number of layers.
### However, it takes a long time to build the image compared to the original
### Dockerfile, because a single layer cannot be partially reused from cache.

# Select the required CUDA version.
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS builder

ENV PYTHON_VERSION="3.11.4" \
    PYTHON_VERSION_SHORT="3.11" \
    CUDA_DOCKER_ARCH=all

# Copy the necessary files.
COPY llama_api /app/llama_api
COPY pyproject.toml requirements.txt main.py model_definitions.py /app/

# Single layer: install OS packages, build Python from source, register the
# NVIDIA OpenCL ICD (needed by CLBlast), verify CUDA with `nvcc --version`,
# and install the project's Python dependencies.
# DEBIAN_FRONTEND is set per-RUN (not via ENV) so it does not leak into the
# runtime environment; apt lists and /tmp are cleaned in the same layer.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
    git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget libsqlite3-dev gcc ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz -O /tmp/Python-${PYTHON_VERSION}.tgz \
    && tar -xvf /tmp/Python-${PYTHON_VERSION}.tgz -C /tmp \
    && cd /tmp/Python-${PYTHON_VERSION} \
    && ./configure && make && make install \
    && python3 -m pip install --upgrade pip --no-cache-dir \
    && rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* \
    && update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
    && nvcc --version \
    && cd /app && python3 -m llama_api.server.app_settings --install-pkgs --force-cuda --no-cache-dir

# Set the working directory and start the server.
# PORT defaults to 8000 at build time; override at runtime with `docker run -e PORT=...`.
ENV PORT=${PORT:-8000}
STOPSIGNAL SIGINT
WORKDIR /app
# Exec-form ENTRYPOINT performs no shell processing, so a literal "${PORT}"
# would be passed unexpanded. Run through /bin/sh with `exec` so the variable
# is expanded and python3 still replaces the shell as PID 1 (signals work).
ENTRYPOINT ["/bin/sh", "-c", "exec python3 -m main --port \"${PORT}\""]
32 changes: 6 additions & 26 deletions docker-compose.persistent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ volumes:

services:
llama-api:
image: cosogi/llama-api:230816
image: cosogi/llama-api:latest
cap_add:
- IPC_LOCK
- SYS_NICE
- SYS_RESOURCE
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
Expand All @@ -14,35 +18,11 @@ services:
volumes:
- llama-api-models:/app/models
- ./model_definitions.py:/app/model_definitions.py
- ./main.py:/app/main.py
ports:
- 8000:8000
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]


# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - LLAMA_API_MAX_WORKERS=1
# - LLAMA_API_API_KEY=
# volumes:
# - llama-api-models:/app/models
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
capabilities: [gpu]
31 changes: 6 additions & 25 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ version: '3'

services:
llama-api:
image: cosogi/llama-api:230816
image: cosogi/llama-api:latest
cap_add:
- IPC_LOCK
- SYS_NICE
- SYS_RESOURCE
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
Expand All @@ -22,27 +26,4 @@ services:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]

# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - MAX_WORKERS=1
# volumes:
# - ./models:/app/models
# - ./llama_api:/app/llama_api
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# - ./requirements.txt:/app/requirements.txt
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
capabilities: [gpu]
48 changes: 48 additions & 0 deletions llama_api/mixins/completion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from collections import defaultdict
from dataclasses import dataclass, field
from time import time
from typing import Dict, Literal, Optional

from ..schemas.api import CompletionLogprobs, TextGenerationSettings


@dataclass
class CompletionStatus:
    """Mutable bookkeeping record for one in-flight completion request.

    Instances are created lazily by ``CompletionMixin.completion_status``
    (a ``defaultdict`` keyed by completion id), so every field is
    ``init=False`` and starts at its declared default.
    """

    # Timestamp (seconds since the epoch) captured when this status object
    # is first created, i.e. when the completion starts being tracked.
    started_at: float = field(default_factory=time, init=False)

    # These fields are set by `accept_settings` method.
    input_text: str = field(default="", init=False)
    input_tokens: int = field(default=0, init=False)

    # These fields are set by `generate_text` method.
    generated_text: str = field(default="", init=False)
    generated_tokens: int = field(default=0, init=False)
    # Per-token log probabilities, when the caller requested them;
    # None when logprobs were not requested or not yet produced.
    logprobs: Optional[CompletionLogprobs] = field(default=None, init=False)


class CompletionMixin:
    """A mixin for modules that support completion generation."""

    _completion_status: Optional["defaultdict[str, CompletionStatus]"] = None

    @property
    def completion_status(self) -> Dict[str, CompletionStatus]:
        """Lazily-created per-completion status table.

        key: completion_id
        value: CompletionStatus
        """
        status_map = self._completion_status
        if status_map is None:
            # First access: create the backing defaultdict so unknown
            # completion ids get a fresh CompletionStatus automatically.
            status_map = defaultdict(CompletionStatus)
            self._completion_status = status_map
        return status_map

    def get_finish_reason(
        self,
        settings: TextGenerationSettings,
    ) -> Literal["length", "stop"]:
        """Get the finish reason for the completion."""
        generated = self.completion_status[
            settings.completion_id
        ].generated_tokens
        # Hitting the token budget means we were cut off ("length");
        # anything else is a natural stop.
        if generated >= settings.max_tokens:
            return "length"
        return "stop"
Loading

0 comments on commit 61385d4

Please sign in to comment.