Merge pull request #4 from c0sogi/dev
Dev update (23.8.17.)
c0sogi authored Aug 17, 2023
2 parents 178fe3e + 1f111ba commit 023fb40
Showing 39 changed files with 1,979 additions and 900 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ repositories/
.venv/
.vscode/
.test-venv/
PRIVATE_*
.temp/
PRIVATE_*
private/*
14 changes: 7 additions & 7 deletions Dockerfile
@@ -2,17 +2,16 @@
### Approximately 5 ~ 10 minutes to build

# Select the required CUDA version.
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
ENV PYTHON_VERSION="3.11.4"
ENV PYTHON_VERSION_SHORT="3.11"
ENV HOST 0.0.0.0
ENV PORT=8000

# Copy the necessary files.
COPY requirements.txt /app/requirements.txt
COPY pyproject.toml /app/pyproject.toml
COPY llama_api /app/llama_api
COPY pyproject.toml /app/pyproject.toml
COPY requirements.txt /app/requirements.txt
COPY main.py /app/main.py
COPY model_definitions.py /app/model_definitions.py

# Install the necessary applications, and then install Python.
# Then, install the necessary Python packages(Dependencies).
@@ -41,7 +40,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /tmp/* \
&& cd /app \
&& python3 -m llama_api.server.app_settings --force-cuda --install-pkgs
&& python3 -m llama_api.server.app_settings --skip-compile --install-pkgs --force-cuda
# Need to skip compile because GPU access to the host is not supported when building the image.

# Set the working directory and start the server.
WORKDIR /app
32 changes: 32 additions & 0 deletions build_shared_lib.py
@@ -0,0 +1,32 @@
# flake8: noqa

from argparse import ArgumentParser
from llama_api.utils.llama_cpp import (
build_shared_lib,
CPU_ARGS, # Only use CPU
METAL_ARGS, # Only use Metal (MacOS)
CUBLAS_ARGS, # Only use CUBLAS (Nvidia)
)
from os import environ

ARGS = {
"CPU": CPU_ARGS,
"METAL": METAL_ARGS,
"CUBLAS": CUBLAS_ARGS,
"CUDA": CUBLAS_ARGS,
}

if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
"-b",
"--build_type",
type=lambda s: str(s).upper(),
default="CPU",
choices=["CPU", "METAL", "CUBLAS", "CUDA"],
help="Build type",
)

environ["FORCE_CMAKE"] = "1"
environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
build_shared_lib()
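
A hedged usage note (the exact invocation is not shown in this commit): the script is presumably run from the repository root as `python3 build_shared_lib.py --build_type cuda` (or `cpu`, `metal`, `cublas`; the value is upper-cased before the lookup), which sets `FORCE_CMAKE=1` and the matching `CMAKE_ARGS` before calling `build_shared_lib()` from `llama_api.utils.llama_cpp`.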
48 changes: 48 additions & 0 deletions docker-compose.persistent.yml
@@ -0,0 +1,48 @@
version: '3.8'

volumes:
llama-api-models:

services:
llama-api:
image: cosogi/llama-api:230816
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- FORCE_CUDA=1
- LLAMA_API_MAX_WORKERS=1
- LLAMA_API_API_KEY=
volumes:
- llama-api-models:/app/models
- ./model_definitions.py:/app/model_definitions.py
- ./main.py:/app/main.py
ports:
- 8000:8000
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]


# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - LLAMA_API_MAX_WORKERS=1
# - LLAMA_API_API_KEY=
# volumes:
# - llama-api-models:/app/models
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
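
A hedged usage note: this persistent variant is presumably started with `docker compose -f docker-compose.persistent.yml up`, so downloaded models live in the named `llama-api-models` volume and survive container recreation, while `model_definitions.py` and `main.py` are bind-mounted from the host.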
48 changes: 25 additions & 23 deletions docker-compose.yml
@@ -2,10 +2,12 @@ version: '3'

services:
llama-api:
image: cosogi/llama-api:230730
image: cosogi/llama-api:230816
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- MAX_WORKERS=1
- FORCE_CUDA=1
- LLAMA_API_MAX_WORKERS=1
- LLAMA_API_API_KEY=
volumes:
- ./models:/app/models
- ./llama_api:/app/llama_api
@@ -23,24 +25,24 @@ services:
capabilities: [gpu]

# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - MAX_WORKERS=1
# volumes:
# - ./models:/app/models
# - ./llama_api:/app/llama_api
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# - ./requirements.txt:/app/requirements.txt
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - MAX_WORKERS=1
# volumes:
# - ./models:/app/models
# - ./llama_api:/app/llama_api
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# - ./requirements.txt:/app/requirements.txt
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
19 changes: 19 additions & 0 deletions llama_api/logits/base.py
@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
import torch as pytorch


class BaseLogitProcessor(ABC):
@abstractmethod
def with_torch(
self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
) -> "pytorch.Tensor":
"""Process logits with PyTorch tensors."""

@abstractmethod
def without_torch(
self, input_ids: List[int], scores: List[float]
) -> List[float]:
"""Process logits with Python lists."""
86 changes: 86 additions & 0 deletions llama_api/logits/bias.py
@@ -0,0 +1,86 @@
from typing import (
TYPE_CHECKING,
Callable,
Dict,
List,
Optional,
)

from ..utils.logger import ApiLogger
from .base import BaseLogitProcessor

if TYPE_CHECKING:
import torch as pytorch

logger = ApiLogger(__name__)

try:
import tiktoken

openai_decoder = tiktoken.get_encoding("cl100k_base").decode
except Exception as e:
logger.warning(
"Could not load tiktoken, which is required for OpenAI GPT models. "
f"Please `pip install tiktoken` to use the OpenAI encoder: {e}"
)
openai_decoder: Optional[Callable[[List[int]], str]] = None


class LogitBiasProcessor(BaseLogitProcessor):
"""Create a logit bias processor to bias the logit scores."""

def __init__(
self,
logit_bias: Dict[str, float],
encoder: Callable[[str], List[int]],
is_openai: bool = False,
):
"""Create a logit bias processor to bias the logit scores."""

global openai_decoder

biases = {} # type: Dict[int, float]
for id_or_token, bias in logit_bias.items():
is_digit = id_or_token.isdigit()

if is_digit and is_openai and openai_decoder is not None:
# If we have an OpenAI id, we need to convert it to a token
# and then encode the token to get the ids
for id in encoder(openai_decoder([int(id_or_token)])):
if abs(bias) > abs(biases.get(id, 0.0)):
biases[id] = bias
elif is_digit:
# If we have a digit, we can just use it directly
biases[int(id_or_token)] = bias
else:
# Otherwise, we need to encode the token and use the ids
for id in encoder(id_or_token):
if abs(bias) > abs(biases.get(id, 0.0)):
biases[id] = bias

self._biases = biases
self._bias_tensor = None

def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor":
if self._bias_tensor is None:
import torch

self._bias_tensor = torch.zeros(
scores.shape[-1], dtype=scores.dtype, device=scores.device
)
for id, bias in self._biases.items():
self._bias_tensor[id] = bias

return self._bias_tensor

def with_torch(
self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
) -> "pytorch.Tensor":
return scores + self._get_bias_tensor(scores)

def without_torch(
self, input_ids: List[int], scores: List[float]
) -> List[float]:
for id, bias in self._biases.items():
scores[id] += bias
return scores
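
A minimal usage sketch of `LogitBiasProcessor.without_torch` (the toy vocabulary and encoder below are invented for illustration; a real caller would pass the model tokenizer's encode function):

from llama_api.logits.bias import LogitBiasProcessor

# Toy vocabulary and encoder, purely for illustration.
vocab = {"hello": 0, "world": 1, "foo": 2, "bar": 3}


def encoder(text: str) -> list:
    return [vocab[text]]


processor = LogitBiasProcessor(
    logit_bias={"foo": 10.0, "3": -10.0},  # bias token "foo" up, token id 3 down
    encoder=encoder,
    is_openai=False,
)

scores = [0.0, 0.0, 0.0, 0.0]
print(processor.without_torch(input_ids=[0, 1], scores=scores))
# -> [0.0, 0.0, 10.0, -10.0]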
78 changes: 78 additions & 0 deletions llama_api/logits/muse.py
@@ -0,0 +1,78 @@
# flake8: noqa
from typing import TYPE_CHECKING, List, Tuple

from .base import BaseLogitProcessor

if TYPE_CHECKING:
import torch as pytorch


class MuseLogitProcessor(BaseLogitProcessor):
"""Performs dampening of the k highest probability elements.
Args:
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
damp (`float`, *optional*, defaults to 0.9):
How much less likely the top_k most likely tokens should be made. If set to 0, they become impossible.
"""

def __init__(
self,
top_k: int = 3,
damp: float = 0.9,
damp_initial: float = 1.0,
damp_ramp_tokens: int = 32,
min_tokens_to_keep: int = 1,
):
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError(
"`top_k` has to be a strictly positive integer, "
f"but is {top_k}"
)

self.top_k = max(top_k, min_tokens_to_keep)
self.damp = damp
self.damp_initial = damp_initial
self.damp_ramp_tokens = damp_ramp_tokens
self.token_num = 0

def with_torch(
self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
) -> "pytorch.Tensor":
import torch

top_k_safety = min(self.top_k, scores.size(-1)) # Safety check
linear_damp = self.linear_damp
topk_values, topk_indices = torch.topk(
scores, top_k_safety, dim=-1
) # Specify the dimension
self.token_num += 1
return scores.scatter_(-1, topk_indices, topk_values * linear_damp)

def without_torch(
self, input_ids: List[int], scores: List[float]
) -> List[float]:
top_k_safety = min(self.top_k, len(scores)) # Safety check
linear_damp = self.linear_damp
topk_values_indices = sorted(
range(len(scores)), key=lambda x: scores[x], reverse=True
)[:top_k_safety]
self.token_num += 1
return [
score * linear_damp if idx in topk_values_indices else score
for idx, score in enumerate(scores)
]

@property
def linear_damp(self) -> float:
ratio = (
1.0
if self.damp_ramp_tokens == 0
else min(self.token_num / self.damp_ramp_tokens, 1.0)
)
return (
self.damp_initial + ratio * (self.damp - self.damp_initial)
if ratio < 1.0
else self.damp
)
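
A small illustration of the damping behaviour (parameter values chosen arbitrarily): with `damp_ramp_tokens=0` the ramp is disabled, so the full `damp` factor applies from the first call and the top-k scores are simply scaled down.

from llama_api.logits.muse import MuseLogitProcessor

# Ramp disabled: linear_damp always equals damp (0.5 here).
proc = MuseLogitProcessor(top_k=2, damp=0.5, damp_ramp_tokens=0)
scores = [4.0, 1.0, 3.0, 2.0]
print(proc.without_torch(input_ids=[0], scores=scores))
# The two highest scores (4.0 and 3.0) are halved: [2.0, 1.0, 1.5, 2.0]
# With the default damp_ramp_tokens=32, linear_damp instead ramps from
# damp_initial (1.0) toward damp over the first 32 generated tokens.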
