Commit
Merge pull request #4 from c0sogi/dev
Dev update (23.8.17.)
Showing 39 changed files with 1,979 additions and 900 deletions.
.gitignore
@@ -8,4 +8,6 @@ repositories/
 .venv/
 .vscode/
 .test-venv/
-PRIVATE_*
+.temp/
+PRIVATE_*
+private/*
New file: a helper script for building the llama.cpp shared library (the file path is not shown in this capture).
@@ -0,0 +1,32 @@
# flake8: noqa

from argparse import ArgumentParser
from llama_api.utils.llama_cpp import (
    build_shared_lib,
    CPU_ARGS,  # Only use CPU
    METAL_ARGS,  # Only use Metal (MacOS)
    CUBLAS_ARGS,  # Only use CUBLAS (Nvidia)
)
from os import environ

ARGS = {
    "CPU": CPU_ARGS,
    "METAL": METAL_ARGS,
    "CUBLAS": CUBLAS_ARGS,
    "CUDA": CUBLAS_ARGS,
}

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "-b",
        "--build_type",
        type=lambda s: str(s).upper(),
        default="CPU",
        choices=["CPU", "METAL", "CUBLAS", "CUDA"],
        help="Build type",
    )

    environ["FORCE_CMAKE"] = "1"
    environ["CMAKE_ARGS"] = ARGS[parser.parse_args().build_type]
    build_shared_lib()
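If this script were saved as, say, build_llama_cpp.py (a hypothetical name; the actual path is not shown in this capture), a CUDA build would be invoked as `python3 build_llama_cpp.py -b cuda`: the script forces a CMake build via FORCE_CMAKE=1, selects the cuBLAS flag set, and then calls build_shared_lib().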
New file: a Docker Compose definition for running the server (file name not shown in this capture).
@@ -0,0 +1,48 @@
version: '3.8'

volumes:
  llama-api-models:

services:
  llama-api:
    image: cosogi/llama-api:230816
    entrypoint: ["python3", "-m", "main", "--port", "8000"]
    environment:
      - FORCE_CUDA=1
      - LLAMA_API_MAX_WORKERS=1
      - LLAMA_API_API_KEY=
    volumes:
      - llama-api-models:/app/models
      - ./model_definitions.py:/app/model_definitions.py
      - ./main.py:/app/main.py
    ports:
      - 8000:8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]


# services:
#   llama-api:
#     build:
#       context: .
#       dockerfile: Dockerfile
#     entrypoint: ["python3", "-m", "main", "--port", "8000"]
#     environment:
#       - LLAMA_API_MAX_WORKERS=1
#       - LLAMA_API_API_KEY=
#     volumes:
#       - llama-api-models:/app/models
#       - ./model_definitions.py:/app/model_definitions.py
#       - ./main.py:/app/main.py
#     ports:
#       - 8000:8000
#     deploy:
#       resources:
#         reservations:
#           devices:
#             - driver: nvidia
#               capabilities: [gpu]
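Assuming the file is saved as docker-compose.yml (name assumed, not shown above), `docker compose up -d` pulls cosogi/llama-api:230816, mounts the named llama-api-models volume plus the local model_definitions.py and main.py into the container, and serves on port 8000 with a GPU reserved through the nvidia device driver. The commented-out second services block is the alternative that builds the image from the local Dockerfile instead of pulling it.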
New file: the abstract base class for logit processors.
@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    import torch as pytorch


class BaseLogitProcessor(ABC):
    @abstractmethod
    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        """Process logits with PyTorch tensors."""

    @abstractmethod
    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        """Process logits with Python lists."""
New file: a logit-bias processor built on the base class above.
@@ -0,0 +1,86 @@
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    List,
    Optional,
)

from ..utils.logger import ApiLogger
from .base import BaseLogitProcessor

if TYPE_CHECKING:
    import torch as pytorch

logger = ApiLogger(__name__)

try:
    import tiktoken

    openai_decoder = tiktoken.get_encoding("cl100k_base").decode
except Exception as e:
    logger.warning(
        "Could not load tiktoken, which is required for OpenAI GPT models. "
        f"Please `pip install tiktoken` to use the OpenAI encoder: {e}"
    )
    openai_decoder: Optional[Callable[[List[int]], str]] = None


class LogitBiasProcessor(BaseLogitProcessor):
    """Create a logit bias processor to bias the logit scores."""

    def __init__(
        self,
        logit_bias: Dict[str, float],
        encoder: Callable[[str], List[int]],
        is_openai: bool = False,
    ):
        """Create a logit bias processor to bias the logit scores."""
        global openai_decoder

        biases = {}  # type: Dict[int, float]
        for id_or_token, bias in logit_bias.items():
            is_digit = id_or_token.isdigit()

            if is_digit and is_openai and openai_decoder is not None:
                # If we have an OpenAI id, we need to convert it to a token
                # and then encode the token to get the ids
                for id in encoder(openai_decoder([int(id_or_token)])):
                    if abs(bias) > abs(biases.get(id, 0.0)):
                        biases[id] = bias
            elif is_digit:
                # If we have a digit, we can just use it directly
                biases[int(id_or_token)] = bias
            else:
                # Otherwise, we need to encode the token and use the ids
                for id in encoder(id_or_token):
                    if abs(bias) > abs(biases.get(id, 0.0)):
                        biases[id] = bias

        self._biases = biases
        self._bias_tensor = None

    def _get_bias_tensor(self, scores: "pytorch.Tensor") -> "pytorch.Tensor":
        if self._bias_tensor is None:
            import torch

            self._bias_tensor = torch.zeros(
                scores.shape[-1], dtype=scores.dtype, device=scores.device
            )
            for id, bias in self._biases.items():
                self._bias_tensor[id] = bias

        return self._bias_tensor

    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        return scores + self._get_bias_tensor(scores)

    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        for id, bias in self._biases.items():
            scores[id] += bias
        return scores
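For illustration, a minimal usage sketch of the list-based path (the key, bias value, and stub encoder are assumptions for the example, not from the commit):

# Hypothetical sketch: with is_openai=False, a digit key is used
# directly as a token id, so token id 2 receives a +5.0 bias.
processor = LogitBiasProcessor(
    logit_bias={"2": 5.0},
    encoder=lambda text: [],  # never called here: the key is a digit
    is_openai=False,
)
print(processor.without_torch(input_ids=[], scores=[0.0, 0.0, 0.0, 0.0]))
# -> [0.0, 0.0, 5.0, 0.0]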
New file: a "Muse" processor that dampens the top-k logits.
@@ -0,0 +1,78 @@
# flake8: noqa
from typing import TYPE_CHECKING, List, Tuple

from .base import BaseLogitProcessor

if TYPE_CHECKING:
    import torch as pytorch


class MuseLogitProcessor(BaseLogitProcessor):
    """Performs dampening of the k highest probability elements.

    Args:
        top_k (`int`):
            The number of highest probability vocabulary tokens to keep for top-k-filtering.
        damp (`float`, *optional*, defaults to 0.9):
            How much less likely the top_k most likely tokens should be made. If set to 0, they become impossible.
    """

    def __init__(
        self,
        top_k: int = 3,
        damp: float = 0.9,
        damp_initial: float = 1.0,
        damp_ramp_tokens: int = 32,
        min_tokens_to_keep: int = 1,
    ):
        if not isinstance(top_k, int) or top_k <= 0:
            raise ValueError(
                "`top_k` has to be a strictly positive integer, "
                f"but is {top_k}"
            )

        self.top_k = max(top_k, min_tokens_to_keep)
        self.damp = damp
        self.damp_initial = damp_initial
        self.damp_ramp_tokens = damp_ramp_tokens
        self.token_num = 0

    def with_torch(
        self, input_ids: "pytorch.Tensor", scores: "pytorch.Tensor"
    ) -> "pytorch.Tensor":
        import torch

        top_k_safety = min(self.top_k, scores.size(-1))  # Safety check
        linear_damp = self.linear_damp
        topk_values, topk_indices = torch.topk(
            scores, top_k_safety, dim=-1
        )  # Specify the dimension
        self.token_num += 1
        return scores.scatter_(-1, topk_indices, topk_values * linear_damp)

    def without_torch(
        self, input_ids: List[int], scores: List[float]
    ) -> List[float]:
        top_k_safety = min(self.top_k, len(scores))  # Safety check
        linear_damp = self.linear_damp
        topk_values_indices = sorted(
            range(len(scores)), key=lambda x: scores[x], reverse=True
        )[:top_k_safety]
        self.token_num += 1
        return [
            score * linear_damp if idx in topk_values_indices else score
            for idx, score in enumerate(scores)
        ]

    @property
    def linear_damp(self) -> float:
        ratio = (
            1.0
            if self.damp_ramp_tokens == 0
            else min(self.token_num / self.damp_ramp_tokens, 1.0)
        )
        return (
            self.damp_initial + ratio * (self.damp - self.damp_initial)
            if ratio < 1.0
            else self.damp
        )
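A brief hypothetical sketch of the ramp on plain lists (scores chosen for illustration): with the defaults, linear_damp moves from damp_initial (1.0) toward damp (0.9) over the first damp_ramp_tokens (32) processed tokens.

# Hypothetical sketch: the top-3 scores are damped, and the damping
# strengthens as token_num approaches damp_ramp_tokens.
proc = MuseLogitProcessor()  # top_k=3, damp=0.9, damp_ramp_tokens=32
scores = [4.0, 3.0, 2.0, 1.0, 0.5]

# First call: token_num == 0, so linear_damp == damp_initial == 1.0
# and the scores pass through unchanged.
assert proc.without_torch(input_ids=[], scores=list(scores)) == scores

for _ in range(32):  # advance the ramp to its end
    proc.without_torch(input_ids=[], scores=list(scores))

# The ratio is now capped at 1.0, so linear_damp == damp == 0.9.
print(proc.without_torch(input_ids=[], scores=list(scores)))
# -> [3.6, 2.7, 1.8, 1.0, 0.5] (approximately; top-3 scaled by 0.9)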