diff --git a/Dockerfile b/Dockerfile
index b01f071dbe3d..9f699ac98fee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,8 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
-
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -445,9 +444,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
     ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/rerankers \
-    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/mamba \
     ; fi
 
 # Make sure the models directory exists
diff --git a/Makefile b/Makefile
index 467b2d391d25..fc649c4ff7ad 100644
--- a/Makefile
+++ b/Makefile
@@ -533,10 +533,10 @@ protogen-go-clean:
 	$(RM) bin/*
 
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen transformers-protogen parler-tts-protogen kokoro-protogen vllm-protogen openvoice-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen parler-tts-protogen kokoro-protogen vllm-protogen openvoice-protogen faster-whisper-protogen
 
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean rerankers-protogen-clean transformers-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean faster-whisper-protogen-clean
 
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -586,14 +586,6 @@ exllama2-protogen:
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 
-.PHONY: mamba-protogen
-mamba-protogen:
-	$(MAKE) -C backend/python/mamba protogen
-
-.PHONY: mamba-protogen-clean
-mamba-protogen-clean:
-	$(MAKE) -C backend/python/mamba protogen-clean
-
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -651,7 +643,6 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
-	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/parler-tts
diff --git a/backend/python/mamba/Makefile b/backend/python/mamba/Makefile
deleted file mode 100644
index 52b1c53a4d2e..000000000000
--- a/backend/python/mamba/Makefile
+++ /dev/null
@@ -1,29 +0,0 @@
-.PHONY: mamba
-mamba: protogen
-	bash install.sh
-
-.PHONY: run
-run: protogen
-	@echo "Running mamba..."
-	bash run.sh
-	@echo "mamba run."
-
-.PHONY: test
-test: protogen
-	@echo "Testing mamba..."
-	bash test.sh
-	@echo "mamba tested."
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
-
-.PHONY: clean
-clean: protogen-clean
-	$(RM) -r venv __pycache__
\ No newline at end of file
diff --git a/backend/python/mamba/README.md b/backend/python/mamba/README.md
deleted file mode 100644
index d6ead9176e34..000000000000
--- a/backend/python/mamba/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Creating a separate environment for the mamba project
-
-```
-make mamba
-```
\ No newline at end of file
diff --git a/backend/python/mamba/backend.py b/backend/python/mamba/backend.py
deleted file mode 100644
index 3c15fea715d8..000000000000
--- a/backend/python/mamba/backend.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-from concurrent import futures
-import time
-import argparse
-import signal
-import sys
-import os
-
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-MAMBA_CHAT= os.environ.get('MAMBA_CHAT', '1') == '1'
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    """
-    A gRPC servicer that implements the Backend service defined in backend.proto.
-    """
-    def generate(self,prompt, max_new_tokens):
-        """
-        Generates text based on the given prompt and maximum number of new tokens.
-
-        Args:
-            prompt (str): The prompt to generate text from.
-            max_new_tokens (int): The maximum number of new tokens to generate.
-
-        Returns:
-            str: The generated text.
-        """
-        self.generator.end_beam_search()
-
-        # Tokenizing the input
-        ids = self.generator.tokenizer.encode(prompt)
-
-        self.generator.gen_begin_reuse(ids)
-        initial_len = self.generator.sequence[0].shape[0]
-        has_leading_space = False
-        decoded_text = ''
-        for i in range(max_new_tokens):
-            token = self.generator.gen_single_token()
-            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
-                has_leading_space = True
-
-            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
-            if has_leading_space:
-                decoded_text = ' ' + decoded_text
-
-            if token.item() == self.generator.tokenizer.eos_token_id:
-                break
-        return decoded_text
-
-    def Health(self, request, context):
-        """
-        Returns a health check message.
-
-        Args:
-            request: The health check request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Reply: The health check reply.
-        """
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-
-    def LoadModel(self, request, context):
-        """
-        Loads a language model.
-
-        Args:
-            request: The load model request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Result: The load model result.
-        """
-        try:
-            tokenizerModel = request.Tokenizer
-            if tokenizerModel == "":
-                tokenizerModel = request.Model
-
-            tokenizer = AutoTokenizer.from_pretrained(tokenizerModel)
-            if MAMBA_CHAT:
-                tokenizer.eos_token = "<|endoftext|>"
-                tokenizer.pad_token = tokenizer.eos_token
-            self.tokenizer = tokenizer
-            self.model = MambaLMHeadModel.from_pretrained(request.Model, device="cuda", dtype=torch.float16)
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-        """
-        Generates text based on the given prompt and sampling parameters.
-
-        Args:
-            request: The predict request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Result: The predict result.
-        """
-        if request.TopP == 0:
-            request.TopP = 0.9
-
-        max_tokens = request.Tokens
-
-        if request.Tokens == 0:
-            max_tokens = 2000
-
-        # encoded_input = self.tokenizer(request.Prompt)
-        tokens = self.tokenizer(request.Prompt, return_tensors="pt")
-        input_ids = tokens.input_ids.to(device="cuda")
-        out = self.model.generate(input_ids=input_ids, max_length=max_tokens, temperature=request.Temperature,
-                                  top_p=request.TopP, eos_token_id=self.tokenizer.eos_token_id)
-
-        decoded = self.tokenizer.batch_decode(out)
-
-        generated_text = decoded[0]
-
-        # Remove prompt from response if present
-        if request.Prompt in generated_text:
-            generated_text = generated_text.replace(request.Prompt, "")
-
-        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        """
-        Generates text based on the given prompt and sampling parameters, and streams the results.
-
-        Args:
-            request: The predict stream request.
-            context: The gRPC context.
-
-        Returns:
-            backend_pb2.Result: The predict stream result.
-        """
-        yield self.Predict(request, context)
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
diff --git a/backend/python/mamba/install.sh b/backend/python/mamba/install.sh
deleted file mode 100755
index db18eefc9ac5..000000000000
--- a/backend/python/mamba/install.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-set -e
-
-LIMIT_TARGETS="cublas"
-EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
-
-source $(dirname $0)/../common/libbackend.sh
-
-installRequirements
\ No newline at end of file
diff --git a/backend/python/mamba/requirements-after.txt b/backend/python/mamba/requirements-after.txt
deleted file mode 100644
index ea6890ebb1e5..000000000000
--- a/backend/python/mamba/requirements-after.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-causal-conv1d==1.4.0
-mamba-ssm==2.2.2
\ No newline at end of file
diff --git a/backend/python/mamba/requirements-cpu.txt b/backend/python/mamba/requirements-cpu.txt
deleted file mode 100644
index b4f1261f8793..000000000000
--- a/backend/python/mamba/requirements-cpu.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-torch==2.4.1
-transformers
\ No newline at end of file
diff --git a/backend/python/mamba/requirements-cublas11.txt b/backend/python/mamba/requirements-cublas11.txt
deleted file mode 100644
index ed0d4df53803..000000000000
--- a/backend/python/mamba/requirements-cublas11.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
-transformers
\ No newline at end of file
diff --git a/backend/python/mamba/requirements-cublas12.txt b/backend/python/mamba/requirements-cublas12.txt
deleted file mode 100644
index b4f1261f8793..000000000000
--- a/backend/python/mamba/requirements-cublas12.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-torch==2.4.1
-transformers
\ No newline at end of file
diff --git a/backend/python/mamba/requirements-install.txt b/backend/python/mamba/requirements-install.txt
deleted file mode 100644
index 69d263f0b3ed..000000000000
--- a/backend/python/mamba/requirements-install.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# mabma does not specify it's build dependencies per PEP517, so we need to disable build isolation
-# this also means that we need to install the basic build dependencies into the venv ourselves
-# https://github.com/Dao-AILab/causal-conv1d/issues/24
-packaging
-setuptools
-wheel
\ No newline at end of file
diff --git a/backend/python/mamba/requirements.txt b/backend/python/mamba/requirements.txt
deleted file mode 100644
index afc8b2a9c129..000000000000
--- a/backend/python/mamba/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-grpcio==1.69.0
-protobuf
-certifi
\ No newline at end of file
diff --git a/backend/python/mamba/run.sh b/backend/python/mamba/run.sh
deleted file mode 100755
index 1afc39848c7e..000000000000
--- a/backend/python/mamba/run.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-LIMIT_TARGETS="cublas"
-
-source $(dirname $0)/../common/libbackend.sh
-
-startBackend $@
\ No newline at end of file
diff --git a/backend/python/mamba/test.py b/backend/python/mamba/test.py
deleted file mode 100644
index 83fb26518e76..000000000000
--- a/backend/python/mamba/test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import unittest
-import subprocess
-import time
-import backend_pb2
-import backend_pb2_grpc
-
-import grpc
-
-import unittest
-import subprocess
-import time
-import grpc
-import backend_pb2_grpc
-import backend_pb2
-
-class TestBackendServicer(unittest.TestCase):
-    """
-    TestBackendServicer is the class that tests the gRPC service.
-
-    This class contains methods to test the startup and shutdown of the gRPC service.
-    """
-    def setUp(self):
-        self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(10)
-
-    def tearDown(self) -> None:
-        self.service.terminate()
-        self.service.wait()
-
-    def test_server_startup(self):
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.Health(backend_pb2.HealthMessage())
-                self.assertEqual(response.message, b'OK')
-        except Exception as err:
-            print(err)
-            self.fail("Server failed to start")
-        finally:
-            self.tearDown()
-    def test_load_model(self):
-        """
-        This method tests if the model is loaded successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
-                self.assertTrue(response.success)
-                self.assertEqual(response.message, "Model loaded successfully")
-        except Exception as err:
-            print(err)
-            self.fail("LoadModel service failed")
-        finally:
-            self.tearDown()
-
-    def test_text(self):
-        """
-        This method tests if the embeddings are generated successfully
-        """
-        try:
-            self.setUp()
-            with grpc.insecure_channel("localhost:50051") as channel:
-                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
-                self.assertTrue(response.success)
-                req = backend_pb2.PredictOptions(Prompt="The capital of France is")
-                resp = stub.Predict(req)
-                self.assertIsNotNone(resp.message)
-        except Exception as err:
-            print(err)
-            self.fail("text service failed")
-        finally:
-            self.tearDown()
\ No newline at end of file
diff --git a/backend/python/mamba/test.sh b/backend/python/mamba/test.sh
deleted file mode 100755
index 6940b0661df2..000000000000
--- a/backend/python/mamba/test.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-source $(dirname $0)/../common/libbackend.sh
-
-runUnittests
diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py
index 9b65c6db2ea3..b0d5875bde2f 100644
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -21,7 +21,7 @@
 
 XPU=os.environ.get("XPU", "0") == "1"
 
-from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
+from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, MambaConfig, MambaForCausalLM
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 from scipy.io import wavfile
 import outetts
@@ -245,6 +245,10 @@ def LoadModel(self, request, context):
                 autoTokenizer = False
                 self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
                 self.SentenceTransformer = True
+            elif request.Type == "Mamba":
+                autoTokenizer = False
+                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                self.model = MambaForCausalLM.from_pretrained(model_name)
             else:
                 print("Automodel", file=sys.stderr)
                 self.model = AutoModel.from_pretrained(model_name,
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index b2a5293bdd95..d5f1459b7636 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -29,12 +29,14 @@ var Aliases map[string]string = map[string]string{
 	"langchain-huggingface": LCHuggingFaceBackend,
 	"transformers-musicgen": TransformersBackend,
 	"sentencetransformers":  TransformersBackend,
+	"mamba":                 TransformersBackend,
 	"stablediffusion":       StableDiffusionGGMLBackend,
 }
 
 var TypeAlias map[string]string = map[string]string{
 	"sentencetransformers":   "SentenceTransformer",
 	"huggingface-embeddings": "SentenceTransformer",
+	"mamba":                  "Mamba",
 	"transformers-musicgen":  "MusicgenForConditionalGeneration",
 }
 