From 69a3dfdf93984bec723dcb838163095237db929c Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Fri, 16 Jun 2023 00:09:20 +0200 Subject: [PATCH] New release (#8) * new new_release * format with black * update converter * update converter * update test --------- Co-authored-by: michaelfeil --- conversion_utils/convert.py | 287 +++++++++++++++++++++++-------- hf_hub_ctranslate2/__init__.py | 10 +- hf_hub_ctranslate2/translate.py | 286 +++++++++++++++++++++++++----- hf_hub_ctranslate2/util/utils.py | 1 + setup.py | 2 +- tests/test_translate.py | 47 ++++- 6 files changed, 511 insertions(+), 122 deletions(-) diff --git a/conversion_utils/convert.py b/conversion_utils/convert.py index c054b8b..c4aca99 100644 --- a/conversion_utils/convert.py +++ b/conversion_utils/convert.py @@ -1,89 +1,162 @@ import os + def call(*args, **kwargs): import subprocess + out = subprocess.call(*args, **kwargs) if out != 0: raise ValueError(f"Output: {out}") -def convert(NAME="opus-mt-en-fr", ORG="Helsinki-NLP"): + +model_description_generator = """ +from hf_hub_ctranslate2 import GeneratorCT2fromHfHub +model = GeneratorCT2fromHfHub( + # load in int8 on CUDA + model_name_or_path=model_name, + device="cuda", + compute_type="int8_float16", + # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}") +) +outputs = model.generate( + text=["def fibonnaci(", "User: How are you doing? Bot:"], + max_length=64, + include_prompt_in_result=False +) +print(outputs)""" + +model_description_translator = """ +from hf_hub_ctranslate2 import TranslatorCT2fromHfHub +model = TranslatorCT2fromHfHub( + # load in int8 on CUDA + model_name_or_path=model_name, + device="cuda", + compute_type="int8_float16", + # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}") +) +outputs = model.generate( + text=["def fibonnaci(", "User: How are you doing? Bot:"], + max_length=64, +) +print(outputs)""" + +model_description_encoder = """ +from hf_hub_ctranslate2 import EncoderCT2fromHfHub +model = EncoderCT2fromHfHub( + # load in int8 on CUDA + model_name_or_path=model_name, + device="cuda", + compute_type="float16", + # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}") +) +embeddings = model.encode( + ["I like soccer", "I like tennis", "The eiffel tower is in Paris"], + batch_size=32, + convert_to_numpy=True, + normalize_embeddings=True, +) +print(embeddings.shape, embeddings) +scores = (embeddings @ embeddings.T) * 100 +""" + + +def convert(NAME="opus-mt-en-fr", ORG="Helsinki-NLP", description="generator"): + print(f"converting {ORG}/{NAME} ") import re import datetime from huggingface_hub import HfApi, snapshot_download + api = HfApi() - - HUB_NAME=f"ct2fast-{NAME}" + + HUB_NAME = f"ct2fast-{NAME}" repo_id = f"michaelfeil/{HUB_NAME}" api.create_repo(repo_id=repo_id, exist_ok=True, repo_type="model") tmp_dir = os.path.join(os.path.expanduser("~"), f"tmp-{HUB_NAME}") os.chdir(os.path.expanduser("~")) - + path = snapshot_download( - f'{ORG}/{NAME}', + f"{ORG}/{NAME}", + ) + files = [f for f in os.listdir(path) if "." 
in f] + filtered_f = [ + f + for f in files + if not ("model" in f or "config.json" == f or f.endswith(".py")) + ] + + conv_arg = ( + [ + "ct2-transformers-converter", + "--model", + f"{ORG}/{NAME}", + "--output_dir", + str(tmp_dir), + "--force", + "--copy_files", + ] + + filtered_f + + [ + "--quantization", + "float16" if description == "encoder" else "int8_float16", + "--trust_remote_code", + ] ) - files = os.listdir(path) - filtered_f = [f for f in files if not ("model" in f or "config.json" == f)] - - conv_arg = [ - 'ct2-transformers-converter', - '--model', - f'{ORG}/{NAME}', - '--output_dir', - str(tmp_dir), - '--force', - '--copy_files', - ]+ filtered_f + [ - '--quantization', - 'float16'] call(conv_arg) - - with open(os.path.join(tmp_dir,'README.md'),'r') as f: + if not "vocabulary.txt" in os.listdir(tmp_dir) and "vocab.txt" in os.listdir( + tmp_dir + ): + import shutil + + shutil.copyfile( + os.path.join(tmp_dir, "vocab.txt"), + os.path.join(tmp_dir, "vocabulary.txt"), + ) + + with open(os.path.join(tmp_dir, "README.md"), "r") as f: content = f.read() if "tags:" in content: - content = content.replace("tags:","tags:\n- ctranslate2\n- int8\n- float16") + content = content.replace("tags:", "tags:\n- ctranslate2\n- int8\n- float16", 1) else: - content = content.replace("---","---\ntags:\n- ctranslate2\n- int8\n- float16\n") + content = content.replace( + "---", "---\ntags:\n- ctranslate2\n- int8\n- float16\n", 1 + ) - end_header = [m.start() for m in re.finditer(r"---",content)] + end_header = [m.start() for m in re.finditer(r"---", content)] if len(end_header) > 1: end_header = end_header[1] + 3 else: end_header = 0 conv_arg_nice = " ".join(conv_arg) + conv_arg_nice = conv_arg_nice.replace(os.path.expanduser("~"), "~") + if description == "generator": + model_description = model_description_generator + elif description == "encoder": + model_description = model_description_encoder + elif description == "translator": + model_description = model_description_translator add_string = f""" # # Fast-Inference with Ctranslate2 Speedup inference while reducing memory by 2x-4x using int8 inference in C++ on CPU or GPU. quantized version of [{ORG}/{NAME}](https://huggingface.co/{ORG}/{NAME}) ```bash -pip install hf-hub-ctranslate2>=2.0.6 -``` -Converted on {str(datetime.datetime.now())[:10]} using +pip install hf-hub-ctranslate2>=2.10.0 ctranslate2>=3.16.0 ``` -{conv_arg_nice} + +```python +# from transformers import AutoTokenizer +model_name = "{repo_id}" +{model_description} ``` -Checkpoint compatible to [ctranslate2>=3.13.0](https://github.com/OpenNMT/CTranslate2) and [hf-hub-ctranslate2>=2.0.6](https://github.com/michaelfeil/hf-hub-ctranslate2) -- `compute_type=int8_float16` for `device="cuda"` +Checkpoint compatible to [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2) +and [hf-hub-ctranslate2>=2.10.0](https://github.com/michaelfeil/hf-hub-ctranslate2) +- `compute_type=int8_float16` for `device="cuda"` - `compute_type=int8` for `device="cpu"` -```python -from hf_hub_ctranslate2 import TranslatorCT2fromHfHub, GeneratorCT2fromHfHub -from transformers import AutoTokenizer - -model_name = "{repo_id}" -# use either TranslatorCT2fromHfHub or GeneratorCT2fromHfHub here, depending on model. -model = GeneratorCT2fromHfHub( - # load in int8 on CUDA - model_name_or_path=model_name, - device="cuda", - compute_type="int8_float16", - tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}") -) -outputs = model.generate( - text=["How do you call a fast Flan-ingo?", "User: How are you doing? 
Bot:"], -) -print(outputs) +Converted on {str(datetime.datetime.now())[:10]} using +``` +{conv_arg_nice} ``` # Licence and other remarks: @@ -91,38 +164,106 @@ def convert(NAME="opus-mt-en-fr", ORG="Helsinki-NLP"): # Original description """ - - with open(os.path.join(tmp_dir,'README.md'),'w') as f: + + with open(os.path.join(tmp_dir, "README.md"), "w") as f: f.write(content[:end_header] + add_string + content[end_header:]) - api.upload_folder( folder_path=tmp_dir, - repo_id=repo_id, repo_type="model", - commit_message=f"Upload {ORG}/{NAME} ctranslate fp16 weights" + repo_id=repo_id, + repo_type="model", + commit_message=f"Upload {ORG}/{NAME} ctranslate fp16 weights", ) - call(["rm","-rf", tmp_dir]) - + call(["rm", "-rf", tmp_dir]) + + if __name__ == "__main__": generators = [ - ("togethercomputer/RedPajama-INCITE-Instruct-3B-v1"), - ("togethercomputer/GPT-JT-6B-v0"), - "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1", - "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1", - "EleutherAI/pythia-160m", - "EleutherAI/pythia-2.8b", - "EleutherAI/pythia-6.9b", - "EleutherAI/pythia-12b", - "togethercomputer/Pythia-Chat-Base-7B", - "stabilityai/stablelm-base-alpha-7b", - "stabilityai/stablelm-tuned-alpha-7b", - "stabilityai/stablelm-base-alpha-3b", - "stabilityai/stablelm-tuned-alpha-3b", - "OpenAssistant/stablelm-7b-sft-v7-epoch-3", - "EleutherAI/gpt-j-6b", - "EleutherAI/gpt-neox-20b", - "OpenAssistant/pythia-12b-sft-v8-7k-steps" + # "togethercomputer/RedPajama-INCITE-Instruct-3B-v1", + # "togethercomputer/GPT-JT-6B-v0", + # "togethercomputer/RedPajama-INCITE-7B-Instruct", + # "togethercomputer/RedPajama-INCITE-7B-Chat", + # "EleutherAI/pythia-160m", + # "EleutherAI/pythia-2.8b", + # "EleutherAI/pythia-6.9b", + # "EleutherAI/pythia-12b", + # "togethercomputer/Pythia-Chat-Base-7B", + # "stabilityai/stablelm-base-alpha-7b", + # "stabilityai/stablelm-tuned-alpha-7b", + # "stabilityai/stablelm-base-alpha-3b", + # "stabilityai/stablelm-tuned-alpha-3b", + # "OpenAssistant/stablelm-7b-sft-v7-epoch-3", + # "EleutherAI/gpt-j-6b", + # "EleutherAI/gpt-neox-20b", + # "OpenAssistant/pythia-12b-sft-v8-7k-steps", + # "Salesforce/codegen-350M-mono", + # "Salesforce/codegen-350M-multi", + # "Salesforce/codegen-2B-mono", + # "Salesforce/codegen-2B-multi", + # "Salesforce/codegen-6B-multi", + # "Salesforce/codegen-6B-mono", + # "Salesforce/codegen-16B-mono", + # "Salesforce/codegen-16B-multi", + # "Salesforce/codegen2-1B", + # "Salesforce/codegen2-3_7B", + # "Salesforce/codegen2-7B", + # "Salesforce/codegen2-16B", + # "bigcode/gpt_bigcode-santacoder", + # 'bigcode/starcoder', + # "mosaicml/mpt-7b", + # "mosaicml/mpt-7b-instruct", + # "mosaicml/mpt-7b-chat" + "VMware/open-llama-7b-open-instruct", + # "tiiuae/falcon-7b-instruct", + # 'tiiuae/falcon-7b', + "tiiuae/falcon-40b-instruct", + "tiiuae/falcon-40b", + "OpenAssistant/falcon-7b-sft-top1-696", + "OpenAssistant/falcon-7b-sft-mix-2000", + "OpenAssistant/falcon-40b-sft-mix-1226", + # "HuggingFaceH4/starchat-beta", + "WizardLM/WizardCoder-15B-V1.0", + ] + translators = [ + # 'Salesforce/codet5p-770m-py', 'Salesforce/codet5p-770m' ] + encoders = [ + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + "intfloat/e5-small-v2", + "intfloat/e5-large-v2", + "intfloat/e5-large", + "sentence-transformers/all-MiniLM-L6-v2", + "setu4993/LaBSE", + ] + for m in encoders: + ORG, NAME = m.split("/") + convert(NAME=NAME, ORG=ORG, description="encoder") + + for m in translators: + ORG, NAME = m.split("/") + convert(NAME=NAME, ORG=ORG, description="translator") + for m 
in generators: - ORG , NAME = m.split("/") - convert(NAME=NAME, ORG=ORG) + ORG, NAME = m.split("/") + # import huggingface_hub + # huggingface_hub.snapshot_download( + # m + # ) + convert(NAME=NAME, ORG=ORG, description="generator") + + from hf_hub_ctranslate2 import GeneratorCT2fromHfHub + from transformers import AutoTokenizer + + model_name = f"michaelfeil/ct2fast-{NAME}" + # use either TranslatorCT2fromHfHub or GeneratorCT2fromHfHub here, depending on model. + model = GeneratorCT2fromHfHub( + # load in int8 on CUDA + model_name_or_path=model_name, + device="cuda", + compute_type="int8", + tokenizer=AutoTokenizer.from_pretrained(m), + ) + outputs = model.generate( + text=["def print_hello_world():", "def hello_name(name:"], max_length=64 + ) + print(outputs) diff --git a/hf_hub_ctranslate2/__init__.py b/hf_hub_ctranslate2/__init__.py index 1bdf3cc..555befd 100644 --- a/hf_hub_ctranslate2/__init__.py +++ b/hf_hub_ctranslate2/__init__.py @@ -1,5 +1,11 @@ # -*- coding: utf-8 -*- """Compatability between Huggingface and Ctranslate2.""" # __all__ = ["__version__", "TranslatorCT2fromHfHub", "GeneratorCT2fromHfHub", "MultiLingualTranslatorCT2fromHfHub"] -from hf_hub_ctranslate2.translate import TranslatorCT2fromHfHub, GeneratorCT2fromHfHub, MultiLingualTranslatorCT2fromHfHub -__version__ = "2.0.9" +from hf_hub_ctranslate2.translate import ( + TranslatorCT2fromHfHub, + GeneratorCT2fromHfHub, + MultiLingualTranslatorCT2fromHfHub, + EncoderCT2fromHfHub, +) + +__version__ = "2.0.10" diff --git a/hf_hub_ctranslate2/translate.py b/hf_hub_ctranslate2/translate.py index 0e582d4..a11ea08 100644 --- a/hf_hub_ctranslate2/translate.py +++ b/hf_hub_ctranslate2/translate.py @@ -1,6 +1,8 @@ import ctranslate2 + try: from transformers import AutoTokenizer + autotokenizer_ok = True except ImportError: AutoTokenizer = object @@ -28,23 +30,27 @@ def __init__( compute_type: Literal["int8_float16", "int8"] = "int8_float16", tokenizer: Union[AutoTokenizer, None] = None, hub_kwargs: dict = {}, - **kwargs: Any + **kwargs: Any, ): # adaptions from https://github.com/guillaumekln/faster-whisper if os.path.isdir(model_name_or_path): model_path = model_name_or_path else: try: - model_path = _utils._download_model(model_name_or_path, hub_kwargs=hub_kwargs) + model_path = _utils._download_model( + model_name_or_path, hub_kwargs=hub_kwargs + ) except: hub_kwargs["local_files_only"] = True - model_path = _utils._download_model(model_name_or_path, hub_kwargs=hub_kwargs) + model_path = _utils._download_model( + model_name_or_path, hub_kwargs=hub_kwargs + ) self.model = self.ctranslate_class( model_path, device=device, device_index=device_index, compute_type=compute_type, - **kwargs + **kwargs, ) if tokenizer is not None: @@ -52,24 +58,35 @@ def __init__( else: if "tokenizer.json" in os.listdir(model_path): if not autotokenizer_ok: - raise ValueError("`pip install transformers` missing to load AutoTokenizer.") + raise ValueError( + "`pip install transformers` missing to load AutoTokenizer." + ) self.tokenizer = AutoTokenizer.from_pretrained(model_path, fast=True) else: - raise ValueError("no suitable Tokenizer found. " - "Please set one via tokenizer=AutoTokenizer.from_pretrained(..) arg.") - + raise ValueError( + "no suitable Tokenizer found. " + "Please set one via tokenizer=AutoTokenizer.from_pretrained(..) arg." 
+ ) def _forward(self, *args: Any, **kwds: Any) -> Any: raise NotImplementedError - + def tokenize_encode(self, text, *args, **kwargs): return [ self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(p)) for p in text ] + def tokenize_decode(self, tokens_out, *args, **kwargs): raise NotImplementedError - def generate(self, text: Union[str, List[str]], encode_kwargs={}, decode_kwargs={}, *forward_args, **forward_kwds: Any): + def generate( + self, + text: Union[str, List[str]], + encode_kwargs={}, + decode_kwargs={}, + *forward_args, + **forward_kwds: Any, + ): orig_type = list if isinstance(text, str): orig_type = str @@ -92,15 +109,15 @@ def __init__( compute_type: Literal["int8_float16", "int8"] = "int8_float16", tokenizer: Union[AutoTokenizer, None] = None, hub_kwargs={}, - **kwargs: Any + **kwargs: Any, ): """for ctranslate2.Translator models, in particular m2m-100 Args: model_name_or_path (str): _description_ - device (Literal["cpu", "cuda"], optional): _description_. Defaults to "cuda". + device (Literal[cpu, cuda], optional): _description_. Defaults to "cuda". device_index (int, optional): _description_. Defaults to 0. - compute_type (Literal["int8_float16", "int8"], optional): _description_. Defaults to "int8_float16". + compute_type (Literal[int8_float16, int8], optional): _description_. Defaults to "int8_float16". tokenizer (Union[AutoTokenizer, None], optional): _description_. Defaults to None. hub_kwargs (dict, optional): _description_. Defaults to {}. **kwargs (Any, optional): Any additional arguments @@ -113,22 +130,30 @@ def __init__( compute_type, tokenizer, hub_kwargs, - **kwargs + **kwargs, ) def _forward(self, *args, **kwds): return self.model.translate_batch(*args, **kwds) - + def tokenize_decode(self, tokens_out, *args, **kwargs): return [ self.tokenizer.decode( self.tokenizer.convert_tokens_to_ids(tokens_out[i].hypotheses[0]), - *args, **kwargs + *args, + **kwargs, ) for i in range(len(tokens_out)) ] - def generate(self, text: Union[str, List[str]], encode_tok_kwargs={}, decode_tok_kwargs={}, *forward_args, **forward_kwds: Any): + def generate( + self, + text: Union[str, List[str]], + encode_tok_kwargs={}, + decode_tok_kwargs={}, + *forward_args, + **forward_kwds: Any, + ): """_summary_ Args: @@ -168,7 +193,14 @@ def generate(self, text: Union[str, List[str]], encode_tok_kwargs={}, decode_tok Returns: Union[str, List[str]]: text as output, if list, same len as input """ - return super().generate(text, encode_kwargs=encode_tok_kwargs, decode_kwargs=decode_tok_kwargs, *forward_args, **forward_kwds) + return super().generate( + text, + encode_kwargs=encode_tok_kwargs, + decode_kwargs=decode_tok_kwargs, + *forward_args, + **forward_kwds, + ) + class MultiLingualTranslatorCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub): def __init__( @@ -179,15 +211,15 @@ def __init__( compute_type: Literal["int8_float16", "int8"] = "int8_float16", tokenizer: Union[AutoTokenizer, None] = None, hub_kwargs={}, - **kwargs: Any + **kwargs: Any, ): """for ctranslate2.Translator models Args: model_name_or_path (str): _description_ - device (Literal["cpu", "cuda"], optional): _description_. Defaults to "cuda". + device (Literal[cpu, cuda], optional): _description_. Defaults to "cuda". device_index (int, optional): _description_. Defaults to 0. - compute_type (Literal["int8_float16", "int8"], optional): _description_. Defaults to "int8_float16". + compute_type (Literal[int8_float16, int8], optional): _description_. Defaults to "int8_float16". 
tokenizer (Union[AutoTokenizer, None], optional): _description_. Defaults to None. hub_kwargs (dict, optional): _description_. Defaults to {}. **kwargs (Any, optional): Any additional arguments @@ -200,32 +232,44 @@ def __init__( compute_type, tokenizer, hub_kwargs, - **kwargs + **kwargs, ) - + def _forward(self, *args, **kwds): - target_prefix = [[self.tokenizer.lang_code_to_token[l]] for l in kwds.pop("tgt_lang")] + target_prefix = [ + [self.tokenizer.lang_code_to_token[l]] for l in kwds.pop("tgt_lang") + ] # target_prefix=[['__de__'], ['__fr__']] return self.model.translate_batch(*args, **kwds, target_prefix=target_prefix) - + def tokenize_encode(self, text, *args, **kwargs): tokens = [] src_lang = kwargs.pop("src_lang") for t, src_language in zip(text, src_lang): self.tokenizer.src_lang = src_language - tokens.append(self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(t))) + tokens.append( + self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(t)) + ) return tokens - + def tokenize_decode(self, tokens_out, *args, **kwargs): return [ self.tokenizer.decode( self.tokenizer.convert_tokens_to_ids(tokens_out[i].hypotheses[0][1:]), - *args, **kwargs + *args, + **kwargs, ) for i in range(len(tokens_out)) ] - def generate(self, text: Union[str, List[str]], src_lang: Union[str, List[str]], tgt_lang: Union[str, List[str]], *forward_args, **forward_kwds: Any): + def generate( + self, + text: Union[str, List[str]], + src_lang: Union[str, List[str]], + tgt_lang: Union[str, List[str]], + *forward_args, + **forward_kwds: Any, + ): """_summary_ Args: @@ -266,9 +310,160 @@ def generate(self, text: Union[str, List[str]], src_lang: Union[str, List[str]], Union[str, List[str]]: text as output, if list, same len as input """ if not len(text) == len(src_lang) == len(tgt_lang): - raise ValueError(f"unequal len: text={len(text)} src_lang={len(src_lang)} tgt_lang={len(tgt_lang)}") + raise ValueError( + f"unequal len: text={len(text)} src_lang={len(src_lang)} tgt_lang={len(tgt_lang)}" + ) forward_kwds["tgt_lang"] = tgt_lang - return super().generate(text, *forward_args, **forward_kwds, encode_kwargs={"src_lang": src_lang}) + return super().generate( + text, *forward_args, **forward_kwds, encode_kwargs={"src_lang": src_lang} + ) + + +class EncoderCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub): + def __init__( + self, + model_name_or_path: str, + device: Literal["cpu", "cuda"] = "cuda", + device_index=0, + compute_type: Literal["int8_float16", "int8"] = "int8_float16", + tokenizer: Union[AutoTokenizer, None] = None, + hub_kwargs={}, + **kwargs: Any, + ): + """for ctranslate2.Translator models, in particular m2m-100 + + Args: + model_name_or_path (str): _description_ + device (Literal[cpu, cuda], optional): _description_. Defaults to "cuda". + device_index (int, optional): _description_. Defaults to 0. + compute_type (Literal[int8_float16, int8], optional): _description_. Defaults to "int8_float16". + tokenizer (Union[AutoTokenizer, None], optional): _description_. Defaults to None. + hub_kwargs (dict, optional): _description_. Defaults to {}. 
+ **kwargs (Any, optional): Any additional arguments + """ + self.ctranslate_class = ctranslate2.Encoder + super().__init__( + model_name_or_path, + device, + device_index, + compute_type, + tokenizer, + hub_kwargs, + **kwargs, + ) + self.device = device + if device == "cuda": + import functools + + try: + import torch + except ImportError: + raise ValueError( + "decoding storageview on CUDA of encoder requires torch" + ) + self.tensor_decode_method = functools.partial( + torch.as_tensor, device=device + ) + else: + try: + import numpy as np + except ImportError: + raise ValueError( + "decoding storageview on CPU of encoder requires numpy" + ) + self.tensor_decode_method = np.asarray + + def _forward(self, *args, **kwds): + return self.model.forward_batch(*args, **kwds) + + def tokenize_encode(self, text, *args, **kwargs): + return self.tokenizer(text).input_ids + + def tokenize_decode(self, tokens_out, *args, **kwargs): + return self.tensor_decode_method(tokens_out.pooler_output) + + def generate( + self, + text: Union[str, List[str]], + encode_tok_kwargs={}, + decode_tok_kwargs={}, + *forward_args, + **forward_kwds: Any, + ): + return super().generate( + text, + encode_kwargs=encode_tok_kwargs, + decode_kwargs=decode_tok_kwargs, + *forward_args, + **forward_kwds, + ) + + def encode( + self, + sentences: Union[str, List[str]], + batch_size: int = 32, + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + normalize_embeddings: bool = False, + *args, + **kwargs, + ): + """ + Computes sentence embeddings + + :param sentences: the sentences to embed + :param batch_size: the batch size used for the computation + :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. + + :return: + By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. 
+ """ + import numpy as np + + if convert_to_tensor: + convert_to_numpy = False + + input_was_string = False + if isinstance(sentences, str) or not hasattr( + sentences, "__len__" + ): # Cast an individual sentence to a list with length 1 + sentences = [sentences] + input_was_string = True + + all_embeddings = [] + length_sorted_idx = np.argsort([-len(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + for start_index in range(0, len(sentences), batch_size): + sentences_batch = sentences_sorted[start_index : start_index + batch_size] + + embeddings = self.generate(sentences_batch) + + if normalize_embeddings: + embeddings = ( + embeddings / (embeddings**2).sum(axis=1, keepdims=True) ** 0.5 + ) + + # fixes for #522 and #487 to avoid oom problems on gpu with large datasets + if convert_to_numpy and not isinstance(embeddings, np.ndarray): + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor and not isinstance(all_embeddings[0], np.ndarray): + raise NotImplementedError + elif convert_to_numpy and not isinstance(all_embeddings[0], np.ndarray): + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings + class GeneratorCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub): def __init__( @@ -279,15 +474,15 @@ def __init__( compute_type: Literal["int8_float16", "int8"] = "int8_float16", tokenizer: Union[AutoTokenizer, None] = None, hub_kwargs={}, - **kwargs: Any + **kwargs: Any, ): """for ctranslate2.Generator models Args: model_name_or_path (str): _description_ - device (Literal["cpu", "cuda"], optional): _description_. Defaults to "cuda". + device (Literal[cpu, cuda], optional): _description_. Defaults to "cuda". device_index (int, optional): _description_. Defaults to 0. - compute_type (Literal["int8_float16", "int8"], optional): _description_. Defaults to "int8_float16". + compute_type (Literal[int8_float16, int8], optional): _description_. Defaults to "int8_float16". tokenizer (Union[AutoTokenizer, None], optional): _description_. Defaults to None. hub_kwargs (dict, optional): _description_. Defaults to {}. 
**kwargs (Any, optional): Any additional arguments @@ -300,20 +495,26 @@ def __init__( compute_type, tokenizer, hub_kwargs, - **kwargs + **kwargs, ) def _forward(self, *args, **kwds): return self.model.generate_batch(*args, **kwds) - + def tokenize_decode(self, tokens_out, *args, **kwargs): return [ self.tokenizer.decode(tokens_out[i].sequences_ids[0], *args, **kwargs) for i in range(len(tokens_out)) ] - - def generate(self, text: Union[str, List[str]], encode_tok_kwargs={}, decode_tok_kwargs={}, *forward_args, **forward_kwds: Any): + def generate( + self, + text: Union[str, List[str]], + encode_tok_kwargs={}, + decode_tok_kwargs={}, + *forward_args, + **forward_kwds: Any, + ): """_summary_ Args: @@ -347,5 +548,10 @@ def generate(self, text: Union[str, List[str]], encode_tok_kwargs={}, decode_tok Returns: str | List[str]: text as output, if list, same len as input """ - return super().generate(text, encode_kwargs=encode_tok_kwargs, decode_kwargs=decode_tok_kwargs, *forward_args, **forward_kwds) - + return super().generate( + text, + encode_kwargs=encode_tok_kwargs, + decode_kwargs=decode_tok_kwargs, + *forward_args, + **forward_kwds, + ) diff --git a/hf_hub_ctranslate2/util/utils.py b/hf_hub_ctranslate2/util/utils.py index de0d6f8..20f34c6 100644 --- a/hf_hub_ctranslate2/util/utils.py +++ b/hf_hub_ctranslate2/util/utils.py @@ -45,6 +45,7 @@ def _download_model( "vocabulary.txt", "tokenizer_config.json", "*ocabulary.txt", + "vocab.txt", ] return huggingface_hub.snapshot_download( diff --git a/setup.py b/setup.py index b97c5f7..153175b 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ "Operating System :: POSIX :: Linux", ], install_requires=[ - "ctranslate2>=3.13.0", + "ctranslate2>=3.16.0", "transformers>=4.28.0", "huggingface-hub", "typing_extensions", diff --git a/tests/test_translate.py b/tests/test_translate.py index 323f2e7..a47c6a2 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -1,8 +1,38 @@ -from hf_hub_ctranslate2 import TranslatorCT2fromHfHub, GeneratorCT2fromHfHub, MultiLingualTranslatorCT2fromHfHub +from hf_hub_ctranslate2 import ( + TranslatorCT2fromHfHub, + GeneratorCT2fromHfHub, + MultiLingualTranslatorCT2fromHfHub, + EncoderCT2fromHfHub, +) + from hf_hub_ctranslate2.util import utils as _utils from transformers import AutoTokenizer +def test_encoder(model_name="michaelfeil/ct2fast-e5-small-v2"): + model = EncoderCT2fromHfHub( + model_name_or_path=model_name, device="cpu", compute_type="int8" + ) + + embeddings = model.generate( + text=["I like soccer", "I like tennis", "The eiffel tower is in Paris"], + ) + assert len(embeddings) == 3 + assert len(embeddings[0]) == len(embeddings[1]) + import numpy as np + + assert isinstance(embeddings, np.ndarray) + embeddings_norm = embeddings / (embeddings**2).sum(axis=1, keepdims=True) ** 0.5 + scores = (embeddings_norm @ embeddings_norm.T) * 100 + assert 100.05 > scores[0][0] >= 99.95 + assert scores[0][0] > scores[0][1] + assert scores[0][1] > scores[0][2] + + embeddings2 = model.encode( + ["I like soccer", "I like tennis", "The eiffel tower is in Paris"], + ) + assert (embeddings2 == embeddings).all() + def test_translator(model_name="michaelfeil/ct2fast-flan-alpaca-base"): model = TranslatorCT2fromHfHub( model_name_or_path=model_name, device="cpu", compute_type="int8" @@ -17,16 +47,21 @@ def test_translator(model_name="michaelfeil/ct2fast-flan-alpaca-base"): for o in outputs: assert isinstance(o, str) + def test_multilingualtranslator(model_name="michaelfeil/ct2fast-m2m100_418M"): model = 
MultiLingualTranslatorCT2fromHfHub( - model_name_or_path=model_name, device="cpu", compute_type="int8", - tokenizer=AutoTokenizer.from_pretrained(f"facebook/{model_name.split('-')[-1]}") + model_name_or_path=model_name, + device="cpu", + compute_type="int8", + tokenizer=AutoTokenizer.from_pretrained( + f"facebook/{model_name.split('-')[-1]}" + ), ) outputs = model.generate( ["How do you call a fast Flamingo?", "Wie geht es dir?"], src_lang=["en", "de"], - tgt_lang=["de", "fr"] + tgt_lang=["de", "fr"], ) assert len(outputs) == 2 assert len(outputs[0]) != len(outputs[1]) @@ -35,6 +70,7 @@ def test_multilingualtranslator(model_name="michaelfeil/ct2fast-m2m100_418M"): for o in outputs: assert isinstance(o, str) + def test_generator(model_name="michaelfeil/ct2fast-pythia-160m"): model = GeneratorCT2fromHfHub( model_name_or_path=model_name, device="cpu", compute_type="int8" @@ -67,5 +103,4 @@ def test_generator_single(model_name="michaelfeil/ct2fast-pythia-160m"): if __name__ == "__main__": - test_generator() - test_translator() + test_encoder()
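
Minimal usage sketch of the new `EncoderCT2fromHfHub` path introduced by this patch, assuming the converted checkpoint `michaelfeil/ct2fast-e5-small-v2` exercised in `tests/test_translate.py` is available on the Hugging Face Hub. The CPU/int8 settings mirror the new `test_encoder()`; on CUDA the encoder README template in `convert.py` uses `compute_type="float16"`.

```python
import numpy as np
from hf_hub_ctranslate2 import EncoderCT2fromHfHub

# CPU + int8 as in tests/test_translate.py::test_encoder
model = EncoderCT2fromHfHub(
    model_name_or_path="michaelfeil/ct2fast-e5-small-v2",
    device="cpu",
    compute_type="int8",
)

# encode() sorts by length, batches, and (optionally) L2-normalizes the pooled outputs.
embeddings = np.asarray(
    model.encode(
        ["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
        batch_size=32,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
)

# With normalized embeddings, the dot product is the cosine similarity.
scores = embeddings @ embeddings.T
print(embeddings.shape, scores)
```

The updated converter can then be pointed at any entry from the `encoders`, `translators`, or `generators` lists in `conversion_utils/convert.py`. A hypothetical single-model run is shown below; it assumes the repository root is on `PYTHONPATH` and that the hard-coded `michaelfeil/ct2fast-*` target namespace is writable, since the script creates the repo and uploads the converted weights.

```python
from conversion_utils.convert import convert

# description="encoder" selects float16 quantization and the encoder README template;
# "generator" and "translator" use int8_float16 and their respective templates.
convert(NAME="e5-small-v2", ORG="intfloat", description="encoder")
```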