diff --git a/llments/lm/base/hugging_face.py b/llments/lm/base/hugging_face.py
index 7f56f38..359940e 100644
--- a/llments/lm/base/hugging_face.py
+++ b/llments/lm/base/hugging_face.py
@@ -21,78 +21,7 @@ def __init__(
         )
         self.text_generator: TextGenerationPipeline = pipeline(
             "text-generation", model=model, device=device
-        )
-        self.model_name = model
-        self.device = device
-
-    def fit(
-        self, target: LanguageModel, task_description: str | None = None
-    ) -> LanguageModel:
-        """Fit the language model to a target language model's distribution.
-
-        Args:
-            target: The language model that should be fitted to.
-            task_description: A task description that explains more about
-                what the language model that should be fit is doing (a prompt).
-
-        Returns:
-            The fitted language model.
-        """
-        inputs, labels = self._prepare_training_data(target)
-        dataset = GeneratedDataset(inputs, labels)
-
-        # TODO: use HF Trainer class to train the model
-
-
-    def _prepare_training_data(self, target: LanguageModel):
-        """Generate data from the target language model, using generate() function.
-
-        Helper function of fit().
-        Args:
-            target: target language model.
-        Returns:
-            inputs: Generated data (type: HF BatchEncoding): result from calling HF tokenizer.
-            labels: "Up shift" each token to create the labels.
-        """
-        # Generate samples from the target model, consider this as one batch.
-        samples = target.generate(condition=None, do_sample=True, max_length=50, temperature=1.0, num_return_sequences=1000)
-        try:
-            from transformers import AutoTokenizer
-        except ImportError:
-            raise ImportError("You need to install the `transformers` package to use this method.")
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        inputs = tokenizer(samples, padding=True, truncation=True, return_tensors="pt")  # return pytorch tensor
-
-        # Prepare labels by shifting
-        labels = inputs.input_ids[:, 1:].clone()
-        try:
-            import torch
-        except:
-            raise ImportError("You need to install/import 'torch' package to use this function.")
-        labels = torch.nn.functional.pad(labels, (0, 1), value=-100)  # Pad with -100 on the right
-
-        # Adjust input_ids by removing the last token to match labels' size
-        inputs.input_ids = inputs.input_ids[:, :-1]
-        return inputs, labels
-
-    def _prepare_training_dataset(self, inputs, labels):
-        """Return customized Dataset object, to be used in HF Trainer class.
-
-        Helper function of fit()
-        Args:
-            inputs: generate inputs
-            labels: labels from generate inputs
-        Returns:
-            Dataset object
-        """
-
-        try:
-            import torch
-            from torch.utils.data import Dataset
-        except:
-            raise ImportError("You need both 'torch' and 'torch.utils.data' packages to use this function.")
-        return GeneratedDataset(inputs, labels)
+        )


     def generate(
@@ -154,8 +83,89 @@ def __getitem__(self, idx):
         item['labels'] = torch.tensor(self.labels[idx])
         return item

-    def __len__(self):
-        return len(self.labels)
+from llments.lm import LMFitter
+class HuggingFaceLMFitter(LMFitter):
+    def __init__(self):
+        self.base = None
+        self.target = None
+
+    def fit(
+        self,
+        base: HuggingFaceLM,
+        target: LanguageModel,
+        batch_size: int,
+        training_steps: int
+    ) -> LanguageModel:
+        """Fit the language model to a target language model's distribution.
+
+        Args:
+            base: The HF language model to fine-tune.
+            target: The language model that should be fitted to.
+            batch_size: Number of examples processed in one step.
+            training_steps: Number of steps to train.
+
+        Returns:
+            The fitted language model.
+        """
+        self.base = base
+        self.target = target
+        inputs, labels = self._prepare_training_data(target, batch_size, training_steps)
+        dataset = self._prepare_training_dataset(inputs, labels)
+
+        # TODO: use HF Trainer class to train the model
+
+
+    def _prepare_training_data(self, target: LanguageModel, batch, steps):
+        """Generate data from the target language model, using its generate() function.
+
+        Helper function of fit().
+        Args:
+            target: target language model.
+            batch: Number of examples processed in one step.
+            steps: Number of steps to train.
+        Returns:
+            inputs: Generated data (type: HF BatchEncoding), the result of calling the HF tokenizer.
+            labels: Labels created by shifting each input token one position ("up shift").
+        """
+        try:
+            import torch
+        except ImportError:
+            raise ImportError("You need to install/import the 'torch' package to use this function.")
+
+        # Generate samples from the target model; consider this as one batch.
+        samples = target.generate(condition=None, do_sample=True, temperature=1.0, num_return_sequences=batch * steps)
+
+        tokenizer = self.base.text_generator.tokenizer
+        inputs = tokenizer(samples, padding=True, truncation=True, return_tensors="pt")  # return PyTorch tensors
+
+        # Prepare labels by shifting
+        labels = inputs.input_ids[:, 1:].clone()
+        labels = torch.nn.functional.pad(labels, (0, 1), value=-100)  # Pad with -100 on the right
+
+        # Adjust input_ids by removing the last token to match the labels' size
+        inputs.input_ids = inputs.input_ids[:, :-1]
+        return inputs, labels
+
+    def _prepare_training_dataset(self, inputs, labels):
+        """Return a customized Dataset object, to be used in the HF Trainer class.
+
+        Helper function of fit().
+        Args:
+            inputs: generated inputs
+            labels: labels from the generated inputs
+        Returns:
+            Dataset object
+        """
+
+        try:
+            import torch
+            from torch.utils.data import Dataset
+        except ImportError:
+            raise ImportError("You need both the 'torch' and 'torch.utils.data' packages to use this function.")
+        return GeneratedDataset(inputs, labels)
+
+
+


 def load_from_spec(spec_file: str) -> HuggingFaceLM:
@@ -168,10 +178,7 @@ def load_from_spec(spec_file: str) -> HuggingFaceLM:
     Returns:
         A HuggingFaceLM instance.
     """
-    try:
-        import json
-    except ImportError:
-        raise ImportError("You need to import/install json to use this function.")
+    import json

     with open(spec_file, 'r') as file:
         spec = json.load(file)