refactor: moved fit() to HuggingFaceLMFitter class
wanxinran committed Mar 18, 2024
1 parent ddec969 commit 610913f
Showing 1 changed file with 85 additions and 78 deletions.
163 changes: 85 additions & 78 deletions llments/lm/base/hugging_face.py
@@ -21,78 +21,7 @@ def __init__(
        )
        self.text_generator: TextGenerationPipeline = pipeline(
            "text-generation", model=model, device=device
        )
        self.model_name = model
        self.device = device

    def fit(
        self, target: LanguageModel, task_description: str | None = None
    ) -> LanguageModel:
        """Fit the language model to a target language model's distribution.

        Args:
            target: The language model whose distribution should be matched.
            task_description: A task description that explains more about
                what the language model being fit is doing (a prompt).

        Returns:
            The fitted language model.
        """
        inputs, labels = self._prepare_training_data(target)
        dataset = GeneratedDataset(inputs, labels)

        # TODO: use HF Trainer class to train the model


    def _prepare_training_data(self, target: LanguageModel):
        """Generate training data from the target language model using its generate() function.

        Helper function of fit().

        Args:
            target: The target language model.

        Returns:
            inputs: Generated data (an HF BatchEncoding) returned by the HF tokenizer.
            labels: The input token IDs shifted left by one position to serve as next-token targets.
        """
        # Generate samples from the target model; treat this as one batch.
        samples = target.generate(
            condition=None, do_sample=True, max_length=50, temperature=1.0, num_return_sequences=1000
        )
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError("You need to install the `transformers` package to use this method.")

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        inputs = tokenizer(samples, padding=True, truncation=True, return_tensors="pt")  # return PyTorch tensors

        # Prepare labels by shifting the input IDs one position to the left
        labels = inputs.input_ids[:, 1:].clone()
        try:
            import torch
        except ImportError:
            raise ImportError("You need to install the `torch` package to use this function.")
        labels = torch.nn.functional.pad(labels, (0, 1), value=-100)  # pad with -100 (ignore index) on the right

        # Remove the last token from input_ids to match the labels' size
        inputs.input_ids = inputs.input_ids[:, :-1]
        return inputs, labels

    def _prepare_training_dataset(self, inputs, labels):
        """Return a custom Dataset object, to be used by the HF Trainer class.

        Helper function of fit().

        Args:
            inputs: The generated inputs (HF BatchEncoding).
            labels: The labels derived from the generated inputs.

        Returns:
            A Dataset object wrapping the inputs and labels.
        """
        try:
            import torch
            from torch.utils.data import Dataset  # availability check; GeneratedDataset subclasses Dataset
        except ImportError:
            raise ImportError("You need to install the `torch` package to use this function.")
        return GeneratedDataset(inputs, labels)


    def generate(
@@ -154,8 +83,89 @@ def __getitem__(self, idx):
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
from llments.lm import LMFitter


class HuggingFaceLMFitter(LMFitter):
    """Fitter that fine-tunes a HuggingFaceLM to match a target language model's distribution."""

    def __init__(self):
        self.base = None
        self.target = None

    def fit(
        self,
        base: HuggingFaceLM,
        target: LanguageModel,
        batch_size: int,
        training_steps: int,
    ) -> LanguageModel:
        """Fit the language model to a target language model's distribution.

        Args:
            base: The HF language model to fine-tune.
            target: The language model whose distribution should be matched.
            batch_size: Number of examples processed in one step.
            training_steps: Number of steps to train.

        Returns:
            The fitted language model.
        """
        self.base = base
        self.target = target
        inputs, labels = self._prepare_training_data(target, batch_size, training_steps)
        dataset = self._prepare_training_dataset(inputs, labels)

        # TODO: use HF Trainer class to train the model
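        # A possible completion of this TODO (untested sketch, not part of this
        # commit): hand `dataset` to transformers.Trainer. The output_dir and
        # hyperparameter choices below are illustrative assumptions.
        # from transformers import Trainer, TrainingArguments
        # trainer = Trainer(
        #     model=base.text_generator.model,
        #     args=TrainingArguments(
        #         output_dir="fitter_output",
        #         max_steps=training_steps,
        #         per_device_train_batch_size=batch_size,
        #     ),
        #     train_dataset=dataset,
        # )
        # trainer.train()
        # return base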


    def _prepare_training_data(self, target: LanguageModel, batch: int, steps: int):
        """Generate training data from the target language model using its generate() function.

        Helper function of fit().

        Args:
            target: The target language model.
            batch: Number of examples processed in one step.
            steps: Number of steps to train.

        Returns:
            inputs: Generated data (an HF BatchEncoding) returned by the HF tokenizer.
            labels: The input token IDs shifted left by one position to serve as next-token targets.
        """
        try:
            import torch
        except ImportError:
            raise ImportError("You need to install the `torch` package to use this function.")

        # Generate batch * steps samples from the target model.
        samples = target.generate(
            condition=None, do_sample=True, temperature=1.0, num_return_sequences=batch * steps
        )

        tokenizer = self.base.text_generator.tokenizer
        inputs = tokenizer(samples, padding=True, truncation=True, return_tensors="pt")  # return PyTorch tensors

        # Prepare labels by shifting the input IDs one position to the left
        labels = inputs.input_ids[:, 1:].clone()
        labels = torch.nn.functional.pad(labels, (0, 1), value=-100)  # pad with -100 (ignore index) on the right
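        # e.g. input_ids [t0, t1, t2, t3] -> labels [t1, t2, t3, -100]:
        # position i predicts token i+1, and the trailing -100 is ignored by the loss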

        # Adjust input_ids by removing the last token to match the labels' size
        inputs.input_ids = inputs.input_ids[:, :-1]
        return inputs, labels

    def _prepare_training_dataset(self, inputs, labels):
        """Return a custom Dataset object, to be used by the HF Trainer class.

        Helper function of fit().

        Args:
            inputs: The generated inputs (HF BatchEncoding).
            labels: The labels derived from the generated inputs.

        Returns:
            A Dataset object wrapping the inputs and labels.
        """
        try:
            import torch
            from torch.utils.data import Dataset  # availability check; GeneratedDataset subclasses Dataset
        except ImportError:
            raise ImportError("You need to install the `torch` package to use this function.")
        return GeneratedDataset(inputs, labels)





def load_from_spec(spec_file: str) -> HuggingFaceLM:
@@ -168,10 +178,7 @@ def load_from_spec(spec_file: str) -> HuggingFaceLM:
    Returns:
        A HuggingFaceLM instance.
    """
    try:
        import json
    except ImportError:
        raise ImportError("You need to import/install json to use this function.")
    import json
    with open(spec_file, 'r') as file:
        spec = json.load(file)

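For context, a minimal sketch of how the refactored fitter might be invoked once the Trainer TODO is completed. The model names, device, and hyperparameters below are illustrative assumptions, not part of this commit:

from llments.lm.base.hugging_face import HuggingFaceLM, HuggingFaceLMFitter

# Hypothetical usage: fit a small base model to a larger target model's
# distribution. HuggingFaceLM's constructor arguments are assumed from the
# pipeline setup visible in the diff above.
base = HuggingFaceLM(model="gpt2", device="cpu")
target = HuggingFaceLM(model="gpt2-large", device="cpu")

fitter = HuggingFaceLMFitter()
fitted = fitter.fit(base, target, batch_size=16, training_steps=100)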
