
Commit

feature: HFFitter accepts evaluation dataset lm, renamed empiricalLM (#58)

* feature: HFFitter accepts evaluation dataset lm, changed empiricalLM to datasetML

* rename empiricalLM
wanxinran authored May 13, 2024
1 parent 3eca0f5 commit b0fcabd
Showing 2 changed files with 38 additions and 13 deletions.
10 changes: 5 additions & 5 deletions llments/lm/base/empirical.py → llments/lm/base/dataset_ml.py
@@ -8,7 +8,7 @@
 from llments.lm.lm import LanguageModel
 
 
-class EmpiricalLM(LanguageModel):
+class DatasetLM(LanguageModel):
     """An empirical distribution of text data."""
 
     def __init__(self, data: list[str], probs: list[float] | None = None):
@@ -121,14 +121,14 @@ def set_seed(self, seed: int) -> None:
         random.seed(seed)
 
 
-def load_from_text_file(text_file: str) -> EmpiricalLM:
+def load_from_text_file(text_file: str) -> DatasetLM:
     """Load the distribution from a text file."""
     with open(text_file, "r") as f:
-        return EmpiricalLM(f.readlines())
+        return DatasetLM(f.readlines())
 
 
-def load_from_json_file(json_file: str) -> EmpiricalLM:
+def load_from_json_file(json_file: str) -> DatasetLM:
     """Load the distribution from a JSON file."""
     with open(json_file, "r") as f:
         data = json.load(f)
-        return EmpiricalLM([x["text"] for x in data], [x["prob"] for x in data])
+        return DatasetLM([x["text"] for x in data], [x["prob"] for x in data])
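
For orientation, a minimal usage sketch of the renamed module (not part of this commit). The file paths are illustrative assumptions, and the generate call mirrors the one made on eval_target in the hugging_face.py diff below, since DatasetLM is a LanguageModel subclass.

from llments.lm.base.dataset_ml import load_from_json_file, load_from_text_file

# Plain-text corpus: one example per line, uniform probabilities.
lm_from_text = load_from_text_file("corpus.txt")  # hypothetical path

# JSON corpus: records shaped like {"text": ..., "prob": ...}, as read above.
lm_from_json = load_from_json_file("corpus_with_probs.json")  # hypothetical path

# Sample from the empirical distribution, using the same LanguageModel.generate
# signature the fitter uses for eval_target.
samples = lm_from_json.generate(
    condition=None, do_sample=True, temperature=1.0, num_return_sequences=4
)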
41 changes: 33 additions & 8 deletions llments/lm/base/hugging_face.py
@@ -201,6 +201,7 @@ def fit(
         cls,
         base: HuggingFaceLM,
         target: LanguageModel,
+        eval_target: LanguageModel | None = None,
         batch_size: int = 8, # batch size per device
         training_steps: int = 200,
         output_dir: str = "./training_results", # ie. checkpoint_dir
@@ -221,6 +222,7 @@ def fit(
         Args:
             base: The HF language model to fine-tune.
             target: The language model that should be fitted to.
+            eval_target: The language model used to evaluate the training process.
             batch_size: The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for training and evaluation.
             training_steps: Number of training steps.
             training_epochs: Number of iterations to go through the entire dataset.
@@ -270,6 +272,18 @@ def fit(
 
         # convert tokenized text into a Dataset object
         dataset = Dataset.from_dict(inputs)
+        if eval_target:
+            eval_samples = eval_target.generate(
+                condition=None,
+                do_sample=True,
+                temperature=1.0,
+                num_return_sequences=batch_size * training_steps,
+            )
+
+            eval_inputs = base.tokenizer(
+                eval_samples, padding=True, truncation=True, return_tensors="pt"
+            )
+            eval_dataset = Dataset.from_dict(eval_inputs)
 
         training_args = TrainingArguments(
             output_dir=output_dir,
@@ -295,14 +309,25 @@ def fit(
         if not os.path.exists(logging_dir):
             os.makedirs(logging_dir)
 
-        trainer = Trainer(
-            model=base.model,
-            args=training_args,
-            data_collator=DataCollatorForLanguageModeling(
-                tokenizer=base.tokenizer, mlm=False
-            ),
-            train_dataset=dataset,
-        )
+        if not do_eval:
+            trainer = Trainer(
+                model=base.model,
+                args=training_args,
+                data_collator=DataCollatorForLanguageModeling(
+                    tokenizer=base.tokenizer, mlm=False
+                ),
+                train_dataset=dataset,
+            )
+        else:
+            trainer = Trainer(
+                model=base.model,
+                args=training_args,
+                data_collator=DataCollatorForLanguageModeling(
+                    tokenizer=base.tokenizer, mlm=False
+                ),
+                train_dataset=dataset,
+                eval_dataset=eval_dataset,
+            )
 
         trainer.train()
         base.tokenizer.save_pretrained(output_dir)
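
For context, a hedged sketch of how the new eval_target argument might be exercised. It assumes the fitter class is named HFFitter as in the commit title, that HuggingFaceLM can be constructed from a Hugging Face model name, and that the do_eval flag branched on above is defined in a part of fit() outside the shown hunks, so evaluation presumably also has to be enabled there.

from llments.lm.base.dataset_ml import load_from_json_file
from llments.lm.base.hugging_face import HuggingFaceLM, HFFitter  # class names assumed from the commit title

base = HuggingFaceLM("gpt2")                         # assumed constructor arguments
target = load_from_json_file("train_dist.json")      # hypothetical training distribution
eval_target = load_from_json_file("eval_dist.json")  # hypothetical held-out distribution

# eval_target is sampled, tokenized into eval_dataset, and handed to the HF
# Trainer only when evaluation is enabled inside fit().
HFFitter.fit(
    base,
    target,
    eval_target=eval_target,
    batch_size=8,
    training_steps=200,
    output_dir="./training_results",
)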
