From 69129a70813a26fc34fa327b59c9f8ee1f8bd26f Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 29 Jun 2025 11:55:07 +0530 Subject: [PATCH 1/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/data/phonebook.py | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 lm_engine/data/phonebook.py diff --git a/lm_engine/data/phonebook.py b/lm_engine/data/phonebook.py new file mode 100644 index 000000000..ac472281c --- /dev/null +++ b/lm_engine/data/phonebook.py @@ -0,0 +1,80 @@ +# ************************************************** +# Copyright (c) 2025, Mayank Mishra +# ************************************************** + +from __future__ import annotations + +import itertools +import random +import string + +import torch + +from ..tokenizers import TOKENIZER_TYPE +from .base import BaseDataset + + +def sample_phonebook(tokenizer: TOKENIZER_TYPE, phonebook_size: int, name_length: int = 5, phone_digits: int = 8): + num_total_names = 26**name_length + num_phone_numbers = 10**phone_digits + + assert ( + min(num_total_names, num_phone_numbers) >= phonebook_size + ), f"either {num_total_names} or {num_phone_numbers} is too small!" + + name_iter = list(itertools.product(list(string.ascii_lowercase), repeat=name_length)) + phone_iter = list(itertools.product(list(string.digits), repeat=phone_digits)) + + random.shuffle(name_iter) + random.shuffle(phone_iter) + + ret = [] + for i, name in enumerate(name_iter): + if i == phonebook_size: + # stack and shuffle + return torch.vstack(ret) + ret.append(tokenizer(f'${"".join(name)}|{"".join(phone_iter[i])}.')) + return torch.vstack(ret) + + +class PhonebookDataset(BaseDataset): + def __init__( + self, tokenizer: TOKENIZER_TYPE, phonebook_size: int, name_length: int = 5, num_digits: int = 8 + ) -> PhonebookDataset: + num_total_names = 26**name_length + num_phone_numbers = 10**num_digits + + assert ( + min(num_total_names, num_phone_numbers) >= phonebook_size + ), f"either {num_total_names} or {num_phone_numbers} is too small!" 
+ + name_iter = list(itertools.product(list(string.ascii_lowercase), repeat=name_length)) + phone_iter = list(itertools.product(list(string.digits), repeat=num_digits)) + + random.shuffle(name_iter) + random.shuffle(phone_iter) + + ret = [] + for i, name in enumerate(name_iter): + if i == phonebook_size: + # stack and shuffle + return torch.vstack(ret) + ret.append(tokenizer(f'${"".join(name)}|{"".join(phone_iter[i])}.')) + return torch.vstack(ret) + + def __len__(self): + return len(self.phonebook_dict["input_ids"]) + + def __getitem__(self, idx): + if "global_token_unique_index" in self.phonebook_dict: + return { + "input_ids": self.phonebook_dict["input_ids"][idx], + "mask": self.phonebook_dict["mask"][idx], + "global_seq_unique_index": self.phonebook_dict["global_seq_unique_index"][idx], + "global_token_unique_index": self.phonebook_dict["global_token_unique_index"][idx], + "global_token_per_seq_index": self.phonebook_dict["global_token_per_seq_index"][idx], + "global_seq_random_unique_number": self.phonebook_dict["global_seq_random_unique_number"][idx], + "global_token_random_unique_number": self.phonebook_dict["global_token_random_unique_number"][idx], + } + else: + return {"input_ids": self.phonebook_dict["input_ids"][idx], "mask": self.phonebook_dict["mask"][idx]} From b30296f386cb8788caa2822d00cd15c77cc22fc9 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 29 Jun 2025 12:00:35 +0530 Subject: [PATCH 2/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/data/phonebook.py | 50 +++++++++++-------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/lm_engine/data/phonebook.py b/lm_engine/data/phonebook.py index ac472281c..75fe35444 100644 --- a/lm_engine/data/phonebook.py +++ b/lm_engine/data/phonebook.py @@ -14,32 +14,9 @@ from .base import BaseDataset -def sample_phonebook(tokenizer: TOKENIZER_TYPE, phonebook_size: int, name_length: int = 5, phone_digits: int = 8): - num_total_names = 26**name_length - num_phone_numbers = 10**phone_digits - - assert ( - min(num_total_names, num_phone_numbers) >= phonebook_size - ), f"either {num_total_names} or {num_phone_numbers} is too small!" - - name_iter = list(itertools.product(list(string.ascii_lowercase), repeat=name_length)) - phone_iter = list(itertools.product(list(string.digits), repeat=phone_digits)) - - random.shuffle(name_iter) - random.shuffle(phone_iter) - - ret = [] - for i, name in enumerate(name_iter): - if i == phonebook_size: - # stack and shuffle - return torch.vstack(ret) - ret.append(tokenizer(f'${"".join(name)}|{"".join(phone_iter[i])}.')) - return torch.vstack(ret) - - class PhonebookDataset(BaseDataset): def __init__( - self, tokenizer: TOKENIZER_TYPE, phonebook_size: int, name_length: int = 5, num_digits: int = 8 + self, tokenizer: TOKENIZER_TYPE, phonebook_size: int, name_length: int, num_digits: int ) -> PhonebookDataset: num_total_names = 26**name_length num_phone_numbers = 10**num_digits @@ -48,19 +25,22 @@ def __init__( min(num_total_names, num_phone_numbers) >= phonebook_size ), f"either {num_total_names} or {num_phone_numbers} is too small!" 
- name_iter = list(itertools.product(list(string.ascii_lowercase), repeat=name_length)) - phone_iter = list(itertools.product(list(string.digits), repeat=num_digits)) + names = list(itertools.product(list(string.ascii_lowercase), repeat=name_length)) + phone_numbers = list(itertools.product(list(string.digits), repeat=num_digits)) + + random.shuffle(names) + random.shuffle(phone_numbers) + + names = names[:phonebook_size] + phone_numbers = phone_numbers[:phonebook_size] - random.shuffle(name_iter) - random.shuffle(phone_iter) + self.dataset = [] + for i in range(phonebook_size): + sample = "".join(names[i]) + "" + "".join(phone_numbers[i]) + sample = tokenizer(sample, add_special_tokens=False) + sample += [tokenizer.eos_token_id] - ret = [] - for i, name in enumerate(name_iter): - if i == phonebook_size: - # stack and shuffle - return torch.vstack(ret) - ret.append(tokenizer(f'${"".join(name)}|{"".join(phone_iter[i])}.')) - return torch.vstack(ret) + self.dataset.append(sample) def __len__(self): return len(self.phonebook_dict["input_ids"]) From 515f5109a7a335355d42e679027b0d54782cc159 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 29 Jun 2025 12:23:16 +0530 Subject: [PATCH 3/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/arguments.py | 7 +--- lm_engine/data/base.py | 6 +--- lm_engine/data/phonebook.py | 71 ++++++++++++++++++++++++------------- 3 files changed, 48 insertions(+), 36 deletions(-) diff --git a/lm_engine/arguments.py b/lm_engine/arguments.py index 34ed5fd5f..2ae0c9ecf 100644 --- a/lm_engine/arguments.py +++ b/lm_engine/arguments.py @@ -191,12 +191,7 @@ class OptimizerArgs(BaseArgs): # backward hooked optimizer use_optimizer_with_backward_hook: bool = False # class args for optimizer - class_args: dict = { - "lr": 1e-5, - "weight_decay": 0.1, - "betas": [0.9, 0.95], - "eps": 1e-10, - } + class_args: dict = {"lr": 1e-5, "weight_decay": 0.1, "betas": [0.9, 0.95], "eps": 1e-10} def model_post_init(self, __context: Any) -> None: _check_not_None([(self.class_name, "optimizer class_name")]) diff --git a/lm_engine/data/base.py b/lm_engine/data/base.py index 01ae4e7c2..84ce329ec 100644 --- a/lm_engine/data/base.py +++ b/lm_engine/data/base.py @@ -39,11 +39,7 @@ def __init__( self.do_format_output = self.output_format != OUTPUT_FORMAT # length to use for trimming (excludes eos) - if max_input_tokens is None: - self.max_input_tokens = None - else: - self.max_input_tokens = max_input_tokens - + self.max_input_tokens = max_input_tokens self.max_output_tokens = None if max_output_tokens is None else max_output_tokens - 1 self.examples = [] diff --git a/lm_engine/data/phonebook.py b/lm_engine/data/phonebook.py index 75fe35444..59250edd5 100644 --- a/lm_engine/data/phonebook.py +++ b/lm_engine/data/phonebook.py @@ -8,53 +8,74 @@ import random import string -import torch +from tqdm import trange +from ..enums import DatasetSplit, Mode from ..tokenizers import TOKENIZER_TYPE from .base import BaseDataset class PhonebookDataset(BaseDataset): def __init__( - self, tokenizer: TOKENIZER_TYPE, phonebook_size: int, name_length: int, num_digits: int + self, + class_args: dict, + split: DatasetSplit, + mode: Mode, + tokenizer: TOKENIZER_TYPE, + data_name: str, + input_format: str, + output_format: str, + max_input_tokens: int, + max_output_tokens: int, + seed: int, ) -> PhonebookDataset: + super().__init__( + class_args=class_args, + split=split, + mode=mode, + tokenizer=tokenizer, + data_name=data_name, + input_format=input_format, + output_format=output_format, + 
max_input_tokens=max_input_tokens, + max_output_tokens=max_output_tokens, + ) + + assert not self.do_format_input + assert not self.do_format_output + assert self.max_input_tokens is None + assert self.max_output_tokens is None + + name_length = self.class_args["name_length"] + num_digits = self.class_args["num_digits"] + seed = self.class_args.get("seed", 42) + num_total_names = 26**name_length num_phone_numbers = 10**num_digits + self.phonebook_size = self.class_args.get("phonebook_size", min(num_total_names, num_phone_numbers)) + assert ( - min(num_total_names, num_phone_numbers) >= phonebook_size + min(num_total_names, num_phone_numbers) >= self.phonebook_size ), f"either {num_total_names} or {num_phone_numbers} is too small!" names = list(itertools.product(list(string.ascii_lowercase), repeat=name_length)) phone_numbers = list(itertools.product(list(string.digits), repeat=num_digits)) - random.shuffle(names) - random.shuffle(phone_numbers) + local_random = random.Random(seed) + local_random.shuffle(names) + local_random.shuffle(phone_numbers) - names = names[:phonebook_size] - phone_numbers = phone_numbers[:phonebook_size] + names = names[: self.phonebook_size] + phone_numbers = phone_numbers[: self.phonebook_size] - self.dataset = [] - for i in range(phonebook_size): + self.examples = [] + for i in trange(self.phonebook_size): sample = "".join(names[i]) + "" + "".join(phone_numbers[i]) sample = tokenizer(sample, add_special_tokens=False) sample += [tokenizer.eos_token_id] - self.dataset.append(sample) + self.examples.append(sample) def __len__(self): - return len(self.phonebook_dict["input_ids"]) - - def __getitem__(self, idx): - if "global_token_unique_index" in self.phonebook_dict: - return { - "input_ids": self.phonebook_dict["input_ids"][idx], - "mask": self.phonebook_dict["mask"][idx], - "global_seq_unique_index": self.phonebook_dict["global_seq_unique_index"][idx], - "global_token_unique_index": self.phonebook_dict["global_token_unique_index"][idx], - "global_token_per_seq_index": self.phonebook_dict["global_token_per_seq_index"][idx], - "global_seq_random_unique_number": self.phonebook_dict["global_seq_random_unique_number"][idx], - "global_token_random_unique_number": self.phonebook_dict["global_token_random_unique_number"][idx], - } - else: - return {"input_ids": self.phonebook_dict["input_ids"][idx], "mask": self.phonebook_dict["mask"][idx]} + return self.phonebook_size From cb08347e287a4ced44af65c943129315fd3212d1 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 29 Jun 2025 17:02:09 +0530 Subject: [PATCH 4/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/data/phonebook.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lm_engine/data/phonebook.py b/lm_engine/data/phonebook.py index 59250edd5..db3f92c15 100644 --- a/lm_engine/data/phonebook.py +++ b/lm_engine/data/phonebook.py @@ -27,7 +27,6 @@ def __init__( output_format: str, max_input_tokens: int, max_output_tokens: int, - seed: int, ) -> PhonebookDataset: super().__init__( class_args=class_args, @@ -77,5 +76,5 @@ def __init__( self.examples.append(sample) - def __len__(self): + def __len__(self) -> int: return self.phonebook_size From c82b0c1f15b5af1c8df1653ebfdb1a48f281741a Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 29 Jun 2025 17:15:17 +0530 Subject: [PATCH 5/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/tokenizers/alpha_numeric.py | 28 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git 
a/lm_engine/tokenizers/alpha_numeric.py b/lm_engine/tokenizers/alpha_numeric.py index 569b7de35..d89372ac4 100644 --- a/lm_engine/tokenizers/alpha_numeric.py +++ b/lm_engine/tokenizers/alpha_numeric.py @@ -71,19 +71,27 @@ def __call__( def _get_token_id(self, x: str) -> None: assert isinstance(x, str) - assert len(x) == 1 - xid = ord(x) - - if self._0 <= xid <= self._9: - y = xid - self._0 - elif self.a <= xid <= self.z: - y = xid - self.a + 10 - elif self.A <= xid <= self.Z: - y = xid - self.A + 36 - elif xid == self.eos_token: + if len(x) == 1: + xid = ord(x) + + if self._0 <= xid <= self._9: + y = xid - self._0 + elif self.a <= xid <= self.z: + y = xid - self.a + 10 + elif self.A <= xid <= self.Z: + y = xid - self.A + 36 + else: + raise ValueError(f"unexpected token ({x})") + elif x == self.eos_token: y = self.eos_token_id + elif x in self.special_tokens: + y = self.special_tokens[x] else: raise ValueError(f"unexpected token ({x})") return y + + def add_special_tokens(self, special_tokens: dict) -> None: + for i, token in enumerate(special_tokens["additional_special_tokens"]): + self.special_tokens[token] = self.eos_token_id + i + 1 From 5d3ff60dde35f791805a9a11b80b83ca82cec230 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 29 Jun 2025 17:54:34 +0530 Subject: [PATCH 6/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/data/__init__.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/lm_engine/data/__init__.py b/lm_engine/data/__init__.py index dd01bf10e..eb85e1aee 100644 --- a/lm_engine/data/__init__.py +++ b/lm_engine/data/__init__.py @@ -19,20 +19,23 @@ from .ibm import get_ibm_dataloaders from .instruction_tuning import AlpacaDataset, DollyDataset, SlimOrcaDataset from .megatron import get_megatron_gpt_dataloaders +from .phonebook import PhonebookDataset from .sampler import BlendedDistributedSampler from .sst2 import SST2Dataset from .utils import collate_fn, custom_iterator, get_next_batch -_DATASETS_LIST = { - "AlpacaDataset": AlpacaDataset, - "DebugDataset": DebugDataset, - "DollyDataset": DollyDataset, - "HuggingFaceDataset": HuggingFaceDataset, - "SlimOrcaDataset": SlimOrcaDataset, - "SST2Dataset": SST2Dataset, +_FINETUNING_DATASETS_LIST = { + AlpacaDataset.__name__: AlpacaDataset, + DebugDataset.__name__: DebugDataset, + DollyDataset.__name__: DollyDataset, + HuggingFaceDataset.__name__: HuggingFaceDataset, + SlimOrcaDataset.__name__: SlimOrcaDataset, + SST2Dataset.__name__: SST2Dataset, } +_PRETRAINING_DATASETS_LIST = {PhonebookDataset.__name__: PhonebookDataset} + def get_datasets_list( dataset_args_list: list[DatasetArgs], split: DatasetSplit, mode: Mode, tokenizer: TOKENIZER_TYPE @@ -55,10 +58,10 @@ def get_datasets_list( datasets_list = [] data_sampling_ratios = [] for data_args in dataset_args_list: - if data_args.class_name not in _DATASETS_LIST: + if data_args.class_name not in _FINETUNING_DATASETS_LIST: raise ValueError(f"invalid class_name ({data_args.class_name}) for dataset") - dataset = _DATASETS_LIST[data_args.class_name]( + dataset = _FINETUNING_DATASETS_LIST[data_args.class_name]( class_args=data_args.class_args, split=split, mode=mode, @@ -122,9 +125,15 @@ def get_finetuning_dataloader( def get_pretraining_dataloaders( args: TrainingArgs, tokenizer: TOKENIZER_TYPE, consumed_samples: int ) -> tuple[ResumableDataLoader, list[ResumableDataLoader], list[ResumableDataLoader]]: - if args.datasets[0].class_name == "MegatronDataset": + assert len(args.datasets) == 1 + class_name = 
args.datasets[0].class_name + + if class_name in _PRETRAINING_DATASETS_LIST: + _PRETRAINING_DATASETS_LIST[class_name]() + + if class_name == "MegatronDataset": dataloaders = get_megatron_gpt_dataloaders(args, tokenizer, consumed_samples=consumed_samples) - elif args.datasets[0].class_name == "IBMDataset": + elif class_name == "IBMDataset": dataloaders = get_ibm_dataloaders(args, tokenizer) return dataloaders From 19d0006f155578c10feefbb2c106d75a4b94d4e8 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Mon, 30 Jun 2025 04:45:59 +0530 Subject: [PATCH 7/7] tokenizer Signed-off-by: Mayank Mishra --- lm_engine/data/__init__.py | 43 ++++++++++++++++++++++++++++++------- lm_engine/data/phonebook.py | 5 ++++- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/lm_engine/data/__init__.py b/lm_engine/data/__init__.py index eb85e1aee..3e671a0f9 100644 --- a/lm_engine/data/__init__.py +++ b/lm_engine/data/__init__.py @@ -25,7 +25,7 @@ from .utils import collate_fn, custom_iterator, get_next_batch -_FINETUNING_DATASETS_LIST = { +_FINETUNING_DATASETS_MAPPING = { AlpacaDataset.__name__: AlpacaDataset, DebugDataset.__name__: DebugDataset, DollyDataset.__name__: DollyDataset, @@ -34,7 +34,7 @@ SST2Dataset.__name__: SST2Dataset, } -_PRETRAINING_DATASETS_LIST = {PhonebookDataset.__name__: PhonebookDataset} +_PRETRAINING_DATASETS_MAPPING = {PhonebookDataset.__name__: PhonebookDataset} def get_datasets_list( @@ -58,10 +58,10 @@ def get_datasets_list( datasets_list = [] data_sampling_ratios = [] for data_args in dataset_args_list: - if data_args.class_name not in _FINETUNING_DATASETS_LIST: + if data_args.class_name not in _FINETUNING_DATASETS_MAPPING: raise ValueError(f"invalid class_name ({data_args.class_name}) for dataset") - dataset = _FINETUNING_DATASETS_LIST[data_args.class_name]( + dataset = _FINETUNING_DATASETS_MAPPING[data_args.class_name]( class_args=data_args.class_args, split=split, mode=mode, @@ -123,15 +123,42 @@ def get_finetuning_dataloader( def get_pretraining_dataloaders( - args: TrainingArgs, tokenizer: TOKENIZER_TYPE, consumed_samples: int + args: TrainingArgs, tokenizer: TOKENIZER_TYPE, consumed_samples: int, mode: Mode ) -> tuple[ResumableDataLoader, list[ResumableDataLoader], list[ResumableDataLoader]]: assert len(args.datasets) == 1 class_name = args.datasets[0].class_name - if class_name in _PRETRAINING_DATASETS_LIST: - _PRETRAINING_DATASETS_LIST[class_name]() + if class_name in _PRETRAINING_DATASETS_MAPPING: + assert args.load_args is None - if class_name == "MegatronDataset": + train_dataloader = _PRETRAINING_DATASETS_MAPPING[class_name]( + class_args=args.datasets[0].class_args, + split=DatasetSplit.train, + mode=mode, + tokenizer=tokenizer, + data_name="", + input_format="__input__", + output_format="__output__", + max_input_tokens=None, + max_output_tokens=None, + ) + + val_dataloaders = [ + _PRETRAINING_DATASETS_MAPPING[class_name]( + class_args=args.datasets[0].class_args, + split=DatasetSplit.val, + mode=mode, + tokenizer=tokenizer, + data_name="", + input_format="__input__", + output_format="__output__", + max_input_tokens=None, + max_output_tokens=None, + ) + ] + + dataloaders = (train_dataloader, val_dataloaders, val_dataloaders) + elif class_name == "MegatronDataset": dataloaders = get_megatron_gpt_dataloaders(args, tokenizer, consumed_samples=consumed_samples) elif class_name == "IBMDataset": dataloaders = get_ibm_dataloaders(args, tokenizer) diff --git a/lm_engine/data/phonebook.py b/lm_engine/data/phonebook.py index db3f92c15..a9b81298d 100644 --- 
a/lm_engine/data/phonebook.py +++ b/lm_engine/data/phonebook.py @@ -40,10 +40,13 @@ def __init__( max_output_tokens=max_output_tokens, ) + self.separator_token = "" + assert not self.do_format_input assert not self.do_format_output assert self.max_input_tokens is None assert self.max_output_tokens is None + assert self.separator_token in tokenizer.get_vocab() name_length = self.class_args["name_length"] num_digits = self.class_args["num_digits"] @@ -70,7 +73,7 @@ def __init__( self.examples = [] for i in trange(self.phonebook_size): - sample = "".join(names[i]) + "" + "".join(phone_numbers[i]) + sample = "".join(names[i]) + self.separator_token + "".join(phone_numbers[i]) sample = tokenizer(sample, add_special_tokens=False) sample += [tokenizer.eos_token_id]
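
Editor's illustration. Patch 5/7 refactors the character-level alphanumeric tokenizer, and its id scheme is only visible through the offsets in _get_token_id. The sketch below is not the repo's AlphaNumericTokenizer; the class name, the get_vocab/__call__ signatures, and the choice of EOS id (62, right after the character ids) are assumptions made for illustration. What it does reflect from the diff: digits map to ids 0-9, lowercase letters to 10-35, uppercase letters to 36-61, the EOS string to its own id, and tokens registered via add_special_tokens to ids immediately after EOS.

import string

class CharTokenizerSketch:
    def __init__(self) -> None:
        # ids 0-9 for digits, 10-35 for lowercase, 36-61 for uppercase
        # (matches the offsets in the patched _get_token_id); the EOS id
        # placement right after the characters is an assumption of this sketch.
        chars = string.digits + string.ascii_lowercase + string.ascii_uppercase
        self.vocab = {c: i for i, c in enumerate(chars)}
        self.eos_token = "<eos>"
        self.eos_token_id = len(self.vocab)  # 62 in this sketch
        self.special_tokens = {}

    def add_special_tokens(self, special_tokens: dict) -> None:
        # mirrors the patch: added special tokens get ids immediately after EOS
        for i, token in enumerate(special_tokens["additional_special_tokens"]):
            self.special_tokens[token] = self.eos_token_id + i + 1

    def get_vocab(self) -> dict:
        return {**self.vocab, self.eos_token: self.eos_token_id, **self.special_tokens}

    def __call__(self, text: str, add_special_tokens: bool = True) -> list:
        ids = [self._get_token_id(c) for c in text]
        if add_special_tokens:
            ids.append(self.eos_token_id)
        return ids

    def _get_token_id(self, x: str) -> int:
        # single characters first, then the EOS string, then registered special tokens
        if len(x) == 1 and x in self.vocab:
            return self.vocab[x]
        if x == self.eos_token:
            return self.eos_token_id
        if x in self.special_tokens:
            return self.special_tokens[x]
        raise ValueError(f"unexpected token ({x})")

For example, after tok = CharTokenizerSketch(); tok.add_special_tokens({"additional_special_tokens": ["|"]}), calling tok("ab12|", add_special_tokens=False) returns [10, 11, 1, 2, 63] under this sketch's id assignment ("|" is a stand-in separator, since the actual separator character is elided in this copy of the diff).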
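
Editor's illustration. The phonebook construction itself (patches 1-4 and 7/7) reduces to: enumerate all candidate names and phone numbers, shuffle both with a seeded local RNG so every process builds the identical book, keep the first phonebook_size of each, and tokenize "name + separator + number" followed by an EOS id per entry. The function below is a minimal standalone sketch, not the repo's PhonebookDataset API: the name, signature, and the "|" separator are placeholders (the real separator token appears blank in this copy of the diff), and the small defaults avoid enumerating 26**5 names.

import itertools
import random
import string

def build_phonebook(tokenize, eos_token_id, phonebook_size,
                    name_length=3, num_digits=4, separator="|", seed=42):
    # Enumerating every candidate gets expensive for large name_length / num_digits;
    # the defaults here are kept small for illustration.
    num_total_names = 26 ** name_length
    num_phone_numbers = 10 ** num_digits
    assert min(num_total_names, num_phone_numbers) >= phonebook_size, (
        f"either {num_total_names} or {num_phone_numbers} is too small!"
    )

    names = list(itertools.product(string.ascii_lowercase, repeat=name_length))
    numbers = list(itertools.product(string.digits, repeat=num_digits))

    # seeded local RNG, as in patch 3/7, so the shuffle is reproducible
    rng = random.Random(seed)
    rng.shuffle(names)
    rng.shuffle(numbers)

    examples = []
    for name, number in zip(names[:phonebook_size], numbers[:phonebook_size]):
        sample = "".join(name) + separator + "".join(number)
        ids = tokenize(sample)        # expected: str -> list[int], no specials added
        ids.append(eos_token_id)      # terminate each entry with EOS, as in the patch
        examples.append(ids)
    return examples

With the sketch tokenizer above, build_phonebook(lambda s: tok(s, add_special_tokens=False), tok.eos_token_id, phonebook_size=100) yields 100 token sequences, each of length name_length + 1 + num_digits + 1.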