diff --git a/config/config.py b/config/config.py
index 8059298..1cee163 100644
--- a/config/config.py
+++ b/config/config.py
@@ -1,10 +1,11 @@
-from typing import Optional
+from typing import Optional, ClassVar
 from dataclasses import dataclass, field
 
 @dataclass
 class CFG:
 
     """
+    Configuration class for the LaMDA model.
     """
 
     num_tokens: int = field(
@@ -32,3 +33,90 @@ class CFG:
         metadata = {'help': 'dimension of the head'}
     )
 
+    """
+    Configuration for data loader.
+    """
+
+    train_dataset_name: Optional[str] = field(
+        default="",
+        metadata={"help": "Path to Hugging Face training dataset."}
+    )
+
+    eval_dataset_name: Optional[str] = field(
+        default="",
+        metadata={"help": "Path to Hugging Face validation dataset."}
+    )
+
+    choose_train_split: Optional[str] = field(
+        default="",
+        metadata={"help": "Choose Hugging Face training dataset split."}
+    )
+
+    choose_eval_split: Optional[str] = field(
+        default="",
+        metadata={"help": "Choose Hugging Face validation dataset split."}
+    )
+
+    train_columns: ClassVar[list[str]] = field(
+        default = [],
+        metadata={"help": "Train dataset columns to remove."}
+    )
+
+    eval_columns: ClassVar[list[str]] = field(
+        default = [],
+        metadata={"help": "Validation dataset columns to remove."}
+    )
+
+    train_buffer: Optional[int] = field(
+        default=10000,
+        metadata={"help": "Size of buffer used to shuffle streaming dataset."}
+    )
+
+    eval_buffer: Optional[int] = field(
+        default=1000,
+        metadata={"help": "Size of buffer used to shuffle streaming dataset."}
+    )
+
+    seed: Optional[int] = field(
+        default=42,
+        metadata={"help": "Random seed used for reproducibility."}
+    )
+
+    tokenizer_seq_length: Optional[int] = field(
+        default=512,
+        metadata={"help": "Sequence length used for tokenizing examples."}
+    )
+
+    select_input_string: Optional[str] = field(
+        default="content",
+        metadata={"help": "Select the key to use as the input string column."}
+    )
+
+    set_format: Optional[str] = field(
+        default="torch",
+        metadata={"help": "Convert the format to PyTorch tensors."}
+    )
+
+    batch_size: Optional[int] = field(
+        default=4,
+        metadata={"help": "Batch size for training and validation."}
+    )
+
+    save_to_path: Optional[str] = field(
+        default="''",
+        metadata={"help": "Save the dataset to local disk."}
+    )
+
+    """
+    Configuration for Weights and Biases.
+    """
+
+    use_wandb: bool = field(
+        default = False,
+        metadata = {'help': 'Whether to use Weights and Biases for logging'}
+    )
+
+    project_name: Optional[str] = field(
+        default="LaMDA pre-training",
+        metadata = {'help': 'Name of the project'}
+    )
\ No newline at end of file
diff --git a/config/dataloader_config.py b/config/dataloader_config.py
deleted file mode 100644
index 0c58cbc..0000000
--- a/config/dataloader_config.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from typing import Optional, ClassVar
-from dataclasses import dataclass, field
-
-@dataclass
-class DataLoaderArguments:
-    """
-    Configuration for data loader.
- """ - - train_dataset_name: Optional[str] = field( - default="", - metadata={"help": "Path to Hugging Face training dataset."} - ) - - eval_dataset_name: Optional[str] = field( - default="", - metadata={"help": "Path to Hugging Face validation dataset."} - ) - - choose_train_split: Optional[str] = field( - default="", - metadata={"help": "Choose Hugging Face training dataset split."} - ) - - choose_eval_split: Optional[str] = field( - default="", - metadata={"help": "Choose Hugging Face validation dataset split."} - ) - - train_columns: ClassVar[list[str]] = field( - default = [], - metadata={"help": "Train dataset columns to remove."} - ) - - eval_columns: ClassVar[list[str]] = field( - default = [], - metadata={"help": "Validation dataset columns to remove."} - ) - - train_buffer: Optional[int] = field( - default=10000, - metadata={"help": "Size of buffer used to shuffle streaming dataset."} - ) - - eval_buffer: Optional[int] = field( - default=1000, - metadata={"help": "Size of buffer used to shuffle streaming dataset."} - ) - - seed: Optional[int] = field( - default=42, - metadata={"help": "Random seed used for reproducibility."} - ) - - tokenizer_seq_length: Optional[int] = field( - default=512, - metadata={"help": "Sequence lengths used for tokenizing examples."} - ) - - select_input_string: Optional[str] = field( - default="content", - metadata={"help": "Select the key to used as the input string column."} - ) - - set_format: Optional[str] = field( - default="torch", - metadata={"help": "Convert the format to PyTorch Tensors"} - ) - - batch_size: Optional[int] = field( - default=4, - metadata={"help": "Batch size for training and validation."} - ) - - save_to_path: Optional[str] = field( - default="''", - metadata={"help": "Save the dataset to local disk."} - ) \ No newline at end of file diff --git a/train.py b/train.py index 9210549..bb02266 100644 --- a/train.py +++ b/train.py @@ -2,6 +2,7 @@ from torch import nn import colossalai +import wandb from config.config import CFG @@ -34,7 +35,10 @@ def LaMDA_Trainer(cfg: CFG): # optimizer function - optimizer = torch.optim.Adam(model.parameters(), lr = cfg.lr) + optimizer = torch.optim.Adam( + model.parameters(), + lr = cfg.lr + ) # initialze model, optimizer, criterion, and data loaders @@ -45,29 +49,42 @@ def LaMDA_Trainer(cfg: CFG): train_dataloader, test_dataloader ) + if cfg.use_wanb == True: - engine.train() - for step, batch in enumerate(train_dataloader): - inputs, labels = batch['inputs'].cuda(), batch['labels'].cuda() - - engine.zero_grad() - outputs = engine(inputs) - - train_loss = engine.loss_fn(outputs, labels) + # initialize Weights and Biases Logging + wandb.init(project = cfg.project_name) - engine.backward(train_loss) - engine.step() + engine.train() + for step, batch in enumerate(train_dataloader): - engine.eval() - for step, batch in enumerate(test_dataloader): inputs, labels = batch['inputs'].cuda(), batch['labels'].cuda() - - with torch.no_grad(): - outputs = engine(inputs) - test_loss = engine.loss_fn(outputs, labels) - engine.backward(test_loss) + engine.zero_grad() + outputs = engine(inputs) + + train_loss = engine.loss_fn(outputs, labels) + wandb.log({"train_loss": train_loss}) + + engine.backward(train_loss) engine.step() + wandb.log({"step": step}) + + engine.eval() + for step, batch in enumerate(test_dataloader): + inputs, labels = batch['inputs'].cuda(), batch['labels'].cuda() + + with torch.no_grad(): + outputs = engine(inputs) + test_loss = engine.loss_fn(outputs, labels) + wandb.log({"test_loss": 
test_loss}) + + engine.backward(test_loss) + engine.step() + + wandb.alert( + title = 'Training Complete', + text = "Training complete." + ) if __name__ == "__main__":
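
Below the diff, a minimal usage sketch (not part of the changeset) of how the consolidated `CFG` and the new Weights and Biases gating are expected to fit together, assuming the remaining model fields keep their defaults; the dataset name and split values are placeholders, and the logging calls mirror the ones added to `train.py`.

```python
# Minimal sketch: consuming the consolidated CFG (placeholder override values).
import wandb

from config.config import CFG

cfg = CFG(
    train_dataset_name = "user/dataset-name",  # placeholder dataset path
    choose_train_split = "train",              # placeholder split name
    batch_size = 4,
    use_wandb = True,
)

# Logging is gated on the new config flag, exactly as in train.py.
if cfg.use_wandb:
    wandb.init(project = cfg.project_name)
    wandb.log({"train_loss": 0.0})  # illustrative value only
```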