diff --git a/bert_pytorch/trainer/pretrain.py b/bert_pytorch/trainer/pretrain.py
index 0b882dd..d2df86f 100644
--- a/bert_pytorch/trainer/pretrain.py
+++ b/bert_pytorch/trainer/pretrain.py
@@ -59,8 +59,11 @@ def __init__(self, bert: BERT, vocab_size: int,
         self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
 
         # Using Negative Log Likelihood Loss function for predicting the masked_token
-        self.criterion = nn.NLLLoss(ignore_index=0)
-
+        self.criterion_mask_lm = nn.NLLLoss(ignore_index=0)
+
+        # Using Negative Log Likelihood Loss function for predicting the is_next
+        self.criterion_is_next = nn.NLLLoss()
+
         self.log_freq = log_freq
 
         print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
@@ -102,10 +105,10 @@ def iteration(self, epoch, data_loader, train=True):
             next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
 
             # 2-1. NLL(negative log likelihood) loss of is_next classification result
-            next_loss = self.criterion_is_next(next_sent_output, data["is_next"])
+            next_loss = self.criterion_is_next(next_sent_output, data["is_next"])
 
             # 2-2. NLLLoss of predicting masked token word
-            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
+            mask_loss = self.criterion_mask_lm(mask_lm_output.transpose(1, 2), data["bert_label"])
 
             # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
             loss = next_loss + mask_loss
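A note on why splitting the criterion matters (my reading of the change, not stated in the diff): the old shared `nn.NLLLoss(ignore_index=0)` is appropriate for the masked-LM head, where label 0 marks padding/non-masked positions, but the `is_next` head uses label 0 to mean "not next", so those samples were being silently dropped from `next_loss`. A minimal sketch illustrating the difference, with made-up tensor values for illustration only:

```python
import torch
import torch.nn as nn

# Toy is_next targets: 0 = "not next", 1 = "is next" (values are illustrative).
is_next_target = torch.tensor([0, 1])
log_probs = torch.log_softmax(torch.randn(2, 2), dim=-1)

# Old shared criterion: ignore_index=0 drops every "not next" sample,
# so only the is_next == 1 example contributes to the loss.
shared_criterion = nn.NLLLoss(ignore_index=0)
print(shared_criterion(log_probs, is_next_target))

# New criterion_is_next: a plain NLLLoss keeps both classes in the loss.
is_next_criterion = nn.NLLLoss()
print(is_next_criterion(log_probs, is_next_target))
```

The masked-LM head keeps `ignore_index=0`, presumably because `bert_label` is 0 at unmasked positions, and those positions should not contribute to `mask_loss`.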