Mlm and llrd #6

Open
wants to merge 3 commits into main
112 changes: 112 additions & 0 deletions notebooks/training/bert_masked_lm.py
@@ -0,0 +1,112 @@

import argparse
import json
import numpy as np
import torch as th

from huggingface_hub import login
from transformers import (AutoTokenizer, AutoModelForMaskedLM,
Trainer, TrainingArguments)

from utils.generic_utils import preprocess_tweet


LABEL_NAMES = [
'Alakasiz', 'Barinma', 'Elektronik',
'Giysi', 'Kurtarma', 'Lojistik', 'Saglik',
'Su', 'Guvenlik', 'Yemek']


class DepremTweetUnlabeledDataset(th.utils.data.Dataset):

def __init__(self, tweets, tokenizer):
self.tweets = tweets
self.tokenizer = tokenizer

def __len__(self):
return len(self.tweets)

def __getitem__(self, idx):
text = self.tweets[idx]
encoding = self.tokenizer(text, max_length=64, padding="max_length", truncation=True)
encoding = {key: th.tensor(val) for key, val in encoding.items()}
encoding["labels"] = encoding["input_ids"]

inp = encoding["input_ids"]
tokens = range(len(inp))
# Select 15% of the token positions at random; note this runs over the full padded sequence, so special and padding tokens can also be masked
num_of_token_to_mask = round(len(tokens) * 0.15)
token_to_mask = np.random.choice(np.array(tokens),
size=num_of_token_to_mask,
replace=False).tolist()
# Now we have the indices where we need to mask the tokens
inp[token_to_mask] = self.tokenizer.mask_token_id
encoding["input_ids"] = inp

return dict(encoding)
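For reference, transformers ships a collator that performs similar 15% random masking while skipping special tokens and excluding unmasked positions from the loss; a minimal sketch, not wired into this script:

from transformers import DataCollatorForLanguageModeling
# Selects ~15% of non-special tokens for prediction and sets labels to -100 elsewhere.
mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
# It could be passed to the Trainer below via data_collator=mlm_collator, with the
# dataset returning plain tokenized inputs instead of doing the masking itself.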


def prepare_datasets(json_path, tokenizer):
with open(json_path, 'r') as f:
json_ = json.load(f)

tweets = [preprocess_tweet(tweet['full_text'], remove_hashtags=False, remove_handles=False)
for tweet in json_]
tweets = list(set(tweets)) # Remove duplicates
print("Number of tweets: {}".format(len(tweets)))
print("Sample tweets:")
print(np.random.choice(tweets, 10))
print()

n_train = int(len(tweets) * 0.85)
train_ds = DepremTweetUnlabeledDataset(tweets[:n_train], tokenizer)
val_ds = DepremTweetUnlabeledDataset(tweets[n_train:], tokenizer)

return train_ds, val_ds


def clean_dataset(dataset):
raise NotImplementedError


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="dbmdz/bert-base-turkish-uncased")
parser.add_argument("--output_dir", type=str, default="./output-intent")
parser.add_argument("--hf_token", type=str, required=True)
args = parser.parse_args()

login(token=args.hf_token)

tokenizer = AutoTokenizer.from_pretrained(args.model_name)
train_ds, val_ds = prepare_datasets(
"postgres_public_feeds_entry.json", tokenizer)
# Note: the line above could be replaced by downloading the dataset from the
# HF Hub and preprocessing it (requires `import datasets`), for example:
# train_ds = datasets.load_dataset("deprem-private/deprem_tweet_unlabeled", "plain_text")
# train_ds = clean_dataset(train_ds)

model = AutoModelForMaskedLM.from_pretrained(args.model_name)

training_args = TrainingArguments(
output_dir=args.output_dir,
evaluation_strategy="steps",
save_strategy="steps",
per_device_train_batch_size=32,
per_device_eval_batch_size=32,
weight_decay=0.01,
num_train_epochs=1,
eval_steps=1000,
logging_steps=1000
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_ds,
eval_dataset=val_ds
)
trainer.train()


if __name__ == '__main__':
main()
175 changes: 175 additions & 0 deletions notebooks/training/intent_bert_llrd.py
@@ -0,0 +1,175 @@

import argparse
import numpy as np
import os
import torch as th

from collections import OrderedDict
from huggingface_hub import login
from sklearn.metrics import classification_report
from transformers import (AdamW, AutoTokenizer, AutoModelForSequenceClassification,
DataCollatorWithPadding, Trainer, TrainingArguments, EarlyStoppingCallback,
get_cosine_schedule_with_warmup)

from utils.generic_utils import set_seed_everywhere, select_thresholds, compute_f1
from utils.dataset_utils import prep_datasets
from utils.training_utils import ImbalancedTrainer, compute_class_weights


LABEL_IDX2NAME = OrderedDict([
(0, 'Lojistik'),
(1, 'Elektrik Kaynagi'),
(2, 'Arama Ekipmani'),
(3, 'Cenaze'),
(4, 'Giysi'),
(5, 'Enkaz Kaldirma'),
(6, 'Isinma'),
(7, 'Barınma'),
(8, 'Tuvalet'),
(9, 'Su'),
(10, 'Yemek'),
(11, 'Saglik'),
(12, 'Alakasiz')])

os.environ["WANDB_DISABLED"] = "true"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'


def get_optimizer_grouped_parameters(
model, model_type,
learning_rate, weight_decay,
layerwise_learning_rate_decay
):
no_decay = ["bias", "LayerNorm.weight"]
# initialize lr for task specific layer
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if "classifier" in n or "pooler" in n],
"weight_decay": 0.0,
"lr": learning_rate,
},
]
# initialize lrs for every layer
num_layers = model.config.num_hidden_layers
layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
layers.reverse()
lr = learning_rate
for layer in layers:
lr *= layerwise_learning_rate_decay
optimizer_grouped_parameters += [
{
"params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
"lr": lr,
},
{
"params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
"lr": lr,
},
]
return optimizer_grouped_parameters


def get_llrd_optimizer_scheduler(model, learning_rate=1e-5, weight_decay=0.01, layerwise_learning_rate_decay=0.95):
grouped_optimizer_params = get_optimizer_grouped_parameters(
model, 'bert',
learning_rate, weight_decay,
layerwise_learning_rate_decay
)
optimizer = AdamW(
grouped_optimizer_params,
lr=learning_rate,
eps=1e-6,
correct_bias=True
)
scheduler = get_cosine_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=15
)
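# Note: num_training_steps above is counted in optimizer update steps, not epochs.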
# Note: linear schedule fails to converge for unknown reasons.

return optimizer, scheduler
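As a quick sanity check of the decay schedule, the per-group learning rates can be inspected directly (illustrative only; model is any BERT-based AutoModelForSequenceClassification instance):

optimizer, _ = get_llrd_optimizer_scheduler(model, learning_rate=5e-5, layerwise_learning_rate_decay=0.8)
# The classifier/pooler group keeps the base LR; each encoder layer below it is scaled
# by a further factor of 0.8, down to the embeddings.
for group in optimizer.param_groups:
    print(group["lr"], group["weight_decay"])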


def main():
# Define argpars for training parameters
parser = argparse.ArgumentParser()
parser.add_argument("--n_seeds", type=int, default=1, help="Number of trials to run with different seeds")
parser.add_argument("--model_name", type=str, default="dbmdz/bert-base-turkish-uncased",
help="Name or path of the model to use. For example, could be"
"<path-to-BERT-finetuned-for-MLM-on-unlabelled-tweets>")
parser.add_argument("--output_dir", type=str, default="./output-intent")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--hf_token", type=str, required=True)
parser.add_argument("--layerwise_LR_decay_rate", type=float, default=0.8)
args = parser.parse_args()

login(token=args.hf_token)
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=100)
train_ds, val_ds, test_ds, mlb_labels = prep_datasets(
tokenizer,
labelidx2name=LABEL_IDX2NAME,
path="deprem-private/intent-v13")

f1s = []
for i in range(args.n_seeds):
set_seed_everywhere(i)
model = AutoModelForSequenceClassification.from_pretrained(
args.model_name, num_labels=len(LABEL_IDX2NAME), problem_type="multi_label_classification")

training_args = TrainingArguments(
output_dir=args.output_dir,
evaluation_strategy="epoch",
save_strategy="epoch",
per_device_train_batch_size=args.batch_size,
per_device_eval_batch_size=args.batch_size * 2,
report_to=None,
num_train_epochs=15,
metric_for_best_model="macro f1",
load_best_model_at_end=True,
group_by_length=True
)
optimizer, scheduler = get_llrd_optimizer_scheduler(
model,
learning_rate=5e-5,
weight_decay=0.01, # Weight decay defined here instead of training_args
layerwise_learning_rate_decay=args.layerwise_LR_decay_rate)

trainer = ImbalancedTrainer(
class_weights=compute_class_weights(mlb_labels),
model=model,
args=training_args,
train_dataset=train_ds,
eval_dataset=val_ds,
data_collator=data_collator,
compute_metrics=compute_f1,
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
optimizers=(optimizer, scheduler)
)
trainer.train()

# Choose the best thresholds per label using train+val data
train_preds = trainer.predict(train_ds)
val_preds = trainer.predict(val_ds)
thresholds = select_thresholds(
np.concatenate([train_preds.label_ids, val_preds.label_ids]),
np.concatenate([train_preds.predictions, val_preds.predictions])
)
# Evaluate on test data
test_preds = trainer.predict(test_ds)
f1 = compute_f1((test_preds.predictions, test_preds.label_ids), thresholds=thresholds)
f1s.append(f1["macro f1"])
report = classification_report(
test_preds.label_ids.astype(int),
(th.sigmoid(th.from_numpy(test_preds.predictions)).numpy() > thresholds).astype(int),
target_names=LABEL_IDX2NAME.values(), digits=3)
print(report)

print("Mean F1: {:.2f}, Std F1: {:.2f}".format(np.mean(f1s) * 100, np.std(f1s) * 100))


if __name__ == '__main__':
main()
Empty file added utils/__init__.py
Empty file.
82 changes: 82 additions & 0 deletions utils/dataset_utils.py
@@ -0,0 +1,82 @@

import numpy as np
import pandas as pd
import torch as th

from datasets import load_dataset
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from unicode_tr import unicode_tr


def prep_datasets(tokenizer, labelidx2name, path, label_col="label", text_col="image_url"):
Collaborator:
There is no need for the labelidx2name argument; datasets.Dataset already keeps this label schema internally under the hood, so we can use it directly. It is shown in the usage section of the README on HF.

# to convert from id to string
print(dataset["train"].features["label"].int2str(tweet["label"]))
# to convert from string to id
print(dataset["train"].features["label"].str2int(tweet["label_name"]))

intent = load_dataset(path, use_auth_token=True)
print(intent["train"], intent["test"])

for instance in intent["train"]:
print(unicode_tr(instance["image_url"]).lower())
break

df_train = pd.DataFrame().from_records(list(intent["train"]))
df_test = pd.DataFrame().from_records(list(intent["test"]))

df_train[text_col] = df_train[text_col].apply(lambda x: unicode_tr(x).lower())
df_test[text_col] = df_test[text_col].apply(lambda x: unicode_tr(x).lower())
Collaborator:
When calling lower() we need to watch out for the Turkish ı/i casing.
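
A quick illustration of the issue (assuming the behaviour of the unicode_tr package used above):

# Python's built-in lower() maps 'I' to 'i'; Turkish lowercases 'I' to dotless 'ı'.
print("ISPARTA".lower())              # 'isparta' (wrong for Turkish)
print(unicode_tr("ISPARTA").lower())  # 'ısparta' (Turkish-aware casing)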


# Next, we remove the rows that have no labels
df_train = df_train[df_train[label_col].notnull()].reset_index(drop=True)
df_test = df_test[df_test[label_col].notnull()].reset_index(drop=True)

# df_train.labels.apply(lambda x: len(x))
#
# labels = set()
# for label in df_train.labels.values:
# labels.update({l for l in label})
#
# name2ix = {v: k for k, v in labelidx2name.items()}
# labels = name2ix.keys()

mlb = MultiLabelBinarizer(classes=list(labelidx2name.values()))
mlb_labels = mlb.fit_transform(df_train.label.tolist())

cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(cv.split(df_train.index.tolist(), mlb_labels)):
df_train.loc[val_idx, 'kfold'] = int(fold)

df_train, df_val = df_train[df_train['kfold'] != 0], df_train[df_train['kfold'] == 0]

train_ds = IntentDataset(df_train, tokenizer, labelidx2name, label_col, text_col)
val_ds = IntentDataset(df_val, tokenizer, labelidx2name, label_col, text_col)
test_ds = IntentDataset(df_test, tokenizer, labelidx2name, label_col, text_col)

return train_ds, val_ds, test_ds, mlb_labels


class IntentDataset(th.utils.data.Dataset):

def __init__(self, df, tokenizer, labelidx2name, label_col="label", text_col="image_url"):
self.df = df
self.tokenizer = tokenizer
self.labelidx2name = labelidx2name
self.name2ix = {v: k for k, v in labelidx2name.items()}
self.num_classes = len(labelidx2name)
self.label_col = label_col
self.text_col = text_col

def __len__(self):
return len(self.df)

def __getitem__(self, idx):
row = self.df.iloc[idx]
text, label = row[self.text_col], self._encode_label(row[self.label_col])
encoding = self.tokenizer(text, max_length=100, padding="max_length", truncation=True)
encoding = {key: th.tensor(val, dtype=th.int64) for key, val in encoding.items()}
encoding[self.label_col] = th.tensor(label, dtype=th.float32)
return dict(encoding)

def _encode_label(self, labels):
encoded_labels = np.zeros(self.num_classes)
for label in labels:
encoded_labels[self.name2ix[label]] = 1.0
return encoded_labels
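
For example, with the 13-class LABEL_IDX2NAME map defined in intent_bert_llrd.py, a row labelled ['Su', 'Yemek'] maps to a multi-hot vector with ones at indices 9 and 10:

# _encode_label(['Su', 'Yemek'])
# -> array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.])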