acikyazilimagi · metekemertas · Feb 14, 2023 · Feb 15, 2023 · Feb 15, 2023 · devrimcavusoglu
diff --git a/notebooks/training/bert_masked_lm.py b/notebooks/training/bert_masked_lm.py
@@ -0,0 +1,112 @@
+
+import argparse
+import json
+import numpy as np
+import torch as th
+
+from huggingface_hub import login
+from transformers import (AutoTokenizer, AutoModelForMaskedLM,
+                          Trainer, TrainingArguments)
+
+from utils.generic_utils import preprocess_tweet
+
+
+# Intent label names, kept in a fixed order.
+# NOTE(review): nothing else in this script references this list.
+LABEL_NAMES = [
+    'Alakasiz',
+    'Barinma',
+    'Elektronik',
+    'Giysi',
+    'Kurtarma',
+    'Lojistik',
+    'Saglik',
+    'Su',
+    'Guvenlik',
+    'Yemek',
+]
+
+
+class DepremTweetUnlabeledDataset(th.utils.data.Dataset):
+    """Unlabeled tweets, tokenized and randomly masked for masked-LM training.
+
+    Each item is a dict of tensors: everything the tokenizer returns, with
+    ``input_ids`` partially replaced by the mask token, plus a ``labels``
+    tensor that holds the original token id at every masked position and
+    ``-100`` everywhere else.
+
+    Args:
+        tweets: Sequence of already preprocessed tweet strings.
+        tokenizer: Callable tokenizer exposing ``mask_token_id`` and
+            ``all_special_ids``.
+        max_length: Length every sequence is padded/truncated to.
+        mlm_probability: Fraction of the maskable tokens to mask per item.
+    """
+
+    def __init__(self, tweets, tokenizer, max_length=64, mlm_probability=0.15):
+        self.tweets = tweets
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.mlm_probability = mlm_probability
+
+    def __len__(self):
+        return len(self.tweets)
+
+    def __getitem__(self, idx):
+        text = self.tweets[idx]
+        encoding = self.tokenizer(
+            text, max_length=self.max_length, padding="max_length", truncation=True)
+        encoding = {key: th.tensor(val) for key, val in encoding.items()}
+
+        input_ids = encoding["input_ids"]
+        # Labels live in their own tensor: aliasing `input_ids` would make the
+        # in-place masking below overwrite the targets as well.
+        # NOTE(review): -100 is presumably the index the model's loss ignores,
+        # so only masked positions contribute - confirm against the model docs.
+        labels = th.full_like(input_ids, -100)
+
+        # Only real tokens may be masked; padding and other special tokens
+        # ([CLS], [SEP], ...) are left untouched.
+        special_ids = set(self.tokenizer.all_special_ids)
+        candidates = [pos for pos, tok in enumerate(input_ids.tolist())
+                      if tok not in special_ids]
+        if candidates:
+            # At least one token is masked so that every item has a target.
+            num_to_mask = round(len(candidates) * self.mlm_probability)
+            num_to_mask = min(len(candidates), max(1, num_to_mask))
+            masked = np.random.choice(
+                candidates, size=num_to_mask, replace=False).tolist()
+            labels[masked] = input_ids[masked]
+            input_ids[masked] = self.tokenizer.mask_token_id
+
+        encoding["labels"] = labels
+        return encoding
+
+
+def prepare_datasets(json_path, tokenizer):
+    """Load tweets from a JSON dump and build train/validation MLM datasets.
+
+    The file must contain a list of objects with a ``full_text`` field. Tweets
+    are cleaned with ``preprocess_tweet`` (hashtags and handles are kept),
+    de-duplicated, shuffled with a fixed seed and split 85% / 15%.
+
+    Args:
+        json_path: Path of the JSON file to read (decoded as UTF-8).
+        tokenizer: Tokenizer handed to ``DepremTweetUnlabeledDataset``.
+
+    Returns:
+        Tuple ``(train_ds, val_ds)``.
+    """
+    # Explicit encoding: the platform default may not be UTF-8.
+    with open(json_path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+
+    cleaned = (preprocess_tweet(record['full_text'], remove_hashtags=False, remove_handles=False)
+               for record in records)
+    # Iteration order of a set of strings can change between interpreter runs,
+    # which would change the split; sort, then shuffle with a fixed seed so the
+    # split is both mixed and reproducible.
+    tweets = sorted(set(cleaned))  # Remove duplicates
+    np.random.RandomState(0).shuffle(tweets)
+    print("Number of tweets: {}".format(len(tweets)))
+    print("Sample tweets:")
+    if tweets:  # np.random.choice raises on an empty sequence
+        print(np.random.choice(tweets, 10))
+    print()
+
+    n_train = int(len(tweets) * 0.85)
+    train_ds = DepremTweetUnlabeledDataset(tweets[:n_train], tokenizer)
+    val_ds = DepremTweetUnlabeledDataset(tweets[n_train:], tokenizer)
+
+    return train_ds, val_ds
+
+
+def clean_dataset(dataset):
+    """Placeholder for dataset cleaning; always raises NotImplementedError."""
+    raise NotImplementedError()
+
+
+def main():
+    """Fine-tune a masked language model on the unlabeled tweet dump.
+
+    Parses the command line, logs in to the Hugging Face Hub with the given
+    token, builds the train/validation datasets from
+    ``postgres_public_feeds_entry.json`` and runs one training epoch.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model_name", type=str, default="dbmdz/bert-base-turkish-uncased")
+    cli.add_argument("--output_dir", type=str, default="./output-intent")
+    cli.add_argument("--hf_token", type=str, required=True)
+    args = cli.parse_args()
+
+    login(token=args.hf_token)
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+    train_ds, val_ds = prepare_datasets("postgres_public_feeds_entry.json", tokenizer)
+    # Note: the call above could be replaced with downloading the dataset
+    # from HF and preprocessing it, for example:
+    # train_ds = datasets.load_dataset("deprem-private/deprem_tweet_unlabeled", "plain_text")
+    # train_ds = clean_dataset(train_ds)
+
+    model = AutoModelForMaskedLM.from_pretrained(args.model_name)
+
+    # Evaluate and log every 1000 steps; checkpoints also follow the
+    # step-based strategy.
+    training_config = dict(
+        output_dir=args.output_dir,
+        evaluation_strategy="steps",
+        save_strategy="steps",
+        per_device_train_batch_size=32,
+        per_device_eval_batch_size=32,
+        weight_decay=0.01,
+        num_train_epochs=1,
+        eval_steps=1000,
+        logging_steps=1000,
+    )
+    trainer = Trainer(
+        model=model,
+        args=TrainingArguments(**training_config),
+        train_dataset=train_ds,
+        eval_dataset=val_ds,
+    )
+    trainer.train()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/notebooks/training/intent_bert_llrd.py b/notebooks/training/intent_bert_llrd.py
@@ -0,0 +1,175 @@
+
+import argparse
+import numpy as np
+import os
+import torch as th
+
+from collections import OrderedDict
+from huggingface_hub import login
+from sklearn.metrics import classification_report
+from transformers import (AdamW, AutoTokenizer, AutoModelForSequenceClassification,
+                          DataCollatorWithPadding, Trainer, TrainingArguments, EarlyStoppingCallback,
+                          get_cosine_schedule_with_warmup)
+
+from utils.generic_utils import set_seed_everywhere, select_thresholds, compute_f1
+from utils.dataset_utils import prep_datasets
+from utils.training_utils import ImbalancedTrainer, compute_class_weights
+
+
+# Class index -> intent label name; the position in the list is the index.
+# NOTE(review): spelling is mixed ('Barınma' has a dotless i, the others are
+# ASCII) - presumably these must match the dataset's label strings exactly.
+LABEL_IDX2NAME = OrderedDict(enumerate([
+    'Lojistik',
+    'Elektrik Kaynagi',
+    'Arama Ekipmani',
+    'Cenaze',
+    'Giysi',
+    'Enkaz Kaldirma',
+    'Isinma',
+    'Barınma',
+    'Tuvalet',
+    'Su',
+    'Yemek',
+    'Saglik',
+    'Alakasiz',
+]))
+
+# Process-wide environment flags, set at import time before any training code
+# runs.
+# NOTE(review): presumably these switch off the Weights & Biases integration
+# and the transformers advisory warnings - confirm against the library docs.
+os.environ["WANDB_DISABLED"] = "true"
+os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
+
+
+def get_optimizer_grouped_parameters(
+    model, model_type,
+    learning_rate, weight_decay,
+    layerwise_learning_rate_decay
+):
+    """Build optimizer parameter groups with layer-wise learning-rate decay.
+
+    The first group holds every parameter whose name contains ``classifier``
+    or ``pooler``; it uses ``learning_rate`` and no weight decay. The backbone
+    (``getattr(model, model_type)``) is then walked from its last encoder
+    layer down to the embeddings, and the learning rate is multiplied by
+    ``layerwise_learning_rate_decay`` once per step. Every layer contributes
+    two groups: one with ``weight_decay`` and one without (biases and
+    LayerNorm weights).
+
+    Args:
+        model: Module exposing ``named_parameters()`` and the backbone
+            attribute named by ``model_type``.
+        model_type: Attribute name of the backbone, e.g. ``'bert'``.
+        learning_rate: Learning rate of the task head.
+        weight_decay: Weight decay for the decayed groups.
+        layerwise_learning_rate_decay: Per-layer multiplicative LR factor.
+
+    Returns:
+        List of parameter-group dicts (``params``, ``weight_decay``, ``lr``).
+    """
+    no_decay = ("bias", "LayerNorm.weight")
+    # Task-specific layers keep the full learning rate.
+    head_params = [p for n, p in model.named_parameters()
+                   if "classifier" in n or "pooler" in n]
+    optimizer_grouped_parameters = [
+        {"params": head_params, "weight_decay": 0.0, "lr": learning_rate},
+    ]
+
+    backbone = getattr(model, model_type)
+    layers = [backbone.embeddings] + list(backbone.encoder.layer)
+    lr = learning_rate
+    # Top layer first, so lower layers end up with smaller learning rates.
+    for layer in reversed(layers):
+        lr *= layerwise_learning_rate_decay
+        decayed, not_decayed = [], []
+        # Single pass over the layer's parameters, split by name.
+        for name, param in layer.named_parameters():
+            if any(nd in name for nd in no_decay):
+                not_decayed.append(param)
+            else:
+                decayed.append(param)
+        optimizer_grouped_parameters += [
+            {"params": decayed, "weight_decay": weight_decay, "lr": lr},
+            {"params": not_decayed, "weight_decay": 0.0, "lr": lr},
+        ]
+    return optimizer_grouped_parameters
+
+
+def get_llrd_optimizer_scheduler(model, learning_rate=1e-5, weight_decay=0.01, layerwise_learning_rate_decay=0.95,
+                                 num_training_steps=15, num_warmup_steps=0):
+    """Create an AdamW optimizer with layer-wise LR decay and a cosine schedule.
+
+    Args:
+        model: Model with a ``bert`` backbone attribute.
+        learning_rate: Learning rate of the task head (top of the decay chain).
+        weight_decay: Weight decay for the decayed parameter groups.
+        layerwise_learning_rate_decay: Per-layer multiplicative LR factor.
+        num_training_steps: Length of the cosine schedule, in scheduler steps.
+            Defaults to the previously hard-coded 15.
+        num_warmup_steps: Number of warm-up steps; defaults to the previously
+            hard-coded 0.
+
+    Returns:
+        Tuple ``(optimizer, scheduler)``.
+    """
+    grouped_optimizer_params = get_optimizer_grouped_parameters(
+        model, 'bert',
+        learning_rate, weight_decay,
+        layerwise_learning_rate_decay
+    )
+    optimizer = AdamW(
+        grouped_optimizer_params,
+        lr=learning_rate,
+        eps=1e-6,
+        correct_bias=True
+    )
+    # NOTE(review): if the Trainer advances the scheduler once per optimizer
+    # update (not once per epoch), the default of 15 covers only the first 15
+    # updates. That would also explain the observation below, since a linear
+    # schedule would reach a learning rate of 0 after 15 updates. Callers
+    # should probably pass the real number of updates - confirm.
+    scheduler = get_cosine_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps
+    )
+    # Note: linear schedule fails to converge for unknown reasons.
+
+    return optimizer, scheduler
+
+
+def _logits_to_probs(logits):
+    """Convert a NumPy array of logits to sigmoid probabilities (NumPy array)."""
+    return th.sigmoid(th.from_numpy(logits)).numpy()
+
+
+def main():
+    """Train and evaluate the multi-label intent classifier for several seeds.
+
+    For every seed the model is fine-tuned with layer-wise LR decay, per-class
+    decision thresholds are selected on the train+validation predictions, and
+    the macro F1 plus a classification report are computed on the test split.
+    The mean and standard deviation of the test macro F1 are printed last.
+    """
+    # Define argparse for training parameters
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--n_seeds", type=int, default=1, help="Number of trials to run with different seeds")
+    parser.add_argument("--model_name", type=str, default="dbmdz/bert-base-turkish-uncased",
+                        help="Name or path of the model to use. For example, could be "
+                             "<path-to-BERT-finetuned-for-MLM-on-unlabelled-tweets>")
+    parser.add_argument("--output_dir", type=str, default="./output-intent")
+    parser.add_argument("--batch_size", type=int, default=32)
+    parser.add_argument("--hf_token", type=str, required=True)
+    parser.add_argument("--layerwise_LR_decay_rate", type=float, default=0.8)
+    args = parser.parse_args()
+
+    login(token=args.hf_token)
+    # NOTE(review): the tokenizer is loaded from a fixed checkpoint rather than
+    # args.model_name - presumably because a locally fine-tuned checkpoint may
+    # not ship tokenizer files; confirm before changing.
+    tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+    data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=100)
+    train_ds, val_ds, test_ds, mlb_labels = prep_datasets(
+        tokenizer,
+        labelidx2name=LABEL_IDX2NAME,
+        path="deprem-private/intent-v13")
+
+    f1s = []
+    for i in range(args.n_seeds):
+        set_seed_everywhere(i)
+        model = AutoModelForSequenceClassification.from_pretrained(
+            args.model_name, num_labels=len(LABEL_IDX2NAME), problem_type="multi_label_classification")
+
+        training_args = TrainingArguments(
+            output_dir=args.output_dir,
+            evaluation_strategy="epoch",
+            save_strategy="epoch",
+            per_device_train_batch_size=args.batch_size,
+            per_device_eval_batch_size=args.batch_size * 2,
+            report_to=None,
+            num_train_epochs=15,
+            metric_for_best_model="macro f1",
+            load_best_model_at_end=True,
+            group_by_length=True
+        )
+        optimizer, scheduler = get_llrd_optimizer_scheduler(
+            model,
+            learning_rate=5e-5,
+            weight_decay=0.01,  # Weight decay defined here instead of training_args
+            layerwise_learning_rate_decay=args.layerwise_LR_decay_rate)
+
+        trainer = ImbalancedTrainer(
+            class_weights=compute_class_weights(mlb_labels),
+            model=model,
+            args=training_args,
+            train_dataset=train_ds,
+            eval_dataset=val_ds,
+            data_collator=data_collator,
+            compute_metrics=compute_f1,
+            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+            optimizers=(optimizer, scheduler)
+        )
+        trainer.train()
+
+        # Choose the best thresholds per label using train+val data.
+        # The predictions are logits, while the thresholds are searched and
+        # later applied on the probability scale, so convert them first.
+        train_preds = trainer.predict(train_ds)
+        val_preds = trainer.predict(val_ds)
+        thresholds = select_thresholds(
+            np.concatenate([train_preds.label_ids, val_preds.label_ids]),
+            _logits_to_probs(np.concatenate([train_preds.predictions, val_preds.predictions]))
+        )
+        # Evaluate on test data
+        test_preds = trainer.predict(test_ds)
+        f1 = compute_f1((test_preds.predictions, test_preds.label_ids), thresholds=thresholds)
+        f1s.append(f1["macro f1"])
+        report = classification_report(
+            test_preds.label_ids.astype(int),
+            (_logits_to_probs(test_preds.predictions) > thresholds).astype(int),
+            target_names=list(LABEL_IDX2NAME.values()), digits=3)
+        print(report)
+
+    print("Mean F1: {:.2f}, Std F1: {:.2f}".format(np.mean(f1s) * 100, np.std(f1s) * 100))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/__init__.py b/utils/__init__.py
diff --git a/utils/dataset_utils.py b/utils/dataset_utils.py
@@ -0,0 +1,82 @@
+
+import numpy as np
+import pandas as pd
+import torch as th
+
+from datasets import load_dataset
+from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
+from sklearn.preprocessing import MultiLabelBinarizer
+from unicode_tr import unicode_tr
+
+
+def prep_datasets(tokenizer, labelidx2name, path, label_col="label", text_col="image_url"):
+    """Load the intent dataset and build train/validation/test datasets.
+
+    The ``train`` and ``test`` splits at ``path`` are loaded, the text column
+    is lowercased through ``unicode_tr``, rows without labels are dropped and
+    the training rows are split with a 5-fold multilabel-stratified K-fold
+    (fixed seed); fold 0 becomes the validation set.
+
+    Args:
+        tokenizer: Tokenizer handed to every ``IntentDataset``.
+        labelidx2name: Mapping from class index to label name.
+        path: Dataset identifier passed to ``load_dataset``.
+        label_col: Name of the column holding each row's list of label names.
+        text_col: Name of the column holding the text.
+
+    Returns:
+        Tuple ``(train_ds, val_ds, test_ds, mlb_labels)``. ``mlb_labels`` is
+        the binarized label matrix of all labeled training rows, i.e. it is
+        computed before the validation fold is split off.
+    """
+    intent = load_dataset(path, use_auth_token=True)
+    print(intent["train"], intent["test"])
+
+    # Print one normalized training text as a quick sanity check.
+    for instance in intent["train"]:
+        print(unicode_tr(instance[text_col]).lower())
+        break
+
+    df_train = pd.DataFrame.from_records(list(intent["train"]))
+    df_test = pd.DataFrame.from_records(list(intent["test"]))
+
+    df_train[text_col] = df_train[text_col].apply(lambda x: unicode_tr(x).lower())
+    df_test[text_col] = df_test[text_col].apply(lambda x: unicode_tr(x).lower())
+
+    # Next, we remove the rows that have no labels
+    df_train = df_train[df_train[label_col].notnull()].reset_index(drop=True)
+    df_test = df_test[df_test[label_col].notnull()].reset_index(drop=True)
+
+    mlb = MultiLabelBinarizer(classes=list(labelidx2name.values()))
+    mlb_labels = mlb.fit_transform(df_train[label_col].tolist())
+
+    cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+
+    # Tag every row with the fold in which it is a validation row; the index
+    # was reset above, so positional fold indices are valid row labels.
+    for fold, (train_idx, val_idx) in enumerate(cv.split(df_train.index.tolist(), mlb_labels)):
+        df_train.loc[val_idx, 'kfold'] = int(fold)
+
+    df_train, df_val = df_train[df_train['kfold'] != 0], df_train[df_train['kfold'] == 0]
+
+    train_ds = IntentDataset(df_train, tokenizer, labelidx2name, label_col, text_col)
+    val_ds = IntentDataset(df_val, tokenizer, labelidx2name, label_col, text_col)
+    test_ds = IntentDataset(df_test, tokenizer, labelidx2name, label_col, text_col)
+
+    return train_ds, val_ds, test_ds, mlb_labels
+
+
+class IntentDataset(th.utils.data.Dataset):
+    """Torch dataset that yields tokenized texts with multi-hot label vectors.
+
+    Args:
+        df: DataFrame with one row per example.
+        tokenizer: Callable tokenizer returning a mapping of lists.
+        labelidx2name: Mapping from class index to label name.
+        label_col: Column holding each row's iterable of label names; also the
+            key under which the encoded labels are returned.
+        text_col: Column holding the text to tokenize.
+    """
+
+    def __init__(self, df, tokenizer, labelidx2name, label_col="label", text_col="image_url"):
+        self.df = df
+        self.tokenizer = tokenizer
+        self.labelidx2name = labelidx2name
+        self.name2ix = {v: k for k, v in labelidx2name.items()}
+        self.num_classes = len(labelidx2name)
+        self.label_col = label_col
+        self.text_col = text_col
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        row = self.df.iloc[idx]
+        # Read the text from the configured column instead of a fixed one.
+        text, label = row[self.text_col], self._encode_label(row[self.label_col])
+        encoding = self.tokenizer(text, max_length=100, padding="max_length", truncation=True)
+        encoding = {key: th.tensor(val, dtype=th.int64) for key, val in encoding.items()}
+        encoding[self.label_col] = th.tensor(label, dtype=th.float32)
+        return dict(encoding)
+
+    def _encode_label(self, labels):
+        """Return a multi-hot vector with 1.0 at the index of every given label name.
+
+        Raises:
+            KeyError: If a label name is not part of ``labelidx2name``.
+        """
+        encoded_labels = np.zeros(self.num_classes)
+        for label in labels:
+            encoded_labels[self.name2ix[label]] = 1.0
+        return encoded_labels
diff --git a/utils/generic_utils.py b/utils/generic_utils.py
@@ -0,0 +1,67 @@
+
+import numpy as np
+import random
+import re
+import torch as th
+
+from sklearn.metrics import classification_report, f1_score
+
+
+def set_seed_everywhere(seed):
+    """Seed Python's ``random``, NumPy and PyTorch (plus CUDA when available)."""
+    random.seed(seed)
+    np.random.seed(seed)
+    th.manual_seed(seed)
+    if th.cuda.is_available():
+        th.cuda.manual_seed_all(seed)
+
+
+def compute_f1(eval_pred, thresholds=None):
+    """Compute micro and macro F1 from logits and multi-hot labels.
+
+    Args:
+        eval_pred: Pair ``(logits, labels)``; ``logits`` is a NumPy array.
+        thresholds: Decision thresholds applied to the sigmoid probabilities.
+            When ``None`` they are selected with ``select_thresholds`` on the
+            very data being scored.
+
+    Returns:
+        Dict with the keys ``"micro f1"`` and ``"macro f1"``.
+    """
+    logits, labels = eval_pred
+    probs = th.from_numpy(logits).sigmoid().numpy()
+    cutoffs = select_thresholds(labels, probs) if thresholds is None else thresholds
+    predictions = (probs > cutoffs).astype(int)
+    report = classification_report(labels, predictions, zero_division=0, output_dict=True)
+    micro_f1 = report["micro avg"]["f1-score"]
+    macro_f1 = report["macro avg"]["f1-score"]
+    return {"micro f1": micro_f1, "macro f1": macro_f1}
+
+
+def select_thresholds(eval_labels, eval_probs, search_range=(0.3, 0.7), metric="macro"):
+    """Select the best decision threshold for each class based on the F1 score.
+
+    For every column, candidate thresholds from ``lower`` (inclusive) towards
+    ``upper`` in steps of 0.01 are tried and the first one with the highest
+    score is kept.
+
+    Args:
+        eval_labels: 2-D array of 0/1 labels, one column per class.
+        eval_probs: 2-D array of scores on the same scale as the thresholds
+            (probabilities); indexed the same way as ``eval_labels``.
+        search_range: ``(lower, upper)`` with ``0 < lower < upper < 1``.
+        metric: ``average`` argument forwarded to ``f1_score`` for each column.
+
+    Returns:
+        1-D NumPy array with one threshold per class.
+
+    Raises:
+        ValueError: If ``search_range`` is not inside (0, 1) or is empty.
+    """
+    lower, upper = search_range
+    # Explicit check instead of `assert`, which is stripped under `python -O`.
+    if not 0 < lower < upper < 1:
+        raise ValueError(
+            "search_range must satisfy 0 < lower < upper < 1, got {!r}".format(search_range))
+
+    # The candidate grid is identical for every class, so build it once.
+    candidate_thresholds = np.arange(lower, upper, .01)
+    best_thresholds_per_class = []
+    for i in range(eval_labels.shape[1]):
+        scores = [
+            f1_score(
+                eval_labels[:, i],
+                (eval_probs[:, i] > threshold).astype(int),
+                average=metric,
+                zero_division=0)  # same value as the default, without the warning
+            for threshold in candidate_thresholds
+        ]
+        best_thresholds_per_class.append(candidate_thresholds[np.argmax(scores)])
+
+    return np.array(best_thresholds_per_class)
+
+
+def preprocess_tweet(tweet, remove_hashtags=False, remove_handles=False):
+    """Clean a tweet: strip URLs and non-word characters, collapse whitespace.
+
+    Use with caution: removing hashtags and handles _may_ reduce model
+    performance.
+
+    Args:
+        tweet: Raw tweet text.
+        remove_hashtags: Drop ``#hashtag`` tokens entirely. When False only the
+            ``#`` is removed (it is a non-word character) and the word stays.
+        remove_handles: Drop ``@handle`` tokens entirely. When False only the
+            ``@`` is removed and the name stays.
+
+    Returns:
+        The cleaned text with single spaces and no leading/trailing whitespace.
+    """
+    if remove_hashtags:
+        tweet = re.sub(r'#\w+', '', tweet)
+    if remove_handles:
+        tweet = re.sub(r'@\w+', '', tweet)
+
+    # remove urls
+    tweet = re.sub(r'http\S+', '', tweet)
+    # Require a word boundary and a literal dot so that ordinary words which
+    # merely contain "www" (e.g. "awwww") are left intact.
+    tweet = re.sub(r'\bwww\.\S+', '', tweet)
+    # The dot is escaped so that only the literal "pic.twitter" matches.
+    tweet = re.sub(r'pic\.twitter\S+', '', tweet)
+
+    tweet = re.sub(r'\W', ' ', tweet)  # remove special characters
+    tweet = re.sub(r'\s+', ' ', tweet)  # remove multiple whitespaces
+
+    return tweet.strip()
diff --git a/utils/training_utils.py b/utils/training_utils.py
@@ -0,0 +1,72 @@
+
+import numpy as np
+import torch as th
+
+from transformers import Trainer
+
+
+def compute_class_weights(mlb_labels):
+    """Derive per-class weights from a binarized label matrix.
+
+    Each weight is the cube root of the class's inverse occurrence ratio,
+    normalized so that the smallest ratio is 1. A class without any positive
+    entry leads to a division by zero.
+
+    Args:
+        mlb_labels: 2-D array with one column per class.
+
+    Returns:
+        Dict mapping column index to weight.
+    """
+    total_positives = mlb_labels.sum()
+    positives_per_class = mlb_labels.sum(axis=0)
+    inverse_ratio = total_positives / positives_per_class
+    normalized = inverse_ratio / inverse_ratio.min()
+    dampened = np.power(normalized, 1 / 3)
+
+    class_indices = np.arange(mlb_labels.shape[1])
+    return dict(zip(class_indices, dampened))
+
+
+class ImbalancedTrainer(Trainer):
+    """Trainer that replaces the model's own loss with a class-weighted focal loss.
+
+    Args:
+        class_weights: Mapping whose values are the per-class positive
+            weights, in class order.
+        *args, **kwargs: Forwarded unchanged to ``Trainer``.
+    """
+
+    def __init__(self, class_weights,  *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # You pass the class weights when instantiating the Trainer.
+        # They are placed on the device chosen by the training arguments
+        # instead of unconditionally on CUDA, so CPU-only hosts are not
+        # rejected here.
+        weights = th.tensor(list(class_weights.values()), dtype=th.float32)
+        self.class_weights = weights.to(self.args.device)
+
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        """
+        How the loss is computed by Trainer. When label smoothing is active the
+        smoother is used; otherwise the focal loss is computed from the logits
+        and ``inputs['labels']``.
+
+        Extra keyword arguments are accepted and ignored.
+        NOTE(review): newer Trainer versions presumably pass additional
+        keywords such as ``num_items_in_batch`` - confirm against the installed
+        transformers version.
+        """
+        if self.label_smoother is not None and "labels" in inputs:
+            labels = inputs.pop("labels")
+        else:
+            labels = None
+        outputs = model(**inputs)
+        # Save past state if it exists
+        # TODO: this needs to be fixed and made cleaner later.
+        if self.args.past_index >= 0:
+            self._past = outputs[self.args.past_index]
+
+        if labels is not None:
+            loss = self.label_smoother(outputs, labels)
+        else:
+            # The loss returned by the model is ignored on purpose: the
+            # class-weighted focal loss is computed from the logits instead.
+            logits = outputs['logits']
+            criterion = FocalLoss(self.class_weights)
+            loss = criterion(logits, inputs['labels'])
+
+        return (loss, outputs) if return_outputs else loss
+
+
+class FocalLoss(th.nn.Module):
+    """Focal loss on top of a positively weighted BCE-with-logits loss.
+
+    Args:
+        pos_weight: Tensor of per-class positive weights.
+        alpha: Constant factor applied to the loss.
+        gamma: Focusing exponent applied to ``1 - pt``.
+        reduction: ``'mean'`` or ``'sum'``; any other value returns the
+            unreduced element-wise loss.
+    """
+
+    def __init__(self, pos_weight, alpha=0.1, gamma=2., reduction='mean'):
+        super(FocalLoss, self).__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.reduction = reduction
+        # Kept on its current device; it is moved next to the inputs in
+        # forward(), so the loss also works without CUDA.
+        self.pos_weight = pos_weight
+
+    def forward(self, inputs, targets):
+        """Return the focal loss of logits ``inputs`` against ``targets``."""
+        pos_weight = self.pos_weight.to(inputs.device)
+        BCE_loss = th.nn.BCEWithLogitsLoss(reduction='none', pos_weight=pos_weight)(inputs, targets)
+        # exp(-BCE) is used as the probability term pt of the focal factor.
+        pt = th.exp(-BCE_loss)
+        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
+
+        if self.reduction == 'mean':
+            return F_loss.mean()
+        elif self.reduction == 'sum':
+            return F_loss.sum()
+        else:
+            return F_loss