# data_loader.py

import copy
import json
import logging
import os

import torch
from torch.utils.data import TensorDataset

from utils import get_slot_labels


# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


# class InputExample(object):
#     """
#     A single training/test example for simple sequence classification.

#     Args:
#         guid: Unique id for the example.
#         words: list. The words of the sequence.
#         intent_label: (Optional) string. The intent label of the example.
#         slot_labels: (Optional) list. The slot labels of the example.
#     """

#     def __init__(self, guid, words, intent_label=None, slot_labels=None):
#         self.guid = guid
#         self.words = words
#         self.intent_label = intent_label
#         self.slot_labels = slot_labels

#     def __repr__(self):
#         return str(self.to_json_string())

#     def to_dict(self):
#         """Serializes this instance to a Python dictionary."""
#         output = copy.deepcopy(self.__dict__)
#         return output

#     def to_json_string(self):
#         """Serializes this instance to a JSON string."""
#         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class InputExample:
    """
    Container for one raw example of the sequence-labelling data set.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        slot_labels: (Optional) list. The slot labels of the example.
    """

    def __init__(self, guid, words, slot_labels=None):
        self.guid = guid
        self.words = words
        self.slot_labels = slot_labels

    def to_dict(self):
        """Return a deep copy of the instance attributes as a plain dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        payload = self.to_dict()
        serialized = json.dumps(payload, indent=2, sort_keys=True)
        return serialized + "\n"

    def __repr__(self):
        # The JSON dump doubles as the debugging representation.
        return self.to_json_string()


# class InputFeatures(object):
#     """A single set of features of data."""

#     def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
#         self.input_ids = input_ids
#         self.attention_mask = attention_mask
#         self.token_type_ids = token_type_ids
#         self.intent_label_id = intent_label_id
#         self.slot_labels_ids = slot_labels_ids

#     def __repr__(self):
#         return str(self.to_json_string())

#     def to_dict(self):
#         """Serializes this instance to a Python dictionary."""
#         output = copy.deepcopy(self.__dict__)
#         return output

#     def to_json_string(self):
#         """Serializes this instance to a JSON string."""
#         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class InputFeatures:
    """Model-ready features for one example (ids, masks and slot label ids)."""

    def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.slot_labels_ids = slot_labels_ids

    def to_dict(self):
        """Return a deep copy of the instance attributes as a plain dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        payload = self.to_dict()
        serialized = json.dumps(payload, indent=2, sort_keys=True)
        return serialized + "\n"

    def __repr__(self):
        # The JSON dump doubles as the debugging representation.
        return self.to_json_string()


# class JointProcessor(object):
#     """Processor for the JointBERT data set """

#     def __init__(self, args):
#         self.args = args
#         self.intent_labels = get_intent_labels(args)
#         self.slot_labels = get_slot_labels(args)

#         self.input_text_file = "seq.in"
#         self.intent_label_file = "label"
#         self.slot_labels_file = "seq.out"

#     @classmethod
#     def _read_file(cls, input_file, quotechar=None):
#         """Reads a tab separated value file."""
#         with open(input_file, "r", encoding="utf-8") as f:
#             lines = []
#             for line in f:
#                 lines.append(line.strip())
#             return lines

#     def _create_examples(self, texts, intents, slots, set_type):
#         """Creates examples for the training and dev sets."""
#         examples = []
#         for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
#             guid = "%s-%s" % (set_type, i)
#             # 1. input_text
#             words = text.split()  # Some are spaced twice
#             # 2. intent
#             intent_label = (
#                 self.intent_labels.index(intent) if intent in self.intent_labels else self.intent_labels.index("UNK")
#             )
#             # 3. slot
#             slot_labels = []
#             for s in slot.split():
#                 slot_labels.append(
#                     self.slot_labels.index(s) if s in self.slot_labels else self.slot_labels.index("UNK")
#                 )

#             assert len(words) == len(slot_labels)
#             examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
#         return examples

#     def get_examples(self, mode):
#         """
#         Args:
#             mode: train, dev, test
#         """
#         data_path = os.path.join(self.args.data_dir, self.args.token_level, mode)
#         logger.info("LOOKING AT {}".format(data_path))
#         return self._create_examples(
#             texts=self._read_file(os.path.join(data_path, self.input_text_file)),
#             intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
#             slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
#             set_type=mode,
#         )
class Processor(object):
    """Processor that turns JSON-lines data files into ``InputExample`` objects.

    The data file for a given mode is located at
    ``<args.data_dir>/<args.token_level>/<mode>_word.json``. Every non-blank
    line of that file must be a JSON object with a ``"words"`` list and a
    ``"tags"`` list.

    Attributes:
        args: The argument namespace passed to the constructor.
        slot_labels: Result of ``get_slot_labels(args)``; used as a sequence
            of label strings whose position is the label id.
        input_text_slot_labels_file: File-name suffix appended to the mode.
    """

    def __init__(self, args):
        self.args = args
        self.slot_labels = get_slot_labels(args)

        self.input_text_slot_labels_file = "_word.json"

    @classmethod
    def _read_file(cls, input_file, quotechar=None):
        """Read a file where each line is a JSON dict describing one example.

        Args:
            input_file: Path of the UTF-8 encoded JSON-lines file.
            quotechar: Unused; kept so existing callers keep working.

        Returns:
            A pair ``(all_words, all_tags)`` of equally long lists. Each entry
            is the example's stripped words (respectively tags) joined with a
            single space.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks the ``"words"`` or ``"tags"`` key.
        """
        all_words = []
        all_tags = []
        with open(input_file, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (typically a trailing newline at the end of the
                # file) carry no example and would make json.loads fail.
                if not line.strip():
                    continue
                record = json.loads(line)
                all_words.append(" ".join(word.strip() for word in record["words"]))
                all_tags.append(" ".join(tag.strip() for tag in record["tags"]))
        return all_words, all_tags

    def _create_examples(self, texts, slots, set_type):
        """Create examples from parallel lists of sentences and tag strings.

        Args:
            texts: Sentences, each a whitespace-separated string of words.
            slots: Tag strings, each a whitespace-separated string of labels.
            set_type: Prefix for the example guid (e.g. the mode name).

        Returns:
            A list of ``InputExample`` whose ``slot_labels`` are label ids.
            Labels missing from ``self.slot_labels`` map to the id of "UNK".

        Raises:
            ValueError: If an unknown label is met while "UNK" is not in
                ``self.slot_labels``, or if a sentence and its tag string
                do not contain the same number of items.
        """
        # Map every label to the index of its first occurrence once, instead
        # of scanning the label list twice per tag.
        label_to_id = {}
        for index, label in enumerate(self.slot_labels):
            label_to_id.setdefault(label, index)
        unk_id = label_to_id.get("UNK")

        examples = []
        for i, (text, slot) in enumerate(zip(texts, slots)):
            guid = "%s-%s" % (set_type, i)
            # 1. input_text
            words = text.split()  # Some are spaced twice
            # 2. slot
            slot_labels = []
            for s in slot.split():
                label_id = label_to_id.get(s)
                if label_id is None:
                    if unk_id is None:
                        raise ValueError(
                            "Unknown slot label {!r} in example {} and no 'UNK' label is defined".format(s, guid)
                        )
                    label_id = unk_id
                slot_labels.append(label_id)

            # Words and tags are re-split on whitespace here, so a token that
            # was empty or contained whitespace in the data file shows up as a
            # count mismatch. Raise explicitly: an assert would vanish under -O.
            if len(words) != len(slot_labels):
                raise ValueError(
                    "Example {} has {} words but {} slot labels".format(guid, len(words), len(slot_labels))
                )
            examples.append(InputExample(guid=guid, words=words, slot_labels=slot_labels))
        return examples

    def get_examples(self, mode):
        """Load the examples of one split.

        Args:
            mode: train, dev, test

        Returns:
            The list of ``InputExample`` built from the split's data file.
        """
        data_path = os.path.join(
            self.args.data_dir, self.args.token_level, mode + self.input_text_slot_labels_file
        )
        logger.info("LOOKING AT %s", data_path)
        texts, slots = self._read_file(data_path)
        return self._create_examples(texts, slots, set_type=mode)


# processors = {"syllable-level": JointProcessor, "word-level": JointProcessor}
# Registry keyed by token level; both levels share the same processor class.
processors = {
    "syllable": Processor,
    "word": Processor,
}


# def convert_examples_to_features(
#     examples,
#     max_seq_len,
#     tokenizer,
#     pad_token_label_id=-100,
#     cls_token_segment_id=0,
#     pad_token_segment_id=0,
#     sequence_a_segment_id=0,
#     mask_padding_with_zero=True,
# ):
#     # Setting based on the current model type
#     cls_token = tokenizer.cls_token
#     sep_token = tokenizer.sep_token
#     unk_token = tokenizer.unk_token
#     pad_token_id = tokenizer.pad_token_id

#     features = []
#     for (ex_index, example) in enumerate(examples):
#         if ex_index % 5000 == 0:
#             logger.info("Writing example %d of %d" % (ex_index, len(examples)))

#         # Tokenize word by word (for NER)
#         tokens = []
#         slot_labels_ids = []
#         for word, slot_label in zip(example.words, example.slot_labels):
#             word_tokens = tokenizer.tokenize(word)
#             if not word_tokens:
#                 word_tokens = [unk_token]  # For handling the bad-encoded word
#             tokens.extend(word_tokens)
#             # Use the real label id for the first token of the word, and padding ids for the remaining tokens
#             slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

#         # Account for [CLS] and [SEP]
#         special_tokens_count = 2
#         if len(tokens) > max_seq_len - special_tokens_count:
#             tokens = tokens[: (max_seq_len - special_tokens_count)]
#             slot_labels_ids = slot_labels_ids[: (max_seq_len - special_tokens_count)]

#         # Add [SEP] token
#         tokens += [sep_token]
#         slot_labels_ids += [pad_token_label_id]
#         token_type_ids = [sequence_a_segment_id] * len(tokens)

#         # Add [CLS] token
#         tokens = [cls_token] + tokens
#         slot_labels_ids = [pad_token_label_id] + slot_labels_ids
#         token_type_ids = [cls_token_segment_id] + token_type_ids

#         input_ids = tokenizer.convert_tokens_to_ids(tokens)

#         # The mask has 1 for real tokens and 0 for padding tokens. Only real
#         # tokens are attended to.
#         attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

#         # Zero-pad up to the sequence length.
#         padding_length = max_seq_len - len(input_ids)
#         input_ids = input_ids + ([pad_token_id] * padding_length)
#         attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
#         token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
#         slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)

#         assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
#         assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
#             len(attention_mask), max_seq_len
#         )
#         assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
#             len(token_type_ids), max_seq_len
#         )
#         assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(
#             len(slot_labels_ids), max_seq_len
#         )

#         intent_label_id = int(example.intent_label)

#         if ex_index < 5:
#             logger.info("*** Example ***")
#             logger.info("guid: %s" % example.guid)
#             logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
#             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#             logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
#             logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
#             logger.info("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
#             logger.info("slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))

#         features.append(
#             InputFeatures(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 token_type_ids=token_type_ids,
#                 intent_label_id=intent_label_id,
#                 slot_labels_ids=slot_labels_ids,
#             )
#         )

#     return features
def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_token_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """Turn examples into fixed-length ``InputFeatures``.

    Each word is tokenized on its own. The first sub-token of a word keeps the
    word's slot label id, the remaining sub-tokens get ``pad_token_label_id``.
    The sequence is truncated to leave room for the CLS and SEP tokens, wrapped
    in them, and padded up to ``max_seq_len``.

    Args:
        examples: Sized iterable of objects exposing ``guid``, ``words`` and
            ``slot_labels``.
        max_seq_len: Length of every produced feature sequence.
        tokenizer: Object providing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize`` and ``convert_tokens_to_ids``.
        pad_token_label_id: Label id for special, continuation and pad positions.
        cls_token_segment_id: Segment id of the CLS position.
        pad_token_segment_id: Segment id of padded positions.
        sequence_a_segment_id: Segment id of the sentence tokens and SEP.
        mask_padding_with_zero: If true, real tokens are masked 1 and padding 0;
            otherwise the values are swapped.

    Returns:
        A list with one ``InputFeatures`` per example.
    """
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Account for [CLS] and [SEP]
    special_tokens_count = 2
    max_pieces = max_seq_len - special_tokens_count

    def spaced(values):
        # Render a sequence as space-separated text for the debug log.
        return " ".join(map(str, values))

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Tokenize word by word (for NER)
        pieces = []
        piece_label_ids = []
        for word, slot_label in zip(example.words, example.slot_labels):
            # Fall back to the unknown token for words the tokenizer drops.
            sub_tokens = tokenizer.tokenize(word) or [unk_token]
            pieces.extend(sub_tokens)
            # Real label on the first sub-token, padding label on the rest.
            piece_label_ids.append(int(slot_label))
            piece_label_ids.extend([pad_token_label_id] * (len(sub_tokens) - 1))

        if len(pieces) > max_pieces:
            pieces = pieces[:max_pieces]
            piece_label_ids = piece_label_ids[:max_pieces]

        # Wrap the sentence as [CLS] pieces [SEP]; SEP shares the sentence segment id.
        tokens = [cls_token] + pieces + [sep_token]
        slot_labels_ids = [pad_token_label_id] + piece_label_ids + [pad_token_label_id]
        token_type_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(pieces) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Pad every sequence up to the requested length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + [pad_token_id] * padding_length
        attention_mask = attention_mask + [0 if mask_padding_with_zero else 1] * padding_length
        token_type_ids = token_type_ids + [pad_token_segment_id] * padding_length
        slot_labels_ids = slot_labels_ids + [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(
            len(input_ids), max_seq_len
        )
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len
        )
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(
            len(slot_labels_ids), max_seq_len
        )

        # Dump the first few examples so the conversion can be eyeballed.
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % spaced(tokens))
            logger.info("input_ids: %s" % spaced(input_ids))
            logger.info("attention_mask: %s" % spaced(attention_mask))
            logger.info("token_type_ids: %s" % spaced(token_type_ids))
            logger.info("slot_labels: %s" % spaced(slot_labels_ids))

        feature = InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            slot_labels_ids=slot_labels_ids,
        )
        features.append(feature)

    return features


def _load_cached_features(cached_features_file):
    """Unpickle a feature cache previously written with ``torch.save``.

    The cache holds pickled ``InputFeatures`` objects, which the weights-only
    unpickler (the default of ``torch.load`` since PyTorch 2.6) rejects, so full
    unpickling is requested explicitly.

    NOTE(security): full unpickling can execute arbitrary code. Only load cache
    files that this project produced itself.
    """
    try:
        return torch.load(cached_features_file, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not know the ``weights_only`` keyword.
        return torch.load(cached_features_file)


def load_and_cache_examples(args, tokenizer, mode):
    """Build the tensor dataset of one split, reusing cached features if present.

    The cache file lives in ``args.data_dir`` and is named after the mode, the
    token level, the last path component of ``args.model_name_or_path`` and
    ``args.max_seq_len``. On a cache miss the examples are read through the
    processor, converted to features and saved to that file.

    Args:
        args: Namespace providing ``token_level``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_len`` and ``ignore_index``.
        tokenizer: Tokenizer forwarded to ``convert_examples_to_features``.
        mode: One of "train", "dev" or "test".

    Returns:
        A ``TensorDataset`` of four ``torch.long`` tensors, in order: input
        ids, attention mask, token type ids and slot label ids, each with one
        row per feature.

    Raises:
        ValueError: If features must be created and ``mode`` is not one of the
            supported split names.
    """
    processor = processors[args.token_level](args)

    # Load data features from cache or dataset file
    model_name = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(mode, args.token_level, model_name, args.max_seq_len),
    )

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = _load_cached_features(cached_features_file)
    else:
        # Load data features from dataset file
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode not in ("train", "dev", "test"):
            raise ValueError("For mode, Only train, dev, test is available")
        examples = processor.get_examples(mode)

        # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
        pad_token_label_id = args.ignore_index
        features = convert_examples_to_features(
            examples, args.max_seq_len, tokenizer, pad_token_label_id=pad_token_label_id
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_slot_labels_ids = torch.tensor([f.slot_labels_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_token_type_ids, all_slot_labels_ids
    )
    return dataset