Start rewrite Dataloader
TimKoornstra committed Mar 6, 2024
1 parent b397dff commit b77817c
Showing 8 changed files with 408 additions and 445 deletions.
4 changes: 2 additions & 2 deletions src/data/data_handling.py
@@ -55,8 +55,8 @@ def initialize_data_loader(config: Config,
test_list=config["test_list"],
validation_list=config["validation_list"],
inference_list=config["inference_list"],
char_list=charlist,
multiply=config["multiply"],
charlist=charlist,
multiply=config["aug_multiply"],
check_missing_files=config["check_missing_files"],
replace_final_layer=config["replace_final_layer"],
normalization_file=config["normalization_file"],
317 changes: 138 additions & 179 deletions src/data/loader.py
@@ -1,6 +1,7 @@
# Imports

# > Standard library
from collections import defaultdict
import logging
import os

@@ -22,7 +23,7 @@ def __init__(self,
batch_size,
img_size,
augment_model,
char_list=[],
charlist=None,
train_list='',
validation_list='',
test_list='',
@@ -33,12 +34,13 @@
replace_final_layer=False,
use_mask=False
):

# TODO: Change most of these to use config
self.batch_size = batch_size
self.height = img_size[0]
self.augment_model = augment_model
self.channels = img_size[2]
self.partition = []
self.injected_charlist = char_list
self.injected_charlist = charlist
self.train_list = train_list
self.validation_list = validation_list
self.test_list = test_list
@@ -48,195 +50,91 @@ def __init__(self,
self.check_missing_files = check_missing_files
self.replace_final_layer = replace_final_layer
self.use_mask = use_mask
self.charList = char_list
self.charlist = charlist

def init_data_generator(self,
files,
params,
is_training=False,
deterministic=False):
"""
Create a DataGenerator object which is used to load and preprocess
batches of data (tuples) consisting of a file path and a label.
AUTOTUNE is applied to optimize parallel calls to the load_images
function.
Parameters
----------
files:
Input files for data generator
params: dict
Dict of training parameters used to pre-process the data
is_training: bool
Indicate whether generator is used for training process or not
deterministic: bool
Control the order in which the transformation produces elements.
If set to False, the transformation is allowed to yield elements
out of order to trade determinism for performance.
Returns
----------
tf.data.Dataset
Dataset of batched, preprocessed images and encoded labels
"""
data_generator = DataGenerator(is_training=is_training, **params)

num_batches = np.ceil(len(files) / self.batch_size)
generator = tf.data.Dataset.from_tensor_slices(files)
if is_training:
# Add additional repeat and shuffle for training
generator = generator.repeat().shuffle(len(files))

generator = (generator
.map(data_generator.load_images,
num_parallel_calls=AUTOTUNE,
deterministic=deterministic)
.padded_batch(self.batch_size,
padded_shapes=(
[None, None, self.channels], [None]),
padding_values=(
tf.constant(-10, dtype=tf.float32),
tf.constant(0, dtype=tf.int64)))
.prefetch(AUTOTUNE)
).apply(
tf.data.experimental.assert_cardinality(num_batches))

return generator
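
# A minimal, hypothetical sketch (not part of this commit) illustrating the
# padded_batch behaviour used above: variable-sized images are padded with
# -10.0 and label sequences with 0 so they can be stacked into one batch.
import tensorflow as tf

def _toy_samples():
    # Toy tensors standing in for loaded images and encoded labels
    yield tf.ones((2, 3, 1)), tf.constant([1, 2], dtype=tf.int64)
    yield tf.ones((4, 5, 1)), tf.constant([3], dtype=tf.int64)

toy_ds = tf.data.Dataset.from_generator(
    _toy_samples,
    output_signature=(tf.TensorSpec((None, None, 1), tf.float32),
                      tf.TensorSpec((None,), tf.int64)))
toy_batched = toy_ds.padded_batch(
    2,
    padded_shapes=([None, None, 1], [None]),
    padding_values=(tf.constant(-10, dtype=tf.float32),
                    tf.constant(0, dtype=tf.int64)))
for images, labels in toy_batched:
    print(images.shape, labels.shape)  # (2, 4, 5, 1) (2, 2)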

def get_generators(self):
"""
Initializes data generators for different dataset partitions and updates
character set and tokenizer based on the dataset.
self.evaluation_list = None
if train_list and validation_list:
self.evaluation_list = validation_list

Firstly, data is created for
partitions, labels, self.tokenizer = self._process_raw_data()
self.raw_data = {split: (partitions[split], labels[split])
for split in ['train', 'evaluation', 'validation',
'test', 'inference']}

"""
# Initialize character set and data partitions with corresponding labels.
self.datasets = self._fill_datasets_dict(partitions, labels)

def _process_raw_data(self):
# Initialize character set and data partitions with corresponding
# labels
characters = set()
partitions = {
'train': [],
'evaluation': [],
'validation': [],
'test': [],
'inference': []
}
labels = {
'train': [],
'evaluation': [],
'validation': [],
'test': [],
'inference': []
}

# Process training data and update characters set and partitions.
if self.train_list:
characters, train_files = self.create_data(
characters, labels, partitions, 'train', self.train_list,
use_multiply=True
)

# Process evaluation data if available.
if self.validation_list:
characters, eval_files = self.create_data(
characters, labels, partitions, 'evaluation',
self.validation_list
partitions = defaultdict(list)
labels = defaultdict(list)

for partition in ['train', 'evaluation', 'validation',
'test', 'inference']:
partition_list = getattr(self, f"{partition}_list", None)
if partition_list:
include_unsupported_chars = partition in ['validation', 'test']
use_multiply = partition == 'train'
characters, _ = self.create_data(
characters=characters,
labels=labels,
partitions=partitions,
partition_name=partition,
data_files=partition_list,
use_multiply=use_multiply,
include_unsupported_chars=include_unsupported_chars
)

# TODO: Replace this by a do_validate flag
# Process validation data if available.
if self.validation_list:
characters, val_files = self.create_data(
characters, labels, partitions, 'validation',
self.validation_list, include_unsupported_chars=True
)

# Process test data if available.
if self.test_list:
characters, test_files = self.create_data(
characters, labels, partitions, 'test', self.test_list,
include_unsupported_chars=True
)

# Process inference data if available.
if self.inference_list:
characters, inf_files = self.create_data(
characters, labels, partitions, 'inference',
self.inference_list, include_unsupported_chars=True,
include_missing_files=True, is_inference=True
)

# Determine the character list for the tokenizer.
# Determine the character list for the tokenizer
if self.injected_charlist and not self.replace_final_layer:
logging.info('Using injected charlist')
self.charList = self.injected_charlist
self.charlist = self.injected_charlist
else:
self.charList = sorted(list(characters))

# Initialize the tokenizer.
self.tokenizer = Tokenizer(self.charList, self.use_mask)

# Define common parameters for all data generators.
train_params = {
'tokenizer': self.tokenizer,
'height': self.height,
'channels': self.channels,
'augment_model': self.augment_model
}

# Initialize data generators for each dataset partition as needed.
training_generator = evaluation_generator = validation_generator = None
test_generator = inference_generator = None
train_batches = 0

if self.train_list:
training_generator = self.init_data_generator(
train_files, train_params, is_training=True
)
train_batches = np.ceil(len(train_files) / self.batch_size)

if self.validation_list:
if self.train_list:
evaluation_generator = self.init_data_generator(
eval_files, train_params, deterministic=True,
is_training=False
self.charlist = sorted(list(characters))

# Initialize the tokenizer
tokenizer = Tokenizer(self.charlist, self.use_mask)

return partitions, labels, tokenizer
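
# Hypothetical sketch (not from this commit) of the charlist selection above:
# an injected charlist is used as-is unless the final layer is being replaced;
# otherwise the characters collected from the data files are sorted into a
# new charlist.
injected_charlist = ['a', 'b', 'c']   # assumed injected charlist
found_characters = {'d', 'a', 'b'}    # assumed characters seen in the data
replace_final_layer = False

charlist = (injected_charlist
            if injected_charlist and not replace_final_layer
            else sorted(found_characters))
print(charlist)  # ['a', 'b', 'c']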

def _fill_datasets_dict(self, partitions, labels):
"""
Initializes a tf.data dataset for each dataset partition from the
previously processed file paths and labels.
"""

# Create datasets for different partitions
datasets = defaultdict(lambda: None)

for partition in ['train', 'evaluation', 'validation',
'test', 'inference']:
partition_list = getattr(self, f"{partition}_list", None)
if partition_list:
# Create dataset for the current partition
files = list(zip(partitions[partition], labels[partition]))
datasets[partition] = create_dataset(
files=files,
tokenizer=self.tokenizer,
augment_model=self.augment_model,
height=self.height,
channels=self.channels,
batch_size=self.batch_size,
is_training=partition == 'train',
deterministic=partition != 'train'
)
validation_generator = self.init_data_generator(
val_files, train_params, deterministic=True,
is_training=False
)

if self.test_list:
test_generator = self.init_data_generator(
test_files, train_params, deterministic=True,
is_training=False
)

if self.inference_list:
inference_generator = self.init_data_generator(
inf_files, train_params, deterministic=True,
is_training=False
)

# Update the partition information.
self.partition = partitions

# Return all initialized generators, tokenizer, and other relevant info.
return (
training_generator,
evaluation_generator,
validation_generator,
test_generator,
inference_generator,
self.tokenizer,
int(train_batches),
labels['validation']
)

return datasets

def get_train_batches(self):
return int(np.ceil(len(self.datasets['train']) / self.batch_size))
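
# Hedged usage sketch, not part of this commit; the class name "DataLoader",
# the list-file names, the image size, and augment_model=None are assumptions.
# After construction, each partition's tf.data dataset is exposed through the
# datasets dict filled by _fill_datasets_dict.
loader = DataLoader(batch_size=32,
                    img_size=(64, None, 1),  # (height, width, channels)
                    augment_model=None,
                    charlist=None,
                    train_list='train.txt',
                    validation_list='val.txt')
train_dataset = loader.datasets['train']
train_batches = loader.get_train_batches()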

def create_data(self, characters, labels, partitions, partition_name,
data_files,
include_unsupported_chars=False,
include_missing_files=False,
is_inference=False, use_multiply=False):
is_inference=False,
use_multiply=False):
"""
Processes data files to create a dataset partition, updating characters,
labels, and partition lists accordingly.
@@ -338,5 +236,66 @@ def create_data(self, characters, labels, partitions, partition_name,

return characters, processed_files

def get_item(self, partition, item_id):
return self.partition[partition][item_id]
def get_filename(self, partition, item_id):
return self.raw_data[partition][0][item_id]

def get_ground_truth(self, partition, item_id):
return self.raw_data[partition][1][item_id]
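
# Hedged usage sketch, not part of this commit: raw_data maps each split to a
# (file_paths, ground_truths) tuple, so the accessors above return the path
# and transcription of a single sample. 'loader' is assumed to be an instance
# of the data loader class, as in the earlier sketch.
sample_path = loader.get_filename('validation', 0)
sample_text = loader.get_ground_truth('validation', 0)
print(sample_path, sample_text)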


def create_dataset(files,
tokenizer,
augment_model,
height,
channels,
batch_size,
is_training=False,
deterministic=False):
"""
Create a DataGenerator object which is used to load and preprocess
batches of data (tuples) consisting of a file path and a label.
AUTOTUNE is applied to optimize parallel calls to the load_images
function.
Parameters
----------
files:
Input (file path, label) pairs for the data generator
tokenizer: Tokenizer
Tokenizer used to encode the labels
augment_model:
Augmentation model applied by the data generator
height: int
Image height used by the data generator
channels: int
Number of image channels
batch_size: int
Number of samples per batch
is_training: bool
Indicate whether the generator is used for the training process or not
deterministic: bool
Control the order in which the transformation produces elements.
If set to False, the transformation is allowed to yield elements
out of order to trade determinism for performance.
Returns
----------
tf.data.Dataset
Batched dataset of preprocessed images and encoded labels
"""

data_generator = DataGenerator(tokenizer, augment_model, height,
channels, is_training)

num_batches = np.ceil(len(files) / batch_size)
generator = tf.data.Dataset.from_tensor_slices(files)
if is_training:
# Add additional repeat and shuffle for training
generator = generator.repeat().shuffle(len(files))

generator = (generator
.map(data_generator.load_images,
num_parallel_calls=AUTOTUNE,
deterministic=deterministic)
.padded_batch(batch_size,
padded_shapes=([None, None, channels], [None]),
padding_values=(
tf.constant(-10, dtype=tf.float32),
tf.constant(0, dtype=tf.int64)))
.prefetch(AUTOTUNE)
).apply(
tf.data.experimental.assert_cardinality(num_batches))

return generator
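
# Hedged usage sketch for create_dataset; the (path, label) pairs, charlist,
# and augment_model=None are assumptions, not part of this commit.
sample_files = [('images/line_0001.png', 'hello'),
                ('images/line_0002.png', 'world')]
sample_tokenizer = Tokenizer(sorted(set('helloworld')), False)  # charlist, use_mask
train_ds = create_dataset(files=sample_files,
                          tokenizer=sample_tokenizer,
                          augment_model=None,
                          height=64,
                          channels=1,
                          batch_size=2,
                          is_training=True)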