# utils.py

import multiprocessing
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import yaml
from sklearn.model_selection import train_test_split

from keras_custom.generators import gen, gen_w_heldout

"""
Shared utility functions.
    `load_config`: load a configuration file.

    `produce_orig_reprs`:
        1. Given the original stimuli (or the backgrounded stimuli for task6),
        2. Use a pretrained model to compute the representations of a
           (user given) layer.

    `data_loader`:
        1. Take the layer reprs and stack them into X, Y (giant matrices).
        2. Save them to disk and load them back in later for training.
        3. The stacking/saving only needs to be done once.
"""

def load_config(config_version):
    """
    Purpose:
    --------
        Read the YAML configuration stored at
        `configs/<config_version>.yaml` (relative to the working directory).

    inputs:
    -------
        config_version: name of the config file without the `.yaml` suffix.

    returns:
    --------
        config: the object produced by `yaml.safe_load` for that file.
    """
    config_path = os.path.join('configs', f'{config_version}.yaml')
    with open(config_path) as config_file:
        parsed = yaml.safe_load(config_file)

    print(f'[Check] Loading [{config_version}]')
    return parsed


def _vit_layer_index(layer):
    """
    Purpose:
    --------
        Extract the integer index from a ViT layer name of the
        form `layer_x_..` (e.g. `layer_3_msa`).

    Impl:
    -----
        The index starts at character 6 (right after `layer_`). Every
        consecutive digit is consumed so that a two-digit index such as
        `layer_10_msa` is not truncated to its first digit.

    raises:
    -------
        ValueError: if no digit is found at character 6.
    """
    digits = ''
    for char in layer[6:]:
        if not char.isdigit():
            break
        digits += char
    if not digits:
        raise ValueError(
            f'Cannot parse a layer index from layer name [{layer}]')
    return int(digits)


def produce_orig_reprs(model, model_name, layer, preprocess_func, stimulus_set, return_images=False):
    """
    Purpose:
    --------
        Produce given layer's representations of the original stimuli
        with no data augmentation, only model-specific preprocessing.

    inputs:
    -------
        model: A specified model capped at some layer
        model_name: vgg16 / vgg19 / vit_b16
        layer: the layer to intercept representations from. For
                `vit_b16` the name is expected to look like `layer_x_..`
                and to contain `msa` when the MSA outputs are wanted.
        preprocess_func: Model-specific preprocessing routine
        stimulus_set: task id; images are read from
                `stimuli/original/task{stimulus_set}`
        return_images: default False (return model predictions)
                        If True, return the preprocessed original
                        images.

    returns:
    --------
        reprs: if `return_images` is False, the layer activations for
                all images (for `vit_b16` they are flattened to (N, D);
                otherwise whatever `model.predict` returns).
                If `return_images` is not False, the first batch of
                preprocessed images.
        dataset: A tf Dataset which produces original images later
                    used for visual examination.

    raises:
    -------
        ValueError: if `model_name` is `vit_b16` and no layer index can
                    be parsed from `layer`.
    """
    data_dir = f'stimuli/original/task{stimulus_set}'
    print(f'[Check]: using data from {data_dir}')

    # The batch size is the number of entries found directly in data_dir.
    batch_size = len(os.listdir(data_dir))
    print(f'[Check] batch_size={batch_size}')

    # this loads original images
    dataset = tf.keras.preprocessing.image_dataset_from_directory(
                data_dir,
                shuffle=False,
                image_size=(224, 224),
                batch_size=batch_size)

    # this loads model-specific processed images.
    # `ViT_ImageDataGenerator` is defined in this module, so it is used
    # directly rather than re-importing the module from itself.
    if model_name == 'vit_b16':
        datagen = ViT_ImageDataGenerator(
                    preprocessing_function=preprocess_func)
    else:
        datagen = tf.keras.preprocessing.image.ImageDataGenerator(
                    preprocessing_function=preprocess_func)

    generator = datagen.flow_from_directory(
            data_dir,
            target_size=(224, 224),
            batch_size=batch_size,
            class_mode='sparse',
            shuffle=False)

    # Caller asked for the preprocessed images instead of activations.
    if return_images is not False:
        images, _ = next(generator)
        return images, dataset

    if model_name != 'vit_b16':
        return model.predict(generator), dataset

    # `layer_x_..` -> x (supports multi-digit x)
    layer_index = _vit_layer_index(layer)

    # Iterate over the generator to collect the representations.
    reprs = []
    for _ in range(len(generator)):
        x, _ = next(generator)  # x -> e.g. (8, 3, 224, 224)
        if 'msa' in layer:
            # Grabs the MSA outputs
            # \in  (bs, seq_len, num_heads, head_dim)
            # e.g. (8,  197,    12,         64)
            layer_reprs = model(
                x, training=False,
                output_msa_states=True
            ).attentions[layer_index].numpy()
        else:
            layer_reprs = model(
                x, training=False,
                output_hidden_states=True
            ).hidden_states[layer_index].numpy()

        # Need to flatten the non-batch dimensions as
        # we defer layer output in data pipeline instead
        # of in models.py
        reprs.append(layer_reprs.reshape(x.shape[0], -1))

    # vstack to reduce the generator step dimension.
    return np.vstack(reprs), dataset


def data_loader(config, input_shape, seed=42):
    """
    Purpose:
    --------
        - Load train/val datasets.
        - Also we have the option to load dataset for heldout training,
          in other words, a class will be held out during training.
        - This data_loader is compatible for task1-6.

    Impl:
    -----
        Load and stack all data-points as a giant matrix to be
        shuffled and split later for training/validation.
        The entire matrix is saved so next time we do not
        have to stack one data-point at a time but can load
        the entire matrix at once for training/validation.

    inputs:
    -------
        config: dict providing `XY_dir`, `stimulus_set`, `split_ratio`,
                `model_name`, `layer`, `heldout` and (only when the
                matrices are not cached yet) `preprocessed_dir`.
        input_shape: shape of one placeholder row used to seed the stack;
                     should be the fc1 output size.
        seed: control randomness in train/val split

    returns:
    --------
        (X_train, Y_train), (X_val, Y_val)

    raises:
    -------
        ValueError: if `config['heldout']` is neither None nor one of
                    the binary class names of the stimulus set.
    """
    XY_dir = config['XY_dir']
    stimulus_set = config['stimulus_set']
    split_ratio = config['split_ratio']
    model_name = config['model_name']
    layer = config['layer']

    # 3 binary features per stimulus, except the Current Biology
    # set (task6) which has 4.
    num_features = 4 if stimulus_set in [6, '6'] else 3
    # class name -> binary target, e.g. '011' -> [0, 1, 1]
    class_names = [format(i, f'0{num_features}b')
                   for i in range(2 ** num_features)]
    mapping = {name: [int(bit) for bit in name] for name in class_names}

    cache_dir = f'resources/{XY_dir}/{model_name}/{layer}/task{stimulus_set}'
    X_path = f'{cache_dir}/X.npy'
    Y_path = f'{cache_dir}/Y.npy'

    # we only stack the data once, once saved we can load off the disk.
    if os.path.exists(X_path):
        print(f'[Check] Loading pre-saved X and Y from {XY_dir}/{model_name}/{layer}/task{stimulus_set}/')
        X = np.load(X_path)
        Y = np.load(Y_path)
        print(f'[Check] X, Y shape = {X.shape}, {Y.shape}')

    # first time, we stack and save dataset into disk.
    else:
        # Collect rows in lists and stack once at the end; stacking after
        # every file would copy the whole matrix each time.
        # The leading `np.empty` placeholder keeps the float64 dtype
        # promotion and the (0, D) result for an empty directory; it is
        # stripped right after stacking.
        X_rows = [np.empty(input_shape)]
        Y_rows = [np.empty(num_features)]

        preprocessed_dir = config['preprocessed_dir']
        data_dir = f'stimuli/{preprocessed_dir}/{model_name}/{layer}_reprs/task{stimulus_set}/'
        print(f'[Check] Stacking reprs from {data_dir}')
        for stimulus_type in sorted(os.listdir(data_dir)):
            print(f'[Check] Stacking stimulus [{stimulus_type}]')
            y = mapping[stimulus_type]
            class_dir = os.path.join(data_dir, stimulus_type)
            # Sorted so the row order (and hence the seeded split)
            # does not depend on filesystem listing order.
            for fname in sorted(os.listdir(class_dir)):
                X_rows.append(np.load(os.path.join(class_dir, fname)))
                Y_rows.append(y)

        X = np.vstack(X_rows)[1:, :]
        Y = np.vstack(Y_rows)[1:, :]
        print(f'[Check] X.shape={X.shape}')
        print(f'[Check] Y.shape={Y.shape}')

        # save the stacked dataset
        os.makedirs(cache_dir, exist_ok=True)
        np.save(X_path, X)
        np.save(Y_path, Y)
        print('[Check] saved X, Y.')

    # if heldout, we drop the rows of X and Y that belong to
    # the stimulus type we want to hold out.
    heldout_class = config['heldout']
    if heldout_class is not None:
        if heldout_class not in mapping:
            raise ValueError(
                f'Unknown heldout class [{heldout_class}] for '
                f'stimulus_set [{stimulus_set}]; expected one of {class_names}')
        # Select rows by their label rather than by assumed position, so
        # this works whatever the number of samples per class is.
        heldout_rows = np.all(Y == mapping[heldout_class], axis=1)
        X = X[~heldout_rows]
        Y = Y[~heldout_rows]
        print(f'[Check] holding out [{heldout_class}]')

    X_train, X_val, Y_train, Y_val = train_test_split(
        X, Y,
        test_size=split_ratio,
        random_state=seed)
    print(f'[Check] Training data: {X_train.shape}')
    print(f'[Check] Validation data: {X_val.shape}')
    return (X_train, Y_train), (X_val, Y_val)


def data_loader_gen(config, preprocess_func, shuffle, seed=42):
    """
    Purpose:
    --------
        Use generators as data loaders for training.

    inputs:
    -------
        config: dict providing `preprocessed_dir`, `model_name`,
                `stimulus_set`, `batch_size` and `split_ratio`.
        preprocess_func: preprocessing routine handed to the iterators;
                         it is replaced by None unless stimulus_set is 6.
        shuffle: forwarded to both iterators.
        seed: forwarded to both iterators.

    returns:
    --------
        train_data, train_steps, val_data, val_steps
    """
    preprocessed_dir = config['preprocessed_dir']
    model_name = config['model_name']
    stimulus_set = config['stimulus_set']
    directory = f'stimuli/{preprocessed_dir}/{model_name}/processed_imgs/task{stimulus_set}'

    # TODO. Not ideal but does the trick of loading .npy images
    # from `gen.py`
    if stimulus_set in ['6', 6]:
        class_mode = 'binary_feat4'
    else:
        class_mode = 'binary_feat3'
        preprocess_func = None

    def _make_subset(subset):
        # Build one iterator, swap its labels for binary targets and
        # report how many steps it takes to go through it.
        iterator = gen.DirectoryIterator(
            directory=directory,
            class_mode=class_mode,
            batch_size=config['batch_size'],
            shuffle=shuffle,
            seed=seed,
            validation_split=config['split_ratio'],
            subset=subset,
            preprocessing_function=preprocess_func)
        iterator = label_converter(iterator, stimulus_set)
        return iterator, iterator.compute_step_size()

    print(f'[Check] Generator loading data from {directory}')
    train_data, train_steps = _make_subset('training')
    val_data, val_steps = _make_subset('validation')

    print(f'[Check] train/val steps={train_steps},{val_steps}')
    return train_data, train_steps, val_data, val_steps


def data_loader_gen_v2(config, preprocess_func, shuffle, seed=42):
    """
    v2: supports heldout training

    Use generator as data loader for training

    inputs:
    -------
        config: dict providing `preprocessed_dir`, `model_name`,
                `stimulus_set`, `heldout`, `batch_size` and `split_ratio`.
        preprocess_func: preprocessing routine; it is replaced by None
                         for the 3-feature stimulus sets handled here.
        shuffle: forwarded to both iterators.
        seed: forwarded to both iterators.

    returns:
    --------
        train_data, train_steps, val_data, val_steps

    raises:
    -------
        NotImplementedError: if stimulus_set is 6 (4-feature set), which
                             this heldout loader does not support.
    """
    preprocessed_dir = config['preprocessed_dir']
    model_name = config['model_name']
    stimulus_set = config['stimulus_set']
    directory = f'stimuli/{preprocessed_dir}/{model_name}/processed_imgs/task{stimulus_set}'
    heldout_class = config['heldout']

    if stimulus_set in ['6', 6]:
        # The exception must actually be raised; merely constructing it
        # would fall through to a NameError on the undefined class list.
        raise NotImplementedError(
            'data_loader_gen_v2 does not support stimulus_set 6.')

    class_mode = 'binary_feat3'
    preprocess_func = None
    all_classes = ['000', '001', '010', '011',
                   '100', '101', '110', '111']
    all_classes_indices = dict(zip(all_classes, range(len(all_classes))))

    # NOTE(ken), this is a hacky bit where we manually construct the dict
    # such that heldout can be done. Kept classes retain their original
    # index. When heldout_class is None nothing is filtered out.
    classes = [c for c in all_classes if c != heldout_class]
    class_indices = {c: all_classes_indices[c] for c in classes}
    print(f'[Check] class_indices = {class_indices}')

    def _make_subset(subset):
        # Build one iterator, swap its labels for binary targets and
        # report how many steps it takes to go through it.
        iterator = gen_w_heldout.DirectoryIterator(
            directory=directory,
            class_mode=class_mode,
            batch_size=config['batch_size'],
            shuffle=shuffle,
            seed=seed,
            validation_split=config['split_ratio'],
            subset=subset,
            preprocessing_function=preprocess_func,
            classes=classes,
            class_indices=class_indices)
        iterator = label_converter(iterator, stimulus_set)
        return iterator, iterator.compute_step_size()

    print(f'[Check] Generator loading data from {directory}')
    train_data, train_steps = _make_subset('training')
    val_data, val_steps = _make_subset('validation')

    print(f'[Check] train/val steps={train_steps},{val_steps}')
    return train_data, train_steps, val_data, val_steps


def label_converter(generator, stimulus_set):
    """
    Purpose:
    --------
        Only used when
            train == 'fulltrain' & task == 'binary'
        Or train == 'finetune' & task == 'binary' & stimulus_set = 6
        The default generator produces labels as `sparse` ints
        whereas for binary prediction we want to predict 0/1
        per feature.
    Impl:
    -----
        Replace `generator.classes` with a list in which every integer
        label is swapped for its binary code: 3 bits per label, or
        4 bits when stimulus_set is 6. The same generator object is
        returned.
    """
    # task=6 has 4 features as targets, every other task has 3.
    num_features = 4 if stimulus_set in ['6', 6] else 3

    # e.g. with 3 features: 5 -> '101' -> [1, 0, 1]
    class2binary = {
        class_id: [int(bit) for bit in format(class_id, f'0{num_features}b')]
        for class_id in range(2 ** num_features)
    }

    generator.classes = [class2binary[label] for label in generator.classes]
    return generator


def cuda_manager(target, args_list, cuda_id_list, n_concurrent=None):
    """Create CUDA manager.

    Starts one child process per entry of `args_list`; a semaphore limits
    how many of them run `target` at the same time.

    Arguments:
        target: A target function to be evaluated.
        args_list: A list of dictionaries, where each dictionary
            contains the arguments necessary for the target function.
        cuda_id_list: A list of eligible CUDA IDs.
        n_concurrent (optional): The number of concurrent CUDA
            processes allowed. By default this is equal to the length
            of `cuda_id_list`; it is capped at that length.
    Raises:
        ValueError: If there are tasks to run but `cuda_id_list` is empty
            or `n_concurrent` is smaller than 1 (the children would
            otherwise wait on the semaphore forever).
        Exception: The first exception reported by a child process.
    """
    n_task = len(args_list)
    if n_task == 0:
        # Nothing to run.
        return

    if len(cuda_id_list) == 0:
        raise ValueError('`cuda_id_list` must contain at least one CUDA ID.')

    if n_concurrent is None:
        n_concurrent = len(cuda_id_list)
    else:
        n_concurrent = min(n_concurrent, len(cuda_id_list))
    if n_concurrent < 1:
        raise ValueError('`n_concurrent` must be at least 1.')

    shared_exception = multiprocessing.Queue()

    args_queue = multiprocessing.Queue()
    for args in args_list:
        args_queue.put(args)

    # Use a semaphore to make one child process per CUDA ID.
    # NOTE: Using a pool of workers may not work with TF because it
    # re-uses existing processes, which may not release the GPU's memory.
    sema = multiprocessing.BoundedSemaphore(n_concurrent)

    # Use manager to share list of available CUDA IDs among child processes.
    with multiprocessing.Manager() as manager:
        available_cuda = manager.list(cuda_id_list)

        process_list = [
            multiprocessing.Process(
                target=cuda_child,
                args=(
                    target, args_queue, available_cuda, shared_exception,
                    sema
                )
            )
            for _ in range(n_task)
        ]

        for p in process_list:
            p.start()

        # Collect one result per child BEFORE joining: a process that has
        # put items on a queue may not terminate until they are consumed,
        # so joining first can deadlock.
        e_list = [shared_exception.get() for _ in process_list]

        for p in process_list:
            p.join()

    #  Check for raised exceptions.
    for e in e_list:
        if e is not None:
            raise e


def cuda_child(target, args_queue, available_cuda, shared_exception, sema):
    """Create child process of the CUDA manager.

    Waits for a free slot, takes one argument dictionary and one CUDA ID,
    exposes only that device through `CUDA_VISIBLE_DEVICES` and calls
    `target(**args)`. Exactly one item is put on `shared_exception`:
    None on success, the caught exception otherwise.

    Arguments:
        target: The function to evaluate.
        args_queue: A multiprocessing.Queue that yields a dictionary
            for consumption by `target`.
        available_cuda: A multiprocessing.Manager.list object for
            tracking CUDA device availability.
        shared_exception: A multiprocessing.Queue for exception
            handling.
        sema: A multiprocessing.BoundedSemaphore object ensuring there
            are never more processes than eligible CUDA devices.
    """
    # The `with` block releases the semaphore on failure as well; without
    # that a failing task would permanently consume a slot and the tasks
    # still waiting would never start.
    with sema:
        cuda_id = None
        try:
            args = args_queue.get()
            cuda_id = available_cuda.pop()

            os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(cuda_id)

            target(**args)
        except Exception as e:
            # Process boundary: report the failure to the manager.
            shared_exception.put(e)
        else:
            shared_exception.put(None)
        finally:
            # Hand the device back for the next task, whatever happened.
            if cuda_id is not None:
                available_cuda.append(cuda_id)
        

class ViT_ImageDataGenerator(tf.keras.preprocessing.image.ImageDataGenerator):
    """
    `ImageDataGenerator` variant used for ViT models.

    Differences from the parent class visible here:
        - `data_format` is forced to 'channels_first' after the parent
          constructor has run.
        - `standardize` calls the preprocessing function as
          `preprocessing_function(x, return_tensors="tf")` and keeps the
          `'pixel_values'` entry of its result.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # NOTE(review): set after the parent __init__, so anything the
        # parent derives from data_format during construction is not
        # updated here; confirm this is intended.
        self.data_format = 'channels_first'

    def standardize(self, x):
        """Applies the normalization configuration to a batch of inputs.

        When a preprocessing function is set, `x` is replaced by the
        `'pixel_values'` of its output, so the caller's array is not
        the object that gets normalized afterwards. The remaining steps
        use in-place operators on `x`.

        # Arguments
            x: Batch of inputs to be normalized.

        # Returns
            The inputs, normalized.
        """
        if self.preprocessing_function:
            x = self.preprocessing_function(x, return_tensors="tf")['pixel_values']
        if self.rescale:
            x *= self.rescale
        if self.samplewise_center:
            x -= np.mean(x, keepdims=True)
        if self.samplewise_std_normalization:
            x /= (np.std(x, keepdims=True) + 1e-6)

        # Each step below needs statistics computed by `.fit`; when they
        # are missing the step is skipped with a warning.
        # `warnings` is imported at module level.
        if self.featurewise_center:
            if self.mean is not None:
                x -= self.mean
            else:
                warnings.warn('This ImageDataGenerator specifies '
                              '`featurewise_center`, but it hasn\'t '
                              'been fit on any training data. Fit it '
                              'first by calling `.fit(numpy_data)`.')
        if self.featurewise_std_normalization:
            if self.std is not None:
                x /= (self.std + 1e-6)
            else:
                warnings.warn('This ImageDataGenerator specifies '
                              '`featurewise_std_normalization`, '
                              'but it hasn\'t '
                              'been fit on any training data. Fit it '
                              'first by calling `.fit(numpy_data)`.')
        if self.zca_whitening:
            if self.principal_components is not None:
                flatx = np.reshape(x, (-1, np.prod(x.shape[-3:])))
                whitex = np.dot(flatx, self.principal_components)
                x = np.reshape(whitex, x.shape)
            else:
                warnings.warn('This ImageDataGenerator specifies '
                              '`zca_whitening`, but it hasn\'t '
                              'been fit on any training data. Fit it '
                              'first by calling `.fit(numpy_data)`.')
        return x
    

# Script entry point: intentionally a no-op, running this file directly
# does nothing beyond defining the utilities above.
if __name__ == '__main__':
    pass