
Commit

Adding CUB data processing.
mateoespinosa committed Sep 19, 2022
1 parent b419f19 commit 5735e3e
Showing 7 changed files with 402 additions and 0 deletions.
5 changes: 5 additions & 0 deletions cem/data/CUB200/__init__.py
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
# @Author: Mateo Espinosa Zarlenga
# @Date: 2022-09-19 18:36:35
# @Last Modified by: Mateo Espinosa Zarlenga
# @Last Modified time: 2022-09-19 18:36:35
Binary file added cem/data/CUB200/class_attr_data_10/test.pkl
Binary file not shown.
Binary file added cem/data/CUB200/class_attr_data_10/train.pkl
Binary file not shown.
Binary file added cem/data/CUB200/class_attr_data_10/val.pkl
Binary file not shown.
310 changes: 310 additions & 0 deletions cem/data/CUB200/cub_loader.py
@@ -0,0 +1,310 @@
"""
General utils for training, evaluation and data loading
Adapted from: https://github.com/yewsiang/ConceptBottleneck/blob/master/CUB/cub_loader.py
"""
import os
import torch
import pickle
import numpy as np
import torchvision.transforms as transforms

from PIL import Image
from torch.utils.data import Dataset, DataLoader

# General
BASE_DIR = ''
N_ATTRIBUTES = 112
N_CLASSES = 200

# Training
UPWEIGHT_RATIO = 9.0
MIN_LR = 0.0001
LR_DECAY_SIZE = 0.1

class Sampler(object):
"""Base class for all Samplers.
Every Sampler subclass has to provide an __iter__ method, providing a way
to iterate over indices of dataset elements, and a __len__ method that
    returns the length of the returned iterator.
"""

def __init__(self, data_source):
pass

def __iter__(self):
raise NotImplementedError

def __len__(self):
raise NotImplementedError

class StratifiedSampler(Sampler):
"""Stratified Sampling
Provides equal representation of target classes in each batch
"""
def __init__(self, class_vector, batch_size):
"""
Arguments
---------
class_vector : torch tensor
a vector of class labels
batch_size : integer
batch_size
"""
self.n_splits = int(class_vector.size(0) / batch_size)
self.class_vector = class_vector

    def gen_sample_array(self):
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            # scikit-learn is an optional dependency that is only needed for stratified sampling
            raise ImportError('scikit-learn is required for StratifiedSampler')

s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
X = torch.randn(self.class_vector.size(0),2).numpy()
y = self.class_vector.numpy()
s.get_n_splits(X, y)

train_index, test_index = next(s.split(X, y))
return np.hstack([train_index, test_index])

def __iter__(self):
return iter(self.gen_sample_array())

def __len__(self):
return len(self.class_vector)

class CUBDataset(Dataset):
"""
Returns a compatible Torch Dataset object customized for the CUB dataset
"""

    def __init__(
        self,
        pkl_file_paths,
        use_attr,
        no_img,
        uncertain_label,
        image_dir,
        n_class_attr,
        root_dir='../data/CUB200/',
        transform=None,
        concept_transform=None,
    ):
        """
        Arguments:
        pkl_file_paths: list of full paths to all the pkl data files
        use_attr: whether to load the attributes (e.g. False for simple finetuning)
        no_img: whether to load the images (e.g. False for an A -> Y model)
        uncertain_label: if True, use the 'uncertain_attribute_label' field (i.e. the label weighted by its uncertainty score, e.g. 1 & 3 (probably) -> 0.75)
        image_dir: default = 'images'. Will be appended to the parent directory
        n_class_attr: number of classes to predict for each attribute. If 3, a separate class is added for "not visible"
        root_dir: directory containing the CUB dataset (default '../data/CUB200/')
        transform: optional image transformation. Default = None, i.e. no extra preprocessing is applied
        concept_transform: optional transformation applied to the attribute (concept) labels
        """
self.data = []
self.is_train = any(["train" in path for path in pkl_file_paths])
if not self.is_train:
assert any([("test" in path) or ("val" in path) for path in pkl_file_paths])
        for file_path in pkl_file_paths:
            with open(file_path, 'rb') as f:
                self.data.extend(pickle.load(f))
self.transform = transform
self.concept_transform = concept_transform
self.use_attr = use_attr
self.no_img = no_img
self.uncertain_label = uncertain_label
self.image_dir = image_dir
self.n_class_attr = n_class_attr
self.root_dir = root_dir

def __len__(self):
return len(self.data)

def __getitem__(self, idx):
img_data = self.data[idx]
img_path = img_data['img_path']
img_path = img_path.replace(
'/juice/scr/scr102/scr/thaonguyen/CUB_supervision/datasets/',
'../data/CUB200/'
)
        # Trim unnecessary path prefixes so that images are resolved relative to root_dir
        try:
            path_idx = img_path.split('/').index('CUB_200_2011')
            # if self.image_dir != 'images':
            #     img_path = '/'.join([self.image_dir] + img_path.split('/')[path_idx+1:])
            #     img_path = img_path.replace('images/', '')
            # else:
            img_path = self.root_dir + '/' + '/'.join(img_path.split('/')[path_idx:])
            img = Image.open(img_path).convert('RGB')
        except (ValueError, FileNotFoundError):
            # Fall back to a directory layout that separates train and test images
            img_path_split = img_path.split('/')
            split = 'train' if self.is_train else 'test'
            img_path = '/'.join(img_path_split[:2] + [split] + img_path_split[2:])
            img = Image.open(img_path).convert('RGB')

class_label = img_data['class_label']
if self.transform:
img = self.transform(img)

if self.use_attr:
if self.uncertain_label:
attr_label = img_data['uncertain_attribute_label']
else:
attr_label = img_data['attribute_label']
if self.concept_transform is not None:
attr_label = self.concept_transform(attr_label)
if self.no_img:
if self.n_class_attr == 3:
one_hot_attr_label = np.zeros((N_ATTRIBUTES, self.n_class_attr))
one_hot_attr_label[np.arange(N_ATTRIBUTES), attr_label] = 1
return one_hot_attr_label, class_label
else:
return attr_label, class_label
else:
return img, class_label, torch.FloatTensor(attr_label)
else:
return img, class_label


class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
"""Samples elements randomly from a given list of indices for imbalanced dataset
Arguments:
indices (list, optional): a list of indices
num_samples (int, optional): number of samples to draw
"""

def __init__(self, dataset, indices=None):
# if indices is not provided,
# all elements in the dataset will be considered
self.indices = list(range(len(dataset))) \
if indices is None else indices

# if num_samples is not provided,
# draw `len(indices)` samples in each iteration
self.num_samples = len(self.indices)

# distribution of classes in the dataset
label_to_count = {}
for idx in self.indices:
label = self._get_label(dataset, idx)
if label in label_to_count:
label_to_count[label] += 1
else:
label_to_count[label] = 1

# weight for each sample
weights = [1.0 / label_to_count[self._get_label(dataset, idx)]
for idx in self.indices]
self.weights = torch.DoubleTensor(weights)

def _get_label(self, dataset, idx): # Note: for single attribute dataset
return dataset.data[idx]['attribute_label'][0]

def __iter__(self):
idx = (self.indices[i] for i in torch.multinomial(
self.weights, self.num_samples, replacement=True))
return idx

def __len__(self):
return self.num_samples


def load_data(
pkl_paths,
use_attr,
no_img,
batch_size,
uncertain_label=False,
n_class_attr=2,
image_dir='images',
resampling=False,
resol=299,
root_dir='../data/CUB200/',
num_workers=1,
concept_transform=None,
):
"""
Note: Inception needs (299,299,3) images with inputs scaled between -1 and 1
    Loads the data with transformations applied, and upsamples the minority class if there is a class imbalance and a weighted loss is not used
    NOTE: resampling is customized for the first attribute only, so adjust the sampling logic above if necessary
"""
resized_resol = int(resol * 256/224)
is_training = any(['train.pkl' in f for f in pkl_paths])
if is_training:
transform = transforms.Compose([
#transforms.Resize((resized_resol, resized_resol)),
#transforms.RandomSizedCrop(resol),
transforms.ColorJitter(brightness=32/255, saturation=(0.5, 1.5)),
transforms.RandomResizedCrop(resol),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(), #implicitly divides by 255
transforms.Normalize(mean = [0.5, 0.5, 0.5], std = [2, 2, 2])
#transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ], std = [ 0.229, 0.224, 0.225 ]),
])
else:
transform = transforms.Compose([
#transforms.Resize((resized_resol, resized_resol)),
transforms.CenterCrop(resol),
transforms.ToTensor(), #implicitly divides by 255
transforms.Normalize(mean = [0.5, 0.5, 0.5], std = [2, 2, 2])
#transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ], std = [ 0.229, 0.224, 0.225 ]),
])

dataset = CUBDataset(
pkl_file_paths=pkl_paths,
use_attr=use_attr,
no_img=no_img,
uncertain_label=uncertain_label,
image_dir=image_dir,
n_class_attr=n_class_attr,
transform=transform,
root_dir=root_dir,
concept_transform=concept_transform,
)
if is_training:
drop_last = True
shuffle = True
else:
drop_last = False
shuffle = False
if resampling:
sampler = StratifiedSampler(ImbalancedDatasetSampler(dataset), batch_size=batch_size)
loader = DataLoader(dataset, batch_sampler=sampler, num_workers=num_workers)
else:
loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
return loader

def find_class_imbalance(pkl_file, multiple_attr=False, attr_idx=-1):
"""
Calculate class imbalance ratio for binary attribute labels stored in pkl_file
If attr_idx >= 0, then only return ratio for the corresponding attribute id
If multiple_attr is True, then return imbalance ratio separately for each attribute. Else, calculate the overall imbalance across all attributes
"""
imbalance_ratio = []
    with open(os.path.join(BASE_DIR, pkl_file), 'rb') as f:
        data = pickle.load(f)
n = len(data)
n_attr = len(data[0]['attribute_label'])
if attr_idx >= 0:
n_attr = 1
if multiple_attr:
n_ones = [0] * n_attr
total = [n] * n_attr
else:
n_ones = [0]
total = [n * n_attr]
for d in data:
labels = d['attribute_label']
if multiple_attr:
for i in range(n_attr):
n_ones[i] += labels[i]
else:
if attr_idx >= 0:
n_ones[0] += labels[attr_idx]
else:
n_ones[0] += sum(labels)
for j in range(len(n_ones)):
imbalance_ratio.append(total[j]/n_ones[j] - 1)
    if not multiple_attr: # e.g. [9.0] --> [9.0] * n_attr
imbalance_ratio *= n_attr
return imbalance_ratio


if __name__ == '__main__':
# train_loader = load_data([train_data_path], args.use_attr, args.no_img, args.batch_size, args.uncertain_labels,
# image_dir=args.image_dir,
# n_class_attr=args.n_class_attr, resampling=args.resampling)
# val_loader = load_data([val_data_path], args.use_attr, args.no_img, args.batch_size, image_dir=args.image_dir,
# n_class_attr=args.n_class_attr)
    val_loader = load_data(pkl_paths=['val.pkl'], use_attr=True, no_img=False, batch_size=128)
    # load_data already returns a DataLoader, so a batch can be sampled from it directly
    imgs, class_labels, attr_labels = next(iter(val_loader))
    print(imgs.shape, class_labels.shape, attr_labels.shape)
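
For reference, the following is a minimal usage sketch (not part of this commit) showing how the new utilities fit together: it builds train and validation loaders from the pkl splits added under cem/data/CUB200/class_attr_data_10/ and uses find_class_imbalance to weight a concept loss. The import path, the root_dir holding the raw CUB_200_2011 images, and the BCE weighting are assumptions for illustration only.

# Hypothetical usage sketch -- paths and loss wiring are assumptions, not part of this commit.
import torch
from cem.data.CUB200.cub_loader import load_data, find_class_imbalance

DATA_DIR = 'cem/data/CUB200/class_attr_data_10/'  # pkl splits added in this commit

train_loader = load_data(
    pkl_paths=[DATA_DIR + 'train.pkl'],
    use_attr=True,                 # return the binary concept annotations with each image
    no_img=False,                  # keep the images (X -> C -> Y setting)
    batch_size=64,
    root_dir='cem/data/CUB200/',   # assumed location of the raw CUB_200_2011 images
)
val_loader = load_data(
    pkl_paths=[DATA_DIR + 'val.pkl'],
    use_attr=True,
    no_img=False,
    batch_size=64,
    root_dir='cem/data/CUB200/',
)

# One imbalance ratio per attribute; these can re-weight a concept BCE loss
imbalance = find_class_imbalance(DATA_DIR + 'train.pkl', multiple_attr=True)
concept_loss_fn = torch.nn.BCELoss(weight=torch.FloatTensor(imbalance))

imgs, class_labels, concept_labels = next(iter(train_loader))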
82 changes: 82 additions & 0 deletions cem/data/CUB200/data_processing.py
@@ -0,0 +1,82 @@
"""
Make train, val, test datasets based on train_test_split.txt, and by sampling
val_ratio of the official train data to make a validation set.
Each dataset is a list of metadata entries; each entry includes the official image
id, full image path, class label, attribute labels, attribute certainty scores, and
attribute labels calibrated for uncertainty.
Taken from: https://github.com/yewsiang/ConceptBottleneck/blob/master/CUB/data_processing.py
"""
import os
import random
from os import listdir
from os.path import isfile, isdir, join
from collections import defaultdict as ddict


def extract_data(data_dir):
cwd = os.getcwd()
data_path = join(cwd, data_dir + '/images')
val_ratio = 0.2

path_to_id_map = dict() #map from full image path to image id
with open(data_path.replace('images', 'images.txt'), 'r') as f:
for line in f:
items = line.strip().split()
key_str = join(data_path, items[1]).replace('\\', '/')
path_to_id_map[key_str] = int(items[0])

attribute_labels_all = ddict(list) #map from image id to a list of attribute labels
attribute_certainties_all = ddict(list) #map from image id to a list of attribute certainties
attribute_uncertain_labels_all = ddict(list) #map from image id to a list of attribute labels calibrated for uncertainty
# 1 = not visible, 2 = guessing, 3 = probably, 4 = definitely
uncertainty_map = {1: {1: 0, 2: 0.5, 3: 0.75, 4:1}, #calibrate main label based on uncertainty label
0: {1: 0, 2: 0.5, 3: 0.25, 4: 0}}
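    # For example, a positive attribute label (1) annotated as 'probably' (certainty 3) is
    # calibrated to 0.75, while a negative label (0) with the same certainty becomes 0.25.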
with open(join(cwd, data_dir + '/attributes/image_attribute_labels.txt'), 'r') as f:
for line in f:
file_idx, attribute_idx, attribute_label, attribute_certainty = line.strip().split()[:4]
attribute_label = int(attribute_label)
attribute_certainty = int(attribute_certainty)
uncertain_label = uncertainty_map[attribute_label][attribute_certainty]
attribute_labels_all[int(file_idx)].append(attribute_label)
attribute_uncertain_labels_all[int(file_idx)].append(uncertain_label)
attribute_certainties_all[int(file_idx)].append(attribute_certainty)

is_train_test = dict() #map from image id to 0 / 1 (1 = train)
with open(join(cwd, data_dir + '/train_test_split.txt'), 'r') as f:
for line in f:
idx, is_train = line.strip().split()
is_train_test[int(idx)] = int(is_train)
print("Number of train images from official train test split:", sum(list(is_train_test.values())))

train_val_data, test_data = [], []
train_data, val_data = [], []
folder_list = [f for f in listdir(data_path) if isdir(join(data_path, f))]
folder_list.sort() #sort by class index
for i, folder in enumerate(folder_list):
folder_path = join(data_path, folder)
classfile_list = [cf for cf in listdir(folder_path) if (isfile(join(folder_path,cf)) and cf[0] != '.')]
#classfile_list.sort()
for cf in classfile_list:
key_str = join(folder_path, cf).replace('\\', '/')
img_id = path_to_id_map[key_str]
img_path = join(folder_path, cf).replace('\\', '/')
metadata = {'id': img_id, 'img_path': img_path, 'class_label': i,
'attribute_label': attribute_labels_all[img_id], 'attribute_certainty': attribute_certainties_all[img_id],
'uncertain_attribute_label': attribute_uncertain_labels_all[img_id]}
if is_train_test[img_id]:
train_val_data.append(metadata)
# if val_files is not None:
# if img_path in val_files:
# val_data.append(metadata)
# else:
# train_data.append(metadata)
else:
test_data.append(metadata)

random.shuffle(train_val_data)
split = int(val_ratio * len(train_val_data))
train_data = train_val_data[split :]
val_data = train_val_data[: split]
print('Size of train set:', len(train_data))
return train_data, val_data, test_data
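
For reference, a minimal driver for extract_data could look like the sketch below (it is not part of this diff); the CUB_200_2011 data directory, the import path, and the output file names are placeholders chosen to mirror the pkl files bundled with this commit, and any additional attribute filtering used to produce the class_attr_data_10 splits is not shown here.

# Hypothetical driver -- directory names and import path are placeholders, not part of this diff.
import pickle
from os.path import join

from cem.data.CUB200.data_processing import extract_data

if __name__ == '__main__':
    # extract_data expects the CUB_200_2011 folder (with images/, attributes/, and
    # train_test_split.txt) to live under the current working directory
    train_data, val_data, test_data = extract_data('CUB_200_2011')
    out_dir = 'class_attr_data_10'
    for split_data, name in zip([train_data, val_data, test_data], ['train', 'val', 'test']):
        with open(join(out_dir, name + '.pkl'), 'wb') as f:
            pickle.dump(split_data, f)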
