
Commit

Adding CUB data processing.
mateoespinosa committed Sep 19, 2022
1 parent b419f19 commit 5735e3e
Showing 7 changed files with 402 additions and 0 deletions.
5 changes: 5 additions & 0 deletions cem/data/CUB200/__init__.py
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
# @Author: Mateo Espinosa Zarlenga
# @Date: 2022-09-19 18:36:35
# @Last Modified by: Mateo Espinosa Zarlenga
# @Last Modified time: 2022-09-19 18:36:35
Binary file added cem/data/CUB200/class_attr_data_10/test.pkl
Binary file not shown.
Binary file added cem/data/CUB200/class_attr_data_10/train.pkl
Binary file not shown.
Binary file added cem/data/CUB200/class_attr_data_10/val.pkl
Binary file not shown.
310 changes: 310 additions & 0 deletions cem/data/CUB200/cub_loader.py
@@ -0,0 +1,310 @@
"""
General utils for training, evaluation and data loading
Adapted from: https://github.com/yewsiang/ConceptBottleneck/blob/master/CUB/cub_loader.py
"""
import os
import torch
import pickle
import numpy as np
import torchvision.transforms as transforms

from PIL import Image
from torch.utils.data import Dataset, DataLoader

# General
BASE_DIR = ''
N_ATTRIBUTES = 112
N_CLASSES = 200

# Training
UPWEIGHT_RATIO = 9.0
MIN_LR = 0.0001
LR_DECAY_SIZE = 0.1

class Sampler(object):
"""Base class for all Samplers.
Every Sampler subclass has to provide an __iter__ method, providing a way
to iterate over indices of dataset elements, and a __len__ method that
    returns the length of the returned iterator.
"""

def __init__(self, data_source):
pass

def __iter__(self):
raise NotImplementedError

def __len__(self):
raise NotImplementedError

class StratifiedSampler(Sampler):
"""Stratified Sampling
Provides equal representation of target classes in each batch
"""
def __init__(self, class_vector, batch_size):
"""
Arguments
---------
class_vector : torch tensor
a vector of class labels
batch_size : integer
batch_size
"""
self.n_splits = int(class_vector.size(0) / batch_size)
self.class_vector = class_vector

    def gen_sample_array(self):
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            # scikit-learn is an optional dependency that is only needed for stratified sampling
            raise ImportError('scikit-learn is required for StratifiedSampler')

s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
X = torch.randn(self.class_vector.size(0),2).numpy()
y = self.class_vector.numpy()
s.get_n_splits(X, y)

train_index, test_index = next(s.split(X, y))
return np.hstack([train_index, test_index])

def __iter__(self):
return iter(self.gen_sample_array())

def __len__(self):
return len(self.class_vector)

class CUBDataset(Dataset):
"""
Returns a compatible Torch Dataset object customized for the CUB dataset
"""

    def __init__(
        self,
        pkl_file_paths,
        use_attr,
        no_img,
        uncertain_label,
        image_dir,
        n_class_attr,
        root_dir='../data/CUB200/',
        transform=None,
        concept_transform=None,
    ):
        """
        Arguments:
        pkl_file_paths: list of full paths to all the pkl data files
        use_attr: whether to load the attributes (e.g. False for simple finetuning)
        no_img: whether to load the images (e.g. False for an A -> Y model)
        uncertain_label: if True, use the 'uncertain_attribute_label' field (i.e. the label weighted by its uncertainty score, e.g. 1 & 3 (probably) -> 0.75)
        image_dir: default = 'images'. Will be appended to the parent directory
        n_class_attr: number of classes to predict for each attribute. If 3, a separate class is added for "not visible"
        root_dir: directory containing the CUB dataset (default '../data/CUB200/')
        transform: optional image transformation. Default = None, i.e. no extra preprocessing is applied
        concept_transform: optional transformation applied to the attribute (concept) labels
        """
self.data = []
self.is_train = any(["train" in path for path in pkl_file_paths])
if not self.is_train:
assert any([("test" in path) or ("val" in path) for path in pkl_file_paths])
        for file_path in pkl_file_paths:
            with open(file_path, 'rb') as f:
                self.data.extend(pickle.load(f))
self.transform = transform
self.concept_transform = concept_transform
self.use_attr = use_attr
self.no_img = no_img
self.uncertain_label = uncertain_label
self.image_dir = image_dir
self.n_class_attr = n_class_attr
self.root_dir = root_dir

def __len__(self):
return len(self.data)

def __getitem__(self, idx):
img_data = self.data[idx]
img_path = img_data['img_path']
img_path = img_path.replace(
'/juice/scr/scr102/scr/thaonguyen/CUB_supervision/datasets/',
'../data/CUB200/'
)
        # Trim unnecessary path prefixes so that images are resolved relative to root_dir
        try:
            path_idx = img_path.split('/').index('CUB_200_2011')
            # if self.image_dir != 'images':
            #     img_path = '/'.join([self.image_dir] + img_path.split('/')[path_idx+1:])
            #     img_path = img_path.replace('images/', '')
            # else:
            img_path = self.root_dir + '/' + '/'.join(img_path.split('/')[path_idx:])
            img = Image.open(img_path).convert('RGB')
        except (ValueError, FileNotFoundError):
            # Fall back to a directory layout that separates train and test images
            img_path_split = img_path.split('/')
            split = 'train' if self.is_train else 'test'
            img_path = '/'.join(img_path_split[:2] + [split] + img_path_split[2:])
            img = Image.open(img_path).convert('RGB')

class_label = img_data['class_label']
if self.transform:
img = self.transform(img)

if self.use_attr:
if self.uncertain_label:
attr_label = img_data['uncertain_attribute_label']
else:
attr_label = img_data['attribute_label']
if self.concept_transform is not None:
attr_label = self.concept_transform(attr_label)
if self.no_img:
if self.n_class_attr == 3:
one_hot_attr_label = np.zeros((N_ATTRIBUTES, self.n_class_attr))
one_hot_attr_label[np.arange(N_ATTRIBUTES), attr_label] = 1
return one_hot_attr_label, class_label
else:
return attr_label, class_label
else:
return img, class_label, torch.FloatTensor(attr_label)
else:
return img, class_label


class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
"""Samples elements randomly from a given list of indices for imbalanced dataset
Arguments:
indices (list, optional): a list of indices
num_samples (int, optional): number of samples to draw
"""

def __init__(self, dataset, indices=None):
# if indices is not provided,
# all elements in the dataset will be considered
self.indices = list(range(len(dataset))) \
if indices is None else indices

# if num_samples is not provided,
# draw `len(indices)` samples in each iteration
self.num_samples = len(self.indices)

# distribution of classes in the dataset
label_to_count = {}
for idx in self.indices:
label = self._get_label(dataset, idx)
if label in label_to_count:
label_to_count[label] += 1
else:
label_to_count[label] = 1

# weight for each sample
weights = [1.0 / label_to_count[self._get_label(dataset, idx)]
for idx in self.indices]
self.weights = torch.DoubleTensor(weights)

def _get_label(self, dataset, idx): # Note: for single attribute dataset
return dataset.data[idx]['attribute_label'][0]

def __iter__(self):
idx = (self.indices[i] for i in torch.multinomial(
self.weights, self.num_samples, replacement=True))
return idx

def __len__(self):
return self.num_samples


def load_data(
pkl_paths,
use_attr,
no_img,
batch_size,
uncertain_label=False,
n_class_attr=2,
image_dir='images',
resampling=False,
resol=299,
root_dir='../data/CUB200/',
num_workers=1,
concept_transform=None,
):
"""
Note: Inception needs (299,299,3) images with inputs scaled between -1 and 1
    Loads the data with transformations applied, and upsamples the minority class if there is a class imbalance and a weighted loss is not used
    NOTE: resampling is customized for the first attribute only, so adjust the sampling logic above if necessary
"""
resized_resol = int(resol * 256/224)
is_training = any(['train.pkl' in f for f in pkl_paths])
if is_training:
transform = transforms.Compose([
#transforms.Resize((resized_resol, resized_resol)),
#transforms.RandomSizedCrop(resol),
transforms.ColorJitter(brightness=32/255, saturation=(0.5, 1.5)),
transforms.RandomResizedCrop(resol),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(), #implicitly divides by 255
transforms.Normalize(mean = [0.5, 0.5, 0.5], std = [2, 2, 2])
#transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ], std = [ 0.229, 0.224, 0.225 ]),
])
else:
transform = transforms.Compose([
#transforms.Resize((resized_resol, resized_resol)),
transforms.CenterCrop(resol),
transforms.ToTensor(), #implicitly divides by 255
transforms.Normalize(mean = [0.5, 0.5, 0.5], std = [2, 2, 2])
#transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ], std = [ 0.229, 0.224, 0.225 ]),
])

dataset = CUBDataset(
pkl_file_paths=pkl_paths,
use_attr=use_attr,
no_img=no_img,
uncertain_label=uncertain_label,
image_dir=image_dir,
n_class_attr=n_class_attr,
transform=transform,
root_dir=root_dir,
concept_transform=concept_transform,
)
if is_training:
drop_last = True
shuffle = True
else:
drop_last = False
shuffle = False
if resampling:
sampler = StratifiedSampler(ImbalancedDatasetSampler(dataset), batch_size=batch_size)
loader = DataLoader(dataset, batch_sampler=sampler, num_workers=num_workers)
else:
loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
return loader

def find_class_imbalance(pkl_file, multiple_attr=False, attr_idx=-1):
"""
Calculate class imbalance ratio for binary attribute labels stored in pkl_file
If attr_idx >= 0, then only return ratio for the corresponding attribute id
If multiple_attr is True, then return imbalance ratio separately for each attribute. Else, calculate the overall imbalance across all attributes
"""
imbalance_ratio = []
    with open(os.path.join(BASE_DIR, pkl_file), 'rb') as f:
        data = pickle.load(f)
n = len(data)
n_attr = len(data[0]['attribute_label'])
if attr_idx >= 0:
n_attr = 1
if multiple_attr:
n_ones = [0] * n_attr
total = [n] * n_attr
else:
n_ones = [0]
total = [n * n_attr]
for d in data:
labels = d['attribute_label']
if multiple_attr:
for i in range(n_attr):
n_ones[i] += labels[i]
else:
if attr_idx >= 0:
n_ones[0] += labels[attr_idx]
else:
n_ones[0] += sum(labels)
for j in range(len(n_ones)):
imbalance_ratio.append(total[j]/n_ones[j] - 1)
    if not multiple_attr: # e.g. [9.0] --> [9.0] * n_attr
imbalance_ratio *= n_attr
return imbalance_ratio


if __name__ == '__main__':
# train_loader = load_data([train_data_path], args.use_attr, args.no_img, args.batch_size, args.uncertain_labels,
# image_dir=args.image_dir,
# n_class_attr=args.n_class_attr, resampling=args.resampling)
# val_loader = load_data([val_data_path], args.use_attr, args.no_img, args.batch_size, image_dir=args.image_dir,
# n_class_attr=args.n_class_attr)
    val_loader = load_data(pkl_paths=['val.pkl'], use_attr=True, no_img=False, batch_size=128)
    # load_data already returns a DataLoader, so a batch can be sampled from it directly
    imgs, class_labels, attr_labels = next(iter(val_loader))
    print(imgs.shape, class_labels.shape, attr_labels.shape)
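
For reference, the following is a minimal usage sketch (not part of this commit) showing how the new utilities fit together: it builds train and validation loaders from the pkl splits added under cem/data/CUB200/class_attr_data_10/ and uses find_class_imbalance to weight a concept loss. The import path, the root_dir holding the raw CUB_200_2011 images, and the BCE weighting are assumptions for illustration only.

# Hypothetical usage sketch -- paths and loss wiring are assumptions, not part of this commit.
import torch
from cem.data.CUB200.cub_loader import load_data, find_class_imbalance

DATA_DIR = 'cem/data/CUB200/class_attr_data_10/'  # pkl splits added in this commit

train_loader = load_data(
    pkl_paths=[DATA_DIR + 'train.pkl'],
    use_attr=True,                 # return the binary concept annotations with each image
    no_img=False,                  # keep the images (X -> C -> Y setting)
    batch_size=64,
    root_dir='cem/data/CUB200/',   # assumed location of the raw CUB_200_2011 images
)
val_loader = load_data(
    pkl_paths=[DATA_DIR + 'val.pkl'],
    use_attr=True,
    no_img=False,
    batch_size=64,
    root_dir='cem/data/CUB200/',
)

# One imbalance ratio per attribute; these can re-weight a concept BCE loss
imbalance = find_class_imbalance(DATA_DIR + 'train.pkl', multiple_attr=True)
concept_loss_fn = torch.nn.BCELoss(weight=torch.FloatTensor(imbalance))

imgs, class_labels, concept_labels = next(iter(train_loader))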
82 changes: 82 additions & 0 deletions cem/data/CUB200/data_processing.py
@@ -0,0 +1,82 @@
"""
Make train, val, test datasets based on train_test_split.txt, and by sampling
val_ratio of the official train data to make a validation set.
Each dataset is a list of metadata entries; each entry includes the official image
id, full image path, class label, attribute labels, attribute certainty scores, and
attribute labels calibrated for uncertainty.
Taken from: https://github.com/yewsiang/ConceptBottleneck/blob/master/CUB/data_processing.py
"""
import os
import random
from os import listdir
from os.path import isfile, isdir, join
from collections import defaultdict as ddict


def extract_data(data_dir):
cwd = os.getcwd()
data_path = join(cwd, data_dir + '/images')
val_ratio = 0.2

path_to_id_map = dict() #map from full image path to image id
with open(data_path.replace('images', 'images.txt'), 'r') as f:
for line in f:
items = line.strip().split()
key_str = join(data_path, items[1]).replace('\\', '/')
path_to_id_map[key_str] = int(items[0])

attribute_labels_all = ddict(list) #map from image id to a list of attribute labels
attribute_certainties_all = ddict(list) #map from image id to a list of attribute certainties
attribute_uncertain_labels_all = ddict(list) #map from image id to a list of attribute labels calibrated for uncertainty
# 1 = not visible, 2 = guessing, 3 = probably, 4 = definitely
uncertainty_map = {1: {1: 0, 2: 0.5, 3: 0.75, 4:1}, #calibrate main label based on uncertainty label
0: {1: 0, 2: 0.5, 3: 0.25, 4: 0}}
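    # For example, a positive attribute label (1) annotated as 'probably' (certainty 3) is
    # calibrated to 0.75, while a negative label (0) with the same certainty becomes 0.25.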
with open(join(cwd, data_dir + '/attributes/image_attribute_labels.txt'), 'r') as f:
for line in f:
file_idx, attribute_idx, attribute_label, attribute_certainty = line.strip().split()[:4]
attribute_label = int(attribute_label)
attribute_certainty = int(attribute_certainty)
uncertain_label = uncertainty_map[attribute_label][attribute_certainty]
attribute_labels_all[int(file_idx)].append(attribute_label)
attribute_uncertain_labels_all[int(file_idx)].append(uncertain_label)
attribute_certainties_all[int(file_idx)].append(attribute_certainty)

is_train_test = dict() #map from image id to 0 / 1 (1 = train)
with open(join(cwd, data_dir + '/train_test_split.txt'), 'r') as f:
for line in f:
idx, is_train = line.strip().split()
is_train_test[int(idx)] = int(is_train)
print("Number of train images from official train test split:", sum(list(is_train_test.values())))

train_val_data, test_data = [], []
train_data, val_data = [], []
folder_list = [f for f in listdir(data_path) if isdir(join(data_path, f))]
folder_list.sort() #sort by class index
for i, folder in enumerate(folder_list):
folder_path = join(data_path, folder)
classfile_list = [cf for cf in listdir(folder_path) if (isfile(join(folder_path,cf)) and cf[0] != '.')]
#classfile_list.sort()
for cf in classfile_list:
key_str = join(folder_path, cf).replace('\\', '/')
img_id = path_to_id_map[key_str]
img_path = join(folder_path, cf).replace('\\', '/')
metadata = {'id': img_id, 'img_path': img_path, 'class_label': i,
'attribute_label': attribute_labels_all[img_id], 'attribute_certainty': attribute_certainties_all[img_id],
'uncertain_attribute_label': attribute_uncertain_labels_all[img_id]}
if is_train_test[img_id]:
train_val_data.append(metadata)
# if val_files is not None:
# if img_path in val_files:
# val_data.append(metadata)
# else:
# train_data.append(metadata)
else:
test_data.append(metadata)

random.shuffle(train_val_data)
split = int(val_ratio * len(train_val_data))
train_data = train_val_data[split :]
val_data = train_val_data[: split]
print('Size of train set:', len(train_data))
return train_data, val_data, test_data
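
For reference, a minimal driver for extract_data could look like the sketch below (it is not part of this diff); the CUB_200_2011 data directory, the import path, and the output file names are placeholders chosen to mirror the pkl files bundled with this commit, and any additional attribute filtering used to produce the class_attr_data_10 splits is not shown here.

# Hypothetical driver -- directory names and import path are placeholders, not part of this diff.
import pickle
from os.path import join

from cem.data.CUB200.data_processing import extract_data

if __name__ == '__main__':
    # extract_data expects the CUB_200_2011 folder (with images/, attributes/, and
    # train_test_split.txt) to live under the current working directory
    train_data, val_data, test_data = extract_data('CUB_200_2011')
    out_dir = 'class_attr_data_10'
    for split_data, name in zip([train_data, val_data, test_data], ['train', 'val', 'test']):
        with open(join(out_dir, name + '.pkl'), 'wb') as f:
            pickle.dump(split_data, f)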
