"""split_fold.py

Split the VGGSound classes into folds and write the result to a JSON file.

The script counts how many training samples each class has, then deals the
classes out to NUM_FOLDS folds using a seeded shuffle so the split is
reproducible.
"""
from pathlib import Path
import pandas as pd
import json
import numpy as np
# Root directory of the VGGSound data; all inputs below are resolved against it.
VGGSOUND_PATH = Path("./data/vggsound")
# JSON file mapping class_name -> integer target.
MAPPING_FILE = VGGSOUND_PATH / 'features/vggsound_mapping.json'
# CSV file that is only referenced by the disabled category-wise split.
CATEGORY_FILE = VGGSOUND_PATH / 'features/stat.csv'
# HDF5 file whose 'label' dataset is read for the class-wise split.
INPUT_FILE = VGGSOUND_PATH / 'features/vggsound_train_index_classname.hdf5'

# Load the class mapping. The handle is not called `input` so the builtin stays
# usable, and the encoding is explicit so decoding does not depend on the locale.
with open(MAPPING_FILE, 'r', encoding='utf-8') as mapping_file:
    # class_name -> integer target
    class2int = json.load(mapping_file)
# Alternative split (disabled): one fold per category listed in CATEGORY_FILE.
# OUTPUT_FILE = 'category_wise.json'
# df = pd.read_csv(CATEGORY_FILE, sep=',').set_index("class")
# class2category = df.category.to_dict()
# output_json = {}
# category2classes = {}
# for _class, _category in class2category.items():
#     category2classes.setdefault(_category, []).append(_class)
#     output_json.setdefault('all_classes', []).append(_class)
# output_json['folds'] = category2classes
# with open(OUTPUT_FILE, 'w') as output:
#     json.dump(output_json, output)
# Randomly split all classes into folds.
OUTPUT_FILE = 'class_wise.json'
NUM_FOLDS = 5

import h5py
from collections import Counter

# Read the whole 'label' dataset into memory and turn every entry into str.
# Each entry is passed through .decode(), i.e. the labels are expected to be
# stored as bytes. The handle is not called `input` so the builtin is not shadowed.
with h5py.File(INPUT_FILE, 'r') as label_store:
    labels = [raw_label.decode() for raw_label in label_store['label'][:]]

# Number of samples per class. Counter is a dict subclass that keeps keys in
# first-seen order, exactly like the manual counting loop it replaces.
class_counter = Counter(labels)

# (class_name, sample_count) pairs from the most to the least frequent class.
# sorted() is stable, so classes with equal counts keep their first-seen order;
# the fold assignment depends on this order, so it must not change.
label_nums = sorted(class_counter.items(), key=lambda item: item[1], reverse=True)
# Deal the classes out to folds, walking label_nums in its existing order.
# Fold indices 0..NUM_FOLDS-1 are handed out in a shuffled order and reshuffled
# each time all of them have been used, so every round gives each fold exactly
# one class. The fixed seed makes the assignment reproducible.
#
# Disabled alternative: plain round-robin without shuffling.
# seq = 0
# for label, num in label_nums:
#     folds.setdefault(f'fold{seq}', []).append([label, num])
#     seq = (seq + 1) % NUM_FOLDS
rng = np.random.RandomState(seed=0)
remaining_fold_ids = list(range(NUM_FOLDS))
rng.shuffle(remaining_fold_ids)

folds = {}
for class_name, _count in label_nums:
    if len(remaining_fold_ids) == 0:
        # Every fold received a class in this round; start a new shuffled round.
        remaining_fold_ids = list(range(NUM_FOLDS))
        rng.shuffle(remaining_fold_ids)
    # Fold keys are 1-based: 'fold1' .. f'fold{NUM_FOLDS}'.
    fold_key = f'fold{remaining_fold_ids.pop() + 1}'
    if fold_key not in folds:
        folds[fold_key] = []
    folds[fold_key].append(class_name)
# Persist the split: every class name seen in the labels, plus the mapping
# from fold name to the classes assigned to that fold.
output_json = {}
output_json['all_classes'] = list(class_counter)
output_json['folds'] = folds

with open(OUTPUT_FILE, 'w') as sink:
    json.dump(output_json, sink)