-
Notifications
You must be signed in to change notification settings - Fork 3
/
make_dataset_for_autoencoder.py
102 lines (83 loc) · 3.74 KB
/
make_dataset_for_autoencoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import random
import json
from argparse import ArgumentParser
from operator import itemgetter
import numpy as np
import pandas as pd
from babi_tools.lib.babi import load_dataset
from babi_tools.ood_augmentation import load_ood
from utils.preprocessing import load_hcn_json
# Fix both RNG seeds at import time so dataset sampling is reproducible
# across runs (numpy is used for the OOD sample in main()).
random.seed(273)
np.random.seed(273)
def configure_argument_parser():
    """Build the CLI parser for autoencoder dataset generation.

    Positional arguments are the source and destination folders; flags
    control dataset format, utterance de-duplication, and whether the
    mixed IND/OOD eval set is produced.
    """
    parser = ArgumentParser('Autoencoder data generation from a bAbI Dialog Task dataset')
    parser.add_argument('babi_folder')
    parser.add_argument('result_folder')
    parser.add_argument(
        '--no_ood_evalset',
        action='store_true',
        help='Skip making an additional evalset with IND/OOD turns')
    parser.add_argument('--format', type=str, default='babi', help='babi/icassp')
    parser.add_argument('--unique_utterances', action='store_true', default=False)
    return parser
def load_icassp_dataset(in_folder):
    """Load ICASSP-format dialogue splits from ``in_folder``.

    Looks for ``<split>.json`` files among the known split names and
    returns a list with one entry per split found; each entry is a list
    of ``(index, dialogue)`` pairs.

    Args:
        in_folder: folder containing the per-split JSON files.

    Returns:
        A list of exactly 3 splits (either the plain or the OOD-augmented
        triple, depending on which files exist).

    Raises:
        ValueError: if the number of splits found is not exactly 3.
    """
    result = []
    for split_name in ['train', 'dev', 'test', 'train_ood', 'dev_ood', 'test_ood']:
        # Hoisted: the original computed this join twice per iteration.
        split_path = os.path.join(in_folder, '{}.json'.format(split_name))
        if not os.path.exists(split_path):
            continue
        # NOTE(review): load_hcn_json is a project helper assumed to return a
        # dict with a 'dialogs' list -- confirm against utils.preprocessing.
        split_dialogues = list(enumerate(load_hcn_json(split_path)['dialogs']))
        result.append(split_dialogues)
    # Was `assert len(result) == 3`: an assert is stripped under `python -O`,
    # so raise explicitly to keep the sanity check in optimized runs too.
    if len(result) != 3:
        raise ValueError(
            'expected exactly 3 dataset splits in {}, found {}'.format(in_folder, len(result)))
    return result
def load_icassp_json(in_dataset_file):
    """Read an ICASSP-format JSON file into per-dialogue turn lists.

    Each returned dialogue is a list of ``{'text', 'agent'}`` dicts; turns
    without an 'input' field or with whitespace-only text are dropped.
    """
    with open(in_dataset_file) as dataset_in:
        data = json.load(dataset_in)
    dialogues = []
    for dialogue in data['dialogs']:
        turns = []
        for turn_idx, turn in enumerate(dialogue['turns']):
            if 'input' not in turn or not len(turn['input']['text'].strip()):
                continue
            turns.append({
                'text': turn['input']['text'],
                # even-indexed turns belong to the system, odd-indexed to the user
                'agent': 'sys' if turn_idx % 2 == 0 else 'user',
            })
        dialogues.append(turns)
    return dialogues
def save_txt(in_lines, in_dst_file_name):
    """Write each element of ``in_lines`` to ``in_dst_file_name``, one per line.

    Elements are converted with str() and the file is UTF-8 encoded, matching
    the original print()-based implementation.
    """
    with open(in_dst_file_name, 'w', encoding='utf-8') as lines_out:
        lines_out.writelines('{}\n'.format(line) for line in in_lines)
def extract_utterances(in_dialogues, unique=False):
    """Collect the 'input' field of every turn across all dialogues.

    Args:
        in_dialogues: iterable of ``(name, dialogue)`` pairs where each
            dialogue is a dict with a 'turns' list; every turn must have
            an 'input' key.
        unique: if True, de-duplicate utterances.

    Returns:
        A list of utterances in dialogue order; when ``unique`` is set,
        the first occurrence of each utterance is kept.
    """
    utterances = []
    for _dialogue_name, dialogue in in_dialogues:
        utterances.extend(map(itemgetter('input'), dialogue['turns']))
    if unique:
        # Was `list(set(utterances))`, whose order varies with PYTHONHASHSEED
        # and so made the output files nondeterministic despite the fixed RNG
        # seeds at module top. dict.fromkeys dedupes with stable,
        # first-occurrence order.
        utterances = list(dict.fromkeys(utterances))
    return utterances
def main(in_babi_folder, in_result_folder, in_args):
    """Generate autoencoder text datasets from a dialogue corpus.

    Writes trainset.txt / devset.txt / testset.txt into ``in_result_folder``
    and, unless ``--no_ood_evalset`` was given, an ``evalset.json`` mixing
    in-domain test utterances (label 1) with sampled OOD utterances (label 0).

    Args:
        in_babi_folder: source dataset folder.
        in_result_folder: output folder; created if it does not exist.
        in_args: parsed CLI namespace (reads .format, .unique_utterances,
            .no_ood_evalset).
    """
    # Pick the loader by dataset format; both are expected to yield three
    # splits of (index, dialogue) pairs -- presumably train/dev/test.
    if in_args.format == 'babi':
        dialogues = load_dataset(in_babi_folder, 'task6-dstc2')
    else:
        dialogues = load_icassp_dataset(in_babi_folder)
    train, dev, test = list(map(lambda x: extract_utterances(x, in_args.unique_utterances),
                                dialogues))
    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)
    save_txt(train, os.path.join(in_result_folder, 'trainset.txt'))
    save_txt(dev, os.path.join(in_result_folder, 'devset.txt'))
    save_txt(test, os.path.join(in_result_folder, 'testset.txt'))
    if in_args.no_ood_evalset:
        return
    # Flatten OOD utterances across all categories returned by load_ood().
    # NOTE(review): assumes load_ood() returns a dict of category -> list of
    # utterances -- confirm against babi_tools.ood_augmentation.
    ood = []
    for key, values in load_ood().items():
        ood += values
    # NOTE(review): list(set(...)) ordering depends on PYTHONHASHSEED, so the
    # eval-set composition is not fully reproducible despite the fixed seeds.
    unique_test = list(set(test))
    unique_ood = list(set(ood))
    # Balanced eval set: every unique IND test utterance (label 1) plus an
    # equal-sized random sample (with replacement) of OOD utterances (label 0).
    eval_set = np.concatenate([unique_test,
                               np.random.choice(unique_ood, size=len(unique_test))])
    eval_df = pd.DataFrame({'utterance': eval_set,
                            'label': np.concatenate([np.ones([len(unique_test)], dtype=np.int32),
                                                     np.zeros([len(unique_test)], dtype=np.int32)])})
    eval_df.to_json(os.path.join(in_result_folder, 'evalset.json'))
if __name__ == '__main__':
    # Parse CLI arguments and run dataset generation end to end.
    cli_args = configure_argument_parser().parse_args()
    main(cli_args.babi_folder, cli_args.result_folder, cli_args)