# make_dataset.py
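# Overview (inferred from the code below): this script generates a Korean math
# word-problem dataset. It samples problem items from YAML templates using
# Simulator, converts each item's code template into executable Python via the
# solver module, runs that code with Executor to validate the printed answer,
# and writes problemsheet.json, answersheet.json, dataset.json, dataset.yaml,
# and train/dev/test split lists into the output directory.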
from argparse import ArgumentParser
from collections import defaultdict
from os import environ
from pathlib import Path
from transformers import AutoTokenizer
from common.model.const import DEF_ENCODER
from common.sys.convert import string_to_text_instance
from common.sys.key import QUESTION, ANSWER, EQUATION, EXECUTION
from common.sys.pattern import *
from evaluate import Executor
from simulate import Simulator
from yaml import dump as yaml_dump
from json import dump as json_save
from random import seed
from solver import python_code_to_executions, execution_to_python_code
def read_arguments():
    parser = ArgumentParser()
    parser.add_argument('--template', '-template', '-t', type=str, required=False, default="./resources/tot_temp",
                        help='Root directory of template YAML files')
    parser.add_argument('--vocab', '-vocab', '-v', type=str, required=False, default="./resources/vocab.yaml",
                        help='Path to the vocabulary YAML file')
    parser.add_argument('--num-item', '--num-sample', '-item', '-sample', '-n', type=int, required=False, default=10,
                        help='Number of items generated for each template file')
    parser.add_argument('--output', '-out', '-o', type=str, required=False, default="./resources/new_dataset",
                        help='Root directory for saving output dataset files')
    parser.add_argument('--seed', '-seed', '-s', type=int, default=1029,
                        help='Random seed for generating items')
    parser.add_argument('--time-limit', '-limit', '-l', type=float, default=0.5,
                        help='Time limit for evaluating python code')
    parser.add_argument('--tokenizer', '-tokenizer', '-z', type=str, default=DEF_ENCODER,
                        help='Name or path of the pre-trained tokenizer')
    return parser.parse_args()
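# Example invocation (values are illustrative; every flag is optional and falls
# back to the defaults defined in read_arguments() above):
#   python make_dataset.py --template ./resources/tot_temp --vocab ./resources/vocab.yaml \
#       --num-item 100 --output ./resources/new_dataset --seed 1029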
if __name__ == '__main__':
    # Read command-line arguments: template root, vocab path, number of items, output path, seed, etc.
    args = read_arguments()
    environ['DEBUG'] = 'True'
    # Create a simulator of type simulate.Simulator
    simulator = Simulator()
    # Register the random seed using simulator.set_seed()
    simulator.set_seed(args.seed)
    seed(args.seed)
    # Create an executor to verify the generated dataset (honoring the --time-limit argument)
    executor = Executor(time_limit=args.time_limit)
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    # Register templates and vocabulary using simulator.load_templates() / simulator.load_vocab()
    simulator.load_templates(args.template)
    simulator.load_vocab(args.vocab)
    try:
        # Get generated items using simulator.generate()
        problems = {}
        splits = defaultdict(list)
        # Convert each item's code_template into executable python code and validate it
        for template in simulator.generate(args.num_item):
            print('Converting %s' % template[0].template)
            for i, item in enumerate(template):
                text = string_to_text_instance(item.text, tokenizer)
                execution = python_code_to_executions(item.code_template)
                raw_code = execution_to_python_code(execution, text.word_info[0])
                item.code, item.executed = executor.run(raw_code)
                item.answer = item.executed
                item.execution = [x.to_list() for x in execution]
                assert ALL_KOREAN_PATTERN.match(item.code) is None, \
                    'The code must not contain Korean characters (Template %s).\n\tExecuted code:\n%s' % (item.template, item.code)
                assert '\n' not in item.executed, \
                    'There must be exactly one answer (Template %s), but %s answers were printed: %s' % \
                    (item.template, item.executed.count('\n') + 1, item.executed.split('\n'))
                if NUMBER_PATTERN.fullmatch(item.executed):
                    assert '.' not in item.executed or UNDER_TWO_DIGIT.fullmatch(item.executed) is not None, \
                        'The printed answer "%s" does not follow the output format required by the competition ' \
                        '(an integer, or at most two digits after the decimal point) (Template %s).' % \
                        (item.executed, item.template)
                elif FRACTION_PATTERN.fullmatch(item.executed) is None:
                    assert ALL_KOREAN_PATTERN.fullmatch(item.executed) or VARIABLE_PATTERN.fullmatch(item.executed) is not None, \
                        'The printed answer "%s" does not follow the output format required by the competition ' \
                        '(textual answers must contain only Korean characters, without other symbols) (Template %s).' % \
                        (item.executed, item.template)
                assert item.answer == item.executed, \
                    'The expected answer "%s" does not match the computed answer "%s" (Template %s)!\n' \
                    '\tProblem: "%s"\n\tTokenized: "%s"\n\tExecuted code:\n%s' % \
                    (item.answer, item.executed, item.template, item.text, tokenizer.decode(text.tokens), item.code)
                key = str(len(problems))
                problems[key] = item
                # Assign items 8:1:1 to train/dev/test based on their index within the template
                splits['train' if i % 10 < 8 else ('dev' if i % 10 == 8 else 'test')].append(key)
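        # For reference, each entry written below has roughly this shape; the actual
        # JSON key names come from the constants in common.sys.key (QUESTION, ANSWER,
        # EQUATION, EXECUTION) and are therefore not spelled out here:
        #   problemsheet.json : {"<id>": {QUESTION: "<problem text>"}}
        #   answersheet.json  : {"<id>": {ANSWER: "<answer>", EQUATION: "<python code>"}}
        #   dataset.json      : {"<id>": {QUESTION, ANSWER, EQUATION, EXECUTION}}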
        # Store the generated items under the output directory (args.output)
        output = Path(args.output)
        experiments = output / 'split'
        if not experiments.exists():
            experiments.mkdir(parents=True)
        # (1) problemsheet.json: problem text only
        with (output / 'problemsheet.json').open('w+t', encoding='UTF-8') as fp:
            obj_to_write = {str(key): {QUESTION: prob.text}
                            for key, prob in problems.items()}
            json_save(obj_to_write, fp)
        # (2) answersheet.json: answer and equation (code)
        with (output / 'answersheet.json').open('w+t', encoding='UTF-8') as fp:
            obj_to_write = {str(key): {ANSWER: prob.answer, EQUATION: prob.code}
                            for key, prob in problems.items()}
            json_save(obj_to_write, fp)
        # (3) dataset.json: full record including the execution trace
        with (output / 'dataset.json').open('w+t', encoding='UTF-8') as fp:
            obj_to_write = {str(key): {QUESTION: prob.text, ANSWER: prob.answer,
                                       EQUATION: prob.code, EXECUTION: prob.execution}
                            for key, prob in problems.items()}
            json_save(obj_to_write, fp, ensure_ascii=False)
        # (4) split files: one problem id per line for each of train/dev/test
        for name, split in splits.items():
            with (experiments / name).open('w+t', encoding='UTF-8') as fp:
                fp.writelines([line + '\n' for line in split])
        # (5) dataset.yaml: human-readable dump of the full dataset
        with (output / 'dataset.yaml').open('w+t', encoding='UTF-8') as fp:
            obj_to_write = [{QUESTION: prob.text, ANSWER: prob.answer,
                             EQUATION: prob.code, 'id': key, 'template': prob.template}
                            for key, prob in problems.items()]
            yaml_dump(obj_to_write, fp, allow_unicode=True, default_style='|', line_break=True)
    finally:
        # Finalize (shut down) the executor
        executor.close()