-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocess.py
59 lines (48 loc) · 1.84 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import csv
import json
import configargparse
def json2csv(json_fname, csv_fname, args):
    """Convert a JSON file of records into a CSV file.

    The JSON document is iterated as a sequence of dict-like records, each
    providing the keys ``sentence``, ``label``, ``ent1``, ``ent2`` and
    ``id``.  Every sentence is split on whitespace, truncated to
    ``args.sent_max_len`` words, re-joined with single spaces and, when
    ``args.lower`` is true, lowercased.  The processed sentence is written
    to both the ``input`` and ``show_inp`` columns; ``label`` is written to
    the ``tgt`` column.

    Args:
        json_fname: Path of the JSON file to read.
        csv_fname: Path of the CSV file to write (overwritten if present).
        args: Object exposing ``sent_max_len`` (int) and ``lower`` (bool).

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Fixed column order.  Defined up front so the header can still be
    # written when the JSON file holds no records at all.
    fieldnames = ('tgt', 'input', 'show_inp', 'ent1', 'ent2', 'id')

    with open(json_fname, encoding='utf-8') as f:
        data = json.load(f)

    csv_data = []
    for line in data:
        words = line['sentence'].split()[:args.sent_max_len]
        sentence = ' '.join(words)
        if args.lower:
            sentence = sentence.lower()
        csv_data.append({
            'tgt': line['label'],
            'input': sentence,
            'show_inp': sentence,
            'ent1': line['ent1'],
            'ent2': line['ent2'],
            'id': line['id'],
        })

    # newline='' lets the csv module control line endings itself; without
    # it, platforms that translate '\n' would emit blank rows.
    with open(csv_fname, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(csv_data)

    print('[Info] Writing {} data to {}'.format(len(csv_data), csv_fname))
def get_args(argv=None):
    """Build the preprocessing option parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) the parser falls back to the process command
            line, exactly as before this parameter existed.

    Returns:
        The parsed namespace with the attributes ``lower``,
        ``sent_max_len``, ``tokenize`` and ``data_dir``.
    """
    parser = configargparse.ArgumentParser(
        description='Options for preprocessing')
    # store_true: passing -lower turns lowercasing ON (default is off).
    parser.add_argument('-lower', action='store_true', default=False,
                        help='lowercase the sentences '
                             '(original casing is kept by default)')
    parser.add_argument('-sent_max_len', default=100, type=int,
                        help='the maximum number of words allowed in a sentence')
    # store_false: passing -tokenize sets args.tokenize to False.
    # NOTE(review): args.tokenize is not read in the visible part of this
    # file -- confirm whether a consumer exists elsewhere.
    parser.add_argument('-tokenize', action='store_false', default=True,
                        help='disable tokenization of the sentences '
                             '(tokenization is enabled by default)')
    parser.add_argument('-data_dir', default='data/re_semeval/', type=str,
                        help='path to load data from')
    args = parser.parse_args(argv)
    return args
def main():
    """Convert the train, valid and test JSON files to CSV.

    For each split name, ``<split>.json`` inside the configured data
    directory is converted into ``<split>.csv`` in that same directory.
    """
    options = get_args()
    for split in ('train', 'valid', 'test'):
        source_path = os.path.join(options.data_dir, split + '.json')
        target_path = os.path.join(options.data_dir, split + '.csv')
        json2csv(source_path, target_path, options)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()