-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquestion_preprocess.py
75 lines (60 loc) · 2.66 KB
/
question_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import torch
import numpy as np
from konlpy.tag import Mecab
import preprocess.datautils.utils as utils
def process_question_multichoice(q_dic: dict):
    """Encode one multiple-choice question dict into model-ready arrays.

    Args:
        q_dic: dict with keys 'question' (raw question string) and
            'answers' (list of candidate answer strings).

    Returns:
        The dict produced by ``multichoice_encoding_data`` (padded encoded
        question/candidates, their lengths, answer index, glove placeholder).
    """
    question = q_dic['question']
    answer_cand = np.asarray(q_dic['answers'])
    # Explicit UTF-8: the vocab holds Korean tokens, so relying on the
    # platform default encoding (e.g. cp949/cp1252 on Windows) can fail.
    with open('./data/video-narr/video-narr_vocab.json', 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    # correct-answer index 0 is a placeholder here (inference-time call —
    # the true label is not known from q_dic).
    return multichoice_encoding_data(vocab, question, 0, answer_cand)
def multichoice_encoding_data(vocab, question, correct_idx, answer_candidate):
    """Tokenize and encode a question and its answer candidates, with padding.

    Args:
        vocab: vocab dict whose 'question_answer_token_to_idx' mapping must
            contain a '<NULL>' padding token.
        question: raw question string (Korean; tokenized with Mecab morphs).
        correct_idx: index of the correct answer within ``answer_candidate``.
        answer_candidate: iterable of candidate answer strings.

    Returns:
        dict with int32 numpy arrays: 'question' (1, max_q_len),
        'question_len' (1,), 'ans_candidate' (1, n_cand, max_a_len),
        'ans_candidate_len' (1, n_cand), plus 'answer' (the correct index)
        and 'glove' (always None — embeddings are not built here).
    """
    print('Encoding data')
    morphs = Mecab().morphs

    questions_encoded = []
    questions_len = []
    all_answer_candidate_encoded = []
    all_answer_candidate_lens = []

    # Encode the single question.
    question_tokens = morphs(question)
    question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
    questions_encoded.append(question_encoded)
    questions_len.append(len(question_encoded))

    # BUG FIX: the original set `answer = 0; correct_answer = answer`, so
    # correct_idx was silently ignored and 'answer' was always 0 (the local
    # `answer` was then clobbered as the loop variable below). The visible
    # caller passes 0, so this change is backward-compatible.
    correct_answer = correct_idx

    # Encode every answer candidate.
    candidates_encoded = []
    candidates_len = []
    for candidate in answer_candidate:
        candidate_tokens = morphs(candidate)
        candidate_encoded = utils.encode(candidate_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
        candidates_encoded.append(candidate_encoded)
        candidates_len.append(len(candidate_encoded))
    all_answer_candidate_encoded.append(candidates_encoded)
    all_answer_candidate_lens.append(candidates_len)

    null_idx = vocab['question_answer_token_to_idx']['<NULL>']

    # Pad encoded questions to the longest one so they form a rectangular array.
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(null_idx)
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    # Pad encoded answer candidates to the longest candidate overall.
    max_answer_candidate_length = max(
        max(len(x) for x in candidate) for candidate in all_answer_candidate_encoded
    )
    for ans_cands in all_answer_candidate_encoded:
        for ans in ans_cands:
            while len(ans) < max_answer_candidate_length:
                ans.append(null_idx)
    all_answer_candidate_encoded = np.asarray(all_answer_candidate_encoded, dtype=np.int32)
    all_answer_candidate_lens = np.asarray(all_answer_candidate_lens, dtype=np.int32)
    print(all_answer_candidate_encoded.shape)

    glove_matrix = None  # GloVe embeddings are not produced in this path
    return {
        'question': questions_encoded,
        'question_len': questions_len,
        'ans_candidate': all_answer_candidate_encoded,
        'ans_candidate_len': all_answer_candidate_lens,
        'answer': correct_answer,
        'glove': glove_matrix,
    }