# word_dict_gen.py
import os
import json_lines
import re
from keras.preprocessing.text import Tokenizer
# Contextual embedding of symbols
texts = [] # list of text samples
id_list = []
question_list = []
label_list = []
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
TEXT_DATA_DIR = os.path.abspath('.') + "/data/pararule"
# TEXT_DATA_DIR = "D:\\AllenAI\\20_newsgroup"
Str = '.jsonl'
CONTEXT_TEXTS = []
#test_str = 'test'
meta_str = 'meta'
# Walk the data directory; each sub-directory is treated as one label.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            # Only read .jsonl files, skipping the meta-* variants.
            if Str in fpath:
                # if test_str not in fpath:
                if meta_str not in fpath:
                    with open(fpath) as f:
                        for l in json_lines.reader(f):
                            # if l["id"] not in id_list:
                            id_list.append(l["id"])
                            questions = l["questions"]
                            # Strip newlines and punctuation, then collapse whitespace.
                            context = l["context"].replace("\n", " ").replace(".", "")
                            context = context.replace(",", "")
                            context = context.replace("!", "")
                            context = re.sub(r'\s+', ' ', context)
                            CONTEXT_TEXTS.append(context)
                            # for i in range(len(questions)):
                            #     text = questions[i]["text"]
                            #     label = questions[i]["label"]
                            #     if label == True:
                            #         t = 1
                            #     else:
                            #         t = 0
                            #     q = re.sub(r'\s+', ' ', text)
                            #     q = q.replace(',', '')
                            #     texts.append(context)
                            #     question_list.append(q)
                            #     label_list.append(int(t))
        # labels.append(label_id)
print('Found %s texts.' % len(CONTEXT_TEXTS))
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
# Build the vocabulary over all contexts; num_words caps how many tokens the
# tokenizer keeps, and an empty filters string preserves all remaining characters.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, filters="")
tokenizer.fit_on_texts(CONTEXT_TEXTS)
# sequences = tokenizer.texts_to_sequences(texts)
WORD_INDEX = tokenizer.word_index
print('Found %s unique tokens.' % len(WORD_INDEX))
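
# The commented texts_to_sequences call above suggests the word index feeds a
# downstream padding/embedding step. The lines below are a minimal sketch of
# that follow-up, not part of the original script: the output filename
# "word_index.json" and the padding step are assumptions.
import json
from keras.preprocessing.sequence import pad_sequences

# Persist the generated word dictionary as JSON (hypothetical output path).
with open(os.path.join(TEXT_DATA_DIR, "word_index.json"), "w") as out:
    json.dump(WORD_INDEX, out)

# Convert each context to integer ids and pad/truncate to a fixed length.
sequences = tokenizer.texts_to_sequences(CONTEXT_TEXTS)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)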