-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword_list.py
64 lines (55 loc) · 1.9 KB
/
word_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import numpy as np
import random
import re
import util
# Characters that disqualify a token from the shared vocabulary.  The value was
# lifted from the body of a regex character class, so besides the punctuation
# it also holds a literal backslash.  It is spelled "\\" here: the previous
# "\-" was an invalid escape sequence that only produced the same two
# characters by accident and triggers a SyntaxWarning on recent Pythons.
SPECIAL = ",.;:@#?!&$”\"\\-"

# Per-corpus lookup table: vocabulary entry -> its position in that vocabulary.
inverted_vocabs = {}
for c in ["UK", "USA"]:
    # NOTE(review): presumably util.load_data returns a table whose column 5
    # holds the tokenized text -- confirm against util.
    data = list(map(str, util.load_data("./data/%s_tokenized.txt" % c)[5].tolist()))
    vocab = util.build_vocab(data, least_freq=21)
    inverted_vocabs[c] = {k: v for v, k in enumerate(vocab)}

print("start joint_vocab")
# Entries present in both vocabularies (dict key views intersect to a set) ...
joint_vocab = inverted_vocabs["UK"].keys() & inverted_vocabs["USA"].keys()
# ... minus every entry containing at least one SPECIAL character.
joint_vocab = {w for w in joint_vocab if not any(special in w for special in SPECIAL)}
word_list = pd.read_csv("./data/word_list.csv", encoding="gbk")

# Clean word list
# Raw-string patterns: the previous non-raw "\(" / "\[" literals were invalid
# escape sequences.  Both patterns are greedy, so everything from the first
# opening to the last closing delimiter on a line is removed.
# NOTE(review): greedy matching also drops text between two separate
# parenthesised notes on the same line -- confirm that is intended.
_PAREN_NOTE = re.compile(r"\(.+\)")
_BRACKET_NOTE = re.compile(r"\[.+\]")


def _clean_word(text):
    """Normalise one raw "Word" cell.

    Removes "(...)" and "[...]" spans, turns CRLF line breaks into commas and
    deletes spaces, in that order.
    """
    text = _PAREN_NOTE.sub("", text)
    text = _BRACKET_NOTE.sub("", text)
    return text.replace("\r\n", ",").replace(" ", "")


# One vectorised pass instead of writing element by element into a Series
# pulled out of the frame (which relied on chained assignment).
words = word_list["Word"].map(_clean_word)
word_list["Word"] = words
# Match joint words
def contains(s, key):
    """Return the first comma-separated piece of ``key`` found in ``s``.

    ``key`` is split on "," and the pieces are tried from left to right with
    the ``in`` operator against ``s``.  The first piece that passes is
    returned; ``None`` is returned when no piece does.
    """
    candidates = key.split(",")
    return next((piece for piece in candidates if piece in s), None)
# For every cleaned word-list entry, look up the first of its comma-separated
# alternatives that occurs in the joint vocabulary.  Entries without a hit
# (None) or with an empty hit are skipped; duplicates collapse in the set.
matches = (contains(joint_vocab, entry) for entry in word_list["Word"])
filtered_set = {hit for hit in matches if hit}
# Positive examples: vocabulary words matched by the word list.
pos_set = filtered_set

# Negative examples: joint-vocabulary words that are not positives, drawn
# uniformly without replacement.
# random.sample() needs a sequence -- passing a set raises TypeError on
# Python 3.11+ -- so the candidates are materialised as a sorted list, which
# also makes the draw reproducible once the random module is seeded.
neg_candidates = sorted(joint_vocab - pos_set)
# Aim for as many negatives as positives, but never ask for more than exist:
# the previous retry loop could not terminate in that situation.
neg_count = min(len(pos_set), len(neg_candidates))
neg_set = set(random.sample(neg_candidates, neg_count))

# Parallel lists: every word with its label (1 = positive, 0 = negative).
word = list(pos_set) + list(neg_set)
label = [1] * len(pos_set) + [0] * len(neg_set)
from sklearn.model_selection import train_test_split

# Split words and labels together into a train and a test portion.  No
# test_size or random_state is passed, so scikit-learn's defaults apply.
splits = train_test_split(word, label)
word_train, word_test, label_train, label_test = splits

# One two-column frame ("word", "label") per portion.
train_data = pd.DataFrame.from_dict(dict(word=word_train, label=label_train))
test_data = pd.DataFrame.from_dict(dict(word=word_test, label=label_test))

# Write both frames as CSV without the index column, train first.
for frame, path in (
    (train_data, "./data/eval_train_badfreq.csv"),
    (test_data, "./data/eval_test_badfreq.csv"),
):
    frame.to_csv(path, index=False)