-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
124 lines (102 loc) · 5.55 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utility import *
from subword_embedding import Subword_Embedding
from match_func import Match
def build_term_lookup(standard_terms, standard_synonym):
    """Build constant-time lookup tables from the two dictionary columns.

    Args:
        standard_terms: Iterable of standard terms, one per dictionary row.
        standard_synonym: Iterable of synonym terms aligned row-by-row with
            ``standard_terms``.

    Returns:
        A ``(standard_set, synonym_to_standard)`` tuple: the set of all
        standard terms, and a dict mapping each synonym to the standard term
        of the FIRST row in which that synonym appears.
    """
    standard_set = set(standard_terms)
    synonym_to_standard = {}
    for synonym, standard in zip(standard_synonym, standard_terms):
        # setdefault keeps the first row for a duplicated synonym, which is
        # the row a linear ``list.index`` search would have returned.
        synonym_to_standard.setdefault(synonym, standard)
    return standard_set, synonym_to_standard


def map_subwords(subwords, text, standard_set, synonym_to_standard):
    """Map every sub-word to a standard term and locate it inside ``text``.

    A sub-word is only considered when it is longer than one character.
    Standard terms take precedence over synonyms. Each decision is printed as
    ``sub-word -----> result`` (``False`` when nothing matched).

    Args:
        subwords: Iterable of sub-word strings extracted from the input.
        text: The normalised input string the sub-words are searched in.
        standard_set: Set of standard terms.
        synonym_to_standard: Dict mapping a synonym to its standard term.

    Returns:
        A ``(matched, matched_loc)`` tuple. ``matched`` lists the standard
        term of every matched sub-word, in order. ``matched_loc`` lists the
        ``[start, end]`` span (end exclusive) of the first occurrence of each
        matched sub-word in ``text``; a matched sub-word that does not occur
        in ``text`` is reported on stdout and contributes no span.
    """
    matched = []
    matched_loc = []
    for subword in subwords:
        if len(subword) > 1 and subword in standard_set:
            standard = subword
        elif len(subword) > 1 and subword in synonym_to_standard:
            standard = synonym_to_standard[subword]
        else:
            # Neither a standard term nor a known synonym
            print(subword, '----->', False)
            continue

        print(subword, '----->', standard)
        matched.append(standard)

        # Only the first occurrence is recorded, so a repeated sub-word
        # yields the same span each time.
        start_loc = text.find(subword)
        if start_loc == -1:
            print('{} not found in the search space.'.format(subword))
        else:
            matched_loc.append([start_loc, start_loc + len(subword)])
    return matched, matched_loc


def main():
    """Load the term dictionary and word vectors, then map terms interactively.

    The dictionary, sub-word list and pre-trained vectors are loaded once.
    Afterwards the user is prompted repeatedly for a term; each term is split
    into sub-words, the sub-words are matched against the dictionary, and the
    final standard-term mapping is produced through ``Match.final_mapping``.
    The loop ends on end-of-input (Ctrl-D) or Ctrl-C.
    """
    ########################--LOAD standard-DEFINED DICT--####################
    # Column 0 holds the standard terms, column 1 the synonym terms.
    # NOTE(review): assumes read_csv returns a 2-D array supporting [:, k]
    # indexing - confirm against utility.read_csv.
    file_threshold = '200'
    knowledge = read_csv(path_txt='pre_words_dict-' + file_threshold + '.csv')
    standard_terms, standard_synonym = knowledge[:, 0], knowledge[:, 1]
    print('There are', np.shape(standard_synonym)[0], 'standard and synonym terms!')

    # Get the sub-words list (second column of the frequency file)
    subword_list = read_csv(path_txt='subwords_freq_' + file_threshold + '.csv')
    subword_list = subword_list[:, 1]

    # Load pre-trained vectors and get the embeddings of the dictionary terms
    pre_trained = load_word_vector(path='data/', word_dim=128)
    subword_embed = Subword_Embedding(sub_list=subword_list,
                                      pre_trained=pre_trained,
                                      standard_synonym=standard_synonym)
    synonym_vec, synonym_term = subword_embed.load_standard_vector()
    print('There are', np.shape(synonym_term)[0], 'standard and synonym terms that own word vectors!')

    # Built once so that each sub-word lookup in the loop is O(1) instead of
    # a scan over the whole dictionary column.
    standard_set, synonym_to_standard = build_term_lookup(standard_terms, standard_synonym)

    while True:
        ########################--INPUT STRING--##############################
        try:
            input_str = input('Please input a synonym word or term: ')
        except (EOFError, KeyboardInterrupt):
            # Leave the prompt loop quietly instead of dying with a traceback
            print()
            break
        if not input_str.strip():
            # Nothing to map; ask again
            continue
        start = time.time()

        match_class = Match(input_str=input_str,
                            knowledge=knowledge,
                            standard_terms=standard_terms,
                            standard_synonym=standard_synonym,
                            sub_list=subword_list,
                            pre_trained=pre_trained,
                            synonym_vec=synonym_vec,
                            synonym_term=synonym_term)

        #################--FIND STANDARD TERM--###############################
        # Normalise: strip punctuation, lowercase any English letters
        temp_str = remove_punctuation(term=input_str).lower()
        # Find and remove English from the term
        re_eng, eng_subword = find_English_term(term=temp_str)

        #####################--GET SUB-WORDS--################################
        input_subword = subword_embed.get_subword(term=re_eng, is_print=False)
        # Combine the sub-words with the removed English term(s)
        subwords = match_class.eng_with_sub(eng=eng_subword, subword=input_subword)
        print('All the sub-words are', subwords)

        # Mapping of each sub-word [Sub-word -> Standard Term]
        matched, matched_loc = map_subwords(subwords, temp_str,
                                            standard_set, synonym_to_standard)

        #################--FIND THE NON-MATCHED term--########################
        non_match = match_class.non_match_word(matched_loc=matched_loc)
        if not non_match:
            # Every part of the input was covered by matched sub-words
            if len(matched) == 1:  # One matched standard term
                print('Final Mapping ::: ', input_str, '----->', matched[0])
            else:  # Multiple matched standard terms
                match_class.final_mapping(all_standard=matched)
        else:
            print('The None-matched sub-words are ', non_match, '\n', '-' * 100)
            # NOTE: the call to match_class.subword_mapping(non_match=non_match)
            # is disabled; instead the candidates are pooled from
            #   matched:     standard terms of the matched sub-words
            #   non_match:   the non-matched sub-words
            #   subwords:    all sub-words of the input string
            #   input_jieba: jieba segmentation of the non-English remainder
            input_jieba = jieba.lcut(re_eng, HMM=True)
            out_standard = list(set(matched + non_match + subwords + input_jieba))
            print('All the sub-words\' mapped standard terms: ', out_standard)
            # [Final] standard term mapping
            match_class.final_mapping(all_standard=out_standard)

        end = time.time()
        print('Used time: %s\n' % (end - start))


# The main function
if __name__ == '__main__':
    main()