embeddings.py
import numpy as np
import arabic_reshaper
from bidi.algorithm import get_display


def load_word_emb(word_index, embedding_file):
    """Load pre-trained word vectors, keeping only words present in word_index."""
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Each line of the embedding file is: word v1 v2 ... vN (whitespace-separated).
    with open(embedding_file, encoding="utf8") as f:
        embeddings_index = dict(
            get_coefs(*line.rstrip().split(" "))
            for line in f
            if line.rstrip().split(" ")[0] in word_index)
    return embeddings_index


def get_emb_matrix(word_index, max_features, embedding_file):
    """Build an embedding matrix whose row i holds the vector for word index i."""
    embeddings_index = load_word_emb(word_index, embedding_file)
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # Initialize every row from a normal distribution matching the known
    # embeddings, then overwrite the rows of words found in the file.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # Log words missing from the embedding file; reshape + bidi so
            # Arabic text renders readably in the console.
            print(get_display(arabic_reshaper.reshape(word)))
    return embedding_matrix
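
For context, a minimal usage sketch, assuming word_index comes from a Keras Tokenizer and that vectors.txt is a whitespace-separated embedding file (both the corpus and the file path below are hypothetical):

from tensorflow.keras.preprocessing.text import Tokenizer

texts = ["مرحبا بالعالم", "hello world"]   # toy corpus (hypothetical)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

max_features = 10000                       # vocabulary cap
emb_matrix = get_emb_matrix(tokenizer.word_index,
                            max_features,
                            "vectors.txt")  # hypothetical embedding file
print(emb_matrix.shape)                    # (max_features, embed_size)

The resulting matrix can be passed as initial weights to an embedding layer, with rows for out-of-vocabulary words left at their random initialization.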