-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword2vec.py
65 lines (52 loc) · 1.94 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
"""
@Time:Created on 2019/4/30 16:10
@author: LiFan Chen
@Filename: word2vec.py
@Software: PyCharm
"""
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
def seq_to_kmers(seq, k=3):
    """Split a sequence into its overlapping k-mers.

    A window of width ``k`` slides over ``seq`` one position at a time, so
    the result holds ``len(seq) - k + 1`` entries.

    Parameters:
        seq (string): sequence to split.
        k (int), default 3: window width; must be at least 1.

    Returns:
        List of k-mer strings in order of occurrence. The list is empty
        when ``seq`` is shorter than ``k``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        # Without this guard k == 0 yields len(seq) + 1 empty strings and a
        # negative k yields ragged slices instead of failing loudly.
        raise ValueError("k must be a positive integer, got %r" % (k,))
    return [seq[start:start + k] for start in range(len(seq) - k + 1)]
class Corpus(object):
    """Re-iterable stream of k-mer sentences backed by a CSV file.

    Every pass over the object walks the ``Seq`` column of the loaded table
    and produces one list of k-mers per row, which is the sentence format
    used for training seq2vec models.
    """

    def __init__(self, dir, ngram):
        # ``dir`` is passed straight to pandas.read_csv; the table is loaded
        # once here and reused by every later iteration.
        self.ngram = ngram
        self.df = pd.read_csv(dir)

    def __iter__(self):
        # A fresh generator per call, so the corpus can be iterated
        # several times.
        for sequence in self.df.Seq.values:
            kmers = seq_to_kmers(sequence, self.ngram)
            yield kmers
def get_protein_embedding(model, protein):
    """Stack the embeddings of a protein's words into one matrix.

    Parameters:
        model: object whose ``wv`` attribute maps a word to its vector via
            ``model.wv[word]``.
        protein: sized iterable of words (e.g. the list of 3-mers of one
            sequence).

    Returns:
        numpy array of shape ``(len(protein), dim)`` whose i-th row is the
        vector of the i-th word. ``dim`` is ``model.wv.vector_size`` when
        that attribute exists, otherwise 100.

    Raises:
        KeyError: propagated from ``model.wv[word]`` when a word has no
            vector.
    """
    # Take the width from the model instead of assuming 100, so models
    # trained with another dimensionality do not fail the row assignment.
    dim = getattr(model.wv, "vector_size", 100)
    vec = np.zeros((len(protein), dim))
    for row, word in enumerate(protein):
        # No blanket exception handler here: a swallowed error would leave
        # a zero row that looks like a valid embedding.
        vec[row] = model.wv[word]
    return vec
if __name__ == "__main__":
    # Train a 100-dimensional Word2Vec model on the 3-mer sentences of the
    # C. elegans protein table and save it to disk.
    # NOTE(review): ``size=`` is the gensim 3.x keyword (renamed to
    # ``vector_size`` in gensim 4) -- confirm the pinned gensim version.
    corpus = Corpus("dataset/celegans_uniprot.csv", 3)
    w2v = Word2Vec(size=100, window=5, min_count=1, workers=6)

    # Two passes over the corpus: first to build the vocabulary, then to train.
    w2v.build_vocab(corpus)
    w2v.train(corpus, total_examples=w2v.corpus_count, epochs=30)
    w2v.save("word2vec_30_celegans.model")

    # The string below is a disabled usage example kept verbatim; it is
    # never executed.
    """
model = Word2Vec.load("word2vec_30.model")
vector = get_protein_embedding(model,seq_to_kmers("MSPLNQSAEGLPQEASNRSLNATETSEAWDPRTLQALKISLAVVLSVITLATVLSNAFVLTTILLTRKLHTPANYLIGSLATTDLLVSILVMPISIAYTITHTWNFGQILCDIWLSSDITCCTASILHLCVIALDRYWAITDALEYSKRRTAGHAATMIAIVWAISICISIPPLFWRQAKAQEEMSDCLVNTSQISYTIYSTCGAFYIPSVLLIILYGRIYRAARNRILNPPSLYGKRFTTAHLITGSAGSSLCSLNSSLHEGHSHSAGSPLFFNHVKIKLADSALERKRISAARERKATKILGIILGAFIICWLPFFVVSLVLPICRDSCWIHPALFDFFTWLGYLNSLINPIIYTVFNEEFRQAFQKIVPFRKAS"))
print(vector.shape)
"""