-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword2vec.py
39 lines (29 loc) · 1.1 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding: utf-8 -*-
import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
import pandas as pd
import gensim
import logging
# Configure the root logger so that records at DEBUG level and above are emitted.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
import warnings
# Suppress every warning for the remainder of the process.
warnings.filterwarnings(action='ignore')
# Load the tab-separated question-pair file at import time. header=None means
# the first line of the file is read as data, and the four columns are named
# explicitly here.
# NOTE(review): 'is_dup' is presumably the duplicate-pair label — confirm
# against the data file.
data_raw=pd.read_csv('./input/raw_data.csv',sep='\t',header=None,names=['id','question1','question2','is_dup'])
def extract_questions():
    """
    Extract questions for making word2vec model.

    Walks every row of the module-level ``data_raw`` frame and, for each of
    the ``question1`` and ``question2`` columns in that order, yields the
    token list produced by ``gensim.utils.simple_preprocess``.

    Cells that are not non-empty strings are skipped. pandas represents a
    missing cell as float NaN, which is truthy, so a plain truthiness test
    would hand NaN to ``simple_preprocess`` and fail on the first missing
    question.

    Yields:
        list of str: the tokens of one question.
    """
    question_columns = ('question1', 'question2')
    for dataset in [data_raw]:
        for i, row in dataset.iterrows():
            # Progress message every 1000 rows; ``i`` is the row's index label.
            if i != 0 and i % 1000 == 0:
                logging.info("read %s sentences", i)
            for column in question_columns:
                text = row[column]
                # Only real, non-empty text is tokenised (filters NaN and "").
                if isinstance(text, str) and text:
                    yield gensim.utils.simple_preprocess(text)
# Materialise every tokenised question in memory: the corpus is iterated more
# than once below and len() is needed for total_examples.
documents = list(extract_questions())
logging.info("Done reading data file")
# Build a 300-dimensional Word2Vec model from the corpus. Per the gensim
# documentation, passing sentences to the constructor builds the vocabulary
# and runs an initial training pass.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4.0) — confirm against the installed gensim version.
model = gensim.models.Word2Vec(documents, size=300)
# Train for a further 10 epochs over the same corpus.
model.train(documents, total_examples=len(documents), epochs=10)
# Persist the trained model next to the input data.
model.save("./input/Quora.w2v")