"""
Created on Wed Oct 25 16:25:13 2017
@author: apoorva
"""
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
# Stopword list used to filter tokens during cleaning
stop = stopwords.words('english')

# Tokenization and string cleaning
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    return tokens
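# Example (illustrative): preprocess("Hello, world!") -> ['hello', 'world']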
# Loading data and labels
def load_data_and_labels():
    input_x = []
    label_y = []
    dataset = pd.read_csv('re_extraction_2017_10_1.csv', encoding='latin1')
    # re_extraction_2017_10_1.csv consists of pre-annotated posts, classified
    # into one of the 4 social support categories. Each post can belong to
    # multiple categories.
    for i in range(0, 1330):
        # Data loading
        tmp = dataset.iloc[i]['Text']
        tmp = tmp.strip()
        tmp = preprocess(tmp)
        temp_list = []
        for x in tmp:
            temp_word = []
            for j in x:
                # Keep only lowercase letters and digits
                if 'a' <= j <= 'z' or '0' <= j <= '9':
                    temp_word.append(j)
            temp_word = ''.join(temp_word)
            if temp_word not in stop:
                temp_list.append(temp_word)
        # Labels: columns Label1..Label4 hold the tag strings (NaN when absent)
        y_temp = []
        for j in range(1, 5):
            if isinstance(dataset.iloc[i]['Label' + str(j)], str):
                m = dataset.iloc[i]['Label' + str(j)]
                y_temp.append(m.strip(" "))
        # Creating the binary label vector for one class ('com' here);
        # repeat for each of the four classes
        if "'com'" in y_temp:
            y = 1
        else:
            y = 0
        input_x.append(temp_list)
        label_y.append(y)
    return [input_x, label_y]
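# Hedged sketch (not part of the original pipeline): the "'com'" check above
# covers one class; the analogous vectors for the other three classes could be
# built by parameterizing on the tag string. The tag strings other than "'com'"
# are not given here, so any concrete list would be a placeholder.
def make_label_vector(tag, tag_lists):
    """Return a 0/1 label per post, given each post's list of tag strings."""
    return [1 if tag in tags else 0 for tags in tag_lists]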
testset = pd.read_csv('posts_12_4.txt', sep='\t', header=None, encoding='latin1')

def load_data_and_labels1(begin, end):
    input_x = []
    for i in range(begin, end):
        # Data: column 4 holds the post text
        tmp = testset.iloc[i][4]
        if isinstance(tmp, str):
            tmp = tmp.strip()
            tmp = preprocess(tmp)
            temp_list = []
            for x in tmp:
                temp_word = []
                for j in x:
                    if 'a' <= j <= 'z' or '0' <= j <= '9':
                        temp_word.append(j)
                temp_word = ''.join(temp_word)
                if temp_word not in stop:
                    temp_list.append(temp_word)
            input_x.append(temp_list)
        else:
            input_x.append(['NaN'])
    return input_x
# Pads all sentences to the same length. The length is defined by the longest
# sentence. Returns padded sentences.
def pad_sentences(sentences, padding_word=""):
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences
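# Example (illustrative): pad_sentences([['a', 'b'], ['c']])
# -> [['a', 'b'], ['c', '']]  (shorter sentences are right-padded with "")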
# Load and preprocess data
sentences, labels = load_data_and_labels()
sentences_padded = pad_sentences(sentences)
# ------- Word2Vec ---------
import gensim
model = gensim.models.Word2Vec.load('model_embedding.bin')
# Map each vocabulary word to its embedding vector (gensim < 4.0 attribute names)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
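# Note (assumes a newer environment than the original): in gensim >= 4.0 the
# same mapping would be written as
#   w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))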
# Creating word embeddings - higher dimension to lower dimension
class TfidfEmbeddingVectorizer(object):
    # Alternative is MeanEmbeddingVectorizer, which gives worse results
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # If a word was never seen, it must be at least as infrequent
        # as any of the known words, so the default idf is the max of
        # the known idfs
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])
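# In effect, each post is embedded as the idf-weighted mean of its word vectors:
#   v(post) = mean over {w in post, w in w2v} of idf(w) * w2v[w]
# with an all-zero vector when none of the post's words are in the vocabulary.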
a = TfidfEmbeddingVectorizer(w2v)
a = a.fit(sentences_padded, labels)
# There are 2854280 posts in total; 2854280 = 499 * 5720, so they are processed
# in 5720 batches of size 499 (neighbouring counts such as 2854282 do not
# factor as conveniently)
inp = load_data_and_labels1(0 * 499, (0 + 1) * 499)
s = pad_sentences(inp)
all_posts = a.transform(s)
for i in range(1, 5720):
    print(i)
    inp = load_data_and_labels1(i * 499, (i + 1) * 499)
    s = pad_sentences(inp)
    all_posts = np.concatenate((all_posts, a.transform(s)), axis=0)
# Saving the word embeddings corresponding to each class
with open('all_com.npy', 'wb') as f:
    np.save(f, all_posts)
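# Example (illustrative): the saved array can be reloaded later with
#   all_posts = np.load('all_com.npy')
# giving a (num_posts, embedding_dim) matrix of post embeddings.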