Language_identifier_fast.py
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 15 14:21:59 2018
This language detector identifies seven languages that largely use the Latin
script (English, German, French, Italian, Spanish, Portuguese, Dutch). It is
implemented as a multiclass Random Forest text classifier that uses words,
character bigrams, and character trigrams as features. To keep training and
testing fast by reducing the number of features, a corpus of 300,000 sentences
per language from the Leipzig Corpora Collection is used, and only the 50 most
frequent words, bigrams, and trigrams of each language are shortlisted as
features. The dataframe creation is slightly involved, but it is highly
vectorized to speed up performance. All train and test datapoints are then
represented in this reduced feature space. A model trained on 5,000 sentences
per language takes less than 2 minutes to train and performs at 98% accuracy.
To replicate the environment, place the following data files sourced from
http://wortschatz.uni-leipzig.de/en/download in a directory, and assign that to
'dirname'
1. deu_mixed-typical_2011_300K-sentences.txt
2. eng_news_2005_300K-sentences.txt
3. fra_mixed_2009_300K-sentences.txt
4. ita_mixed-typical_2017_300K-sentences.txt
5. nld_mixed_2012_300K-sentences.txt
6. por_newscrawl_2011_300K-sentences.txt
7. spa_news_2006_300K-sentences.txt
Novel ideas - shortlisting features by frequency to speed up the random forest.
Scope for improvement - the feature space could be pruned further to remove
redundancies, for example via maximal substrings: the bigram ' a' is always
present wherever the trigram ' a ' appears, so one of the two is largely
redundant and can be dropped.
@author: Kiran Ramnath
Applicant ID - 201806110737_RamnathKiran
"""
import pandas as pd
from string import punctuation
import time
import numpy as np
import gc
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full
#%%
dirname='C:/Personal/R/UKP/Data/'
#%% Read tab separated files
def read_file(path):
    t=pd.read_fwf(path, delimiter="\t", header=None)
    t[0]=t[0].apply(lambda row: row.split("\t")[1])
    return t
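# Note on the input assumed by read_file above: each line of a Leipzig corpus
# file is expected to look like "<sentence number>\t<sentence>"; the numeric
# index before the tab is dropped and only the sentence text is kept.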
#%% function that takes text and n-gram length as input, returns a list of tuples
# of the format [(n-gram, count)], sorted by descending count
def max_ngram_extracter(sent, num_of_chars):
    ngram_vectorizer=CountVectorizer(input="content", analyzer="char_wb", ngram_range=(num_of_chars,num_of_chars))
    ngrams=ngram_vectorizer.fit_transform(sent)
    count_values=ngrams.toarray().sum(axis=0)
    vocab = ngram_vectorizer.vocabulary_
    counts = sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
    counts = [(c[1],c[0]) for c in counts]
    return counts
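#%% Illustrative sanity check (not part of the original pipeline): on a toy
# input the extractor returns (ngram, count) tuples sorted by descending count;
# char_wb pads each word with spaces before extracting character n-grams.
print(max_ngram_extracter(["the cat"], 2))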
#%% function that takes text as input, returns a list of tuples of the format
# [(word, count)], sorted by descending count
def max_word_extracter(sent):
    word_vectorizer=CountVectorizer(input="content", analyzer="word", ngram_range=(1,1))
    ngrams=word_vectorizer.fit_transform(sent)
    count_values=ngrams.toarray().sum(axis=0)
    vocab = word_vectorizer.vocabulary_
    counts = sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
    counts = [(c[1],c[0]) for c in counts]
    return counts
#%% function that reads the corpus file for one language and adds a 'language'
# column that will be used as the training class for the classifier.
# Returns train and test dataframes plus the (token, count) tuples for words, bigrams, and trigrams.
def language_df_creator(filename, language):
    df=read_file(dirname+filename)
    df['language']=language
    #creating train-test split
    df_train, df_test = train_test_split(df, test_size=1000, random_state=42)
    #merging all text from df_train into one string to find the most frequent words, bigrams and trigrams
    lang_txt=df_train.groupby('language')[0].apply(lambda row: (" ").join(row))
    lang_count_bigrams=max_ngram_extracter(lang_txt,2)
    lang_count_trigrams=max_ngram_extracter(lang_txt,3)
    lang_count_words=max_word_extracter(lang_txt)
    return df_train, df_test, lang_count_bigrams, lang_count_trigrams, lang_count_words
#%%
english_df_train, english_df_test, english_count_bigrams, english_count_trigrams, english_count_words = language_df_creator("eng_news_2005_300K-sentences.txt","english")
#%%
german_df_train, german_df_test, german_count_bigrams, german_count_trigrams, german_count_words=language_df_creator("deu_mixed-typical_2011_300K-sentences.txt","german")
#%%
italian_df_train, italian_df_test, italian_count_bigrams, italian_count_trigrams, italian_count_words=language_df_creator("ita_mixed-typical_2017_300K-sentences.txt","italian")
#%%
spanish_df_train, spanish_df_test, spanish_count_bigrams, spanish_count_trigrams, spanish_count_words=language_df_creator("spa_news_2006_300K-sentences.txt","spanish")
#%%
portuguese_df_train, portuguese_df_test, portuguese_count_bigrams, portuguese_count_trigrams, portuguese_count_words=language_df_creator("por_newscrawl_2011_300K-sentences.txt","portuguese")
#%%
french_df_train, french_df_test, french_count_bigrams, french_count_trigrams, french_count_words = language_df_creator("fra_mixed_2009_300K-sentences.txt","french")
#%%
dutch_df_train, dutch_df_test, dutch_count_bigrams, dutch_count_trigrams, dutch_count_words=language_df_creator("nld_mixed_2012_300K-sentences.txt","dutch")
#%% creating feature-list containing 50 most frequent words, bigrams, trigrams for all languages
features=set([f[0] for f in english_count_bigrams[0:50]
+english_count_words[0:50]
+english_count_trigrams[0:50]
+german_count_bigrams[0:50]
+german_count_trigrams[0:50]
+german_count_words[0:50]
+italian_count_bigrams[0:50]
+italian_count_trigrams[0:50]
+italian_count_words[0:50]
+spanish_count_bigrams[0:50]
+spanish_count_trigrams[0:50]
+spanish_count_words[0:50]
+portuguese_count_bigrams[0:50]
+portuguese_count_trigrams[0:50]
+portuguese_count_words[0:50]
+french_count_bigrams[0:50]
+french_count_trigrams[0:50]
+french_count_words[0:50]
+dutch_count_bigrams[0:50]
+dutch_count_trigrams[0:50]
+dutch_count_words[0:50]
if f[0] not in punctuation])
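#%% Rough size check (illustrative): at most 7 languages x 3 feature types x 50
# = 1050 candidate features before deduplication and punctuation filtering.
print(len(features))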
#%% Using Gensim's dictionary object to store features. This allows us to
#create dense vectors for all datapoints efficiently, speeding up data creation
dct=Dictionary([list(features)])
#re-assign features so the list order matches gensim's token id order (ids are assigned in sorted token order)
features=list(dct.token2id.keys())
#%% 5,000 sentences from each language are used to train the classification model
english_df_train_frac=english_df_train.sample(n=5000,random_state=42)
german_df_train_frac=german_df_train.sample(n=5000,random_state=42)
italian_df_train_frac=italian_df_train.sample(n=5000,random_state=42)
spanish_df_train_frac=spanish_df_train.sample(n=5000,random_state=42)
portuguese_df_train_frac=portuguese_df_train.sample(n=5000,random_state=42)
french_df_train_frac=french_df_train.sample(n=5000,random_state=42)
dutch_df_train_frac=dutch_df_train.sample(n=5000,random_state=42)
#%%
train_df=pd.concat([english_df_train_frac,german_df_train_frac,
italian_df_train_frac,spanish_df_train_frac,
portuguese_df_train_frac,french_df_train_frac,dutch_df_train_frac],
ignore_index=True)
#%% Free up memory, perform garbage collection
del german_df_train, french_df_train, english_df_train, dutch_df_train, spanish_df_train, italian_df_train, portuguese_df_train
gc.collect()
#%% create dataframe for all training sentences containing unique features as columns
def create_dataframe_rf(df):
    df.rename(columns={0:"text"}, inplace=True)
    zero_data=np.zeros(shape=(len(df),len(features)))
    feature_df=pd.DataFrame(zero_data, index=df.index, columns=features)
    df=pd.concat([df,feature_df], axis=1)
    return df
train_df=create_dataframe_rf(train_df)
#train_df.to_csv(dirname+"rftrain_set.csv")
#%% label encoder applies integer labels to all classes
languages=['english','dutch','german','italian','spanish','portuguese','french']
#create flags on the basis of language
le = preprocessing.LabelEncoder()
le.fit(languages)
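# Note: LabelEncoder sorts classes alphabetically, so le.classes_ is
# ['dutch', 'english', 'french', 'german', 'italian', 'portuguese', 'spanish'];
# the integer flags follow that sorted order, not the order of the 'languages' list.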
#%% create features for all sentences, find and populate shortlisted feature columns
# This is the most time consuming step.
def feature_creator(df):
    #Using Gensim utility function to populate the dataset
    bag_of_words=df['text'].apply(lambda row: sparse2full([(dct.token2id[m[0]],m[1]) for m in max_word_extracter([row]) if m[0] in dct.token2id], length=len(features)))
    bag_of_words=np.array(bag_of_words.tolist()).astype(int)
    bag_of_bigrams=df['text'].apply(lambda row: sparse2full([(dct.token2id[m[0]],m[1]) for m in max_ngram_extracter([row],2) if m[0] in dct.token2id], length=len(features)))
    bag_of_bigrams=np.array(bag_of_bigrams.tolist()).astype(int)
    bag_of_trigrams=df['text'].apply(lambda row: sparse2full([(dct.token2id[m[0]],m[1]) for m in max_ngram_extracter([row],3) if m[0] in dct.token2id], length=len(features)))
    bag_of_trigrams=np.array(bag_of_trigrams.tolist()).astype(int)
    #Add the word, bigram, and trigram components to get one count vector per sentence
    all_features=(bag_of_words+bag_of_bigrams+bag_of_trigrams).astype(int)
    #populate df with all features
    all_features_df=pd.DataFrame(data=all_features, columns=features)
    df.update(all_features_df)
    #random forest requires integer labels, so transform text labels to integer labels
    df['flag']=le.transform(df['language'])
    print("Feature creation finished")
    return df
#%%
time_start=time.time()
train_df = feature_creator(train_df)
time_taken_train_df=time.time()-time_start
#%%
def random_forest(train_df, number_of_estimators):
    # Random Forest Model
    clf_rforest = RandomForestClassifier(n_estimators=number_of_estimators, random_state=1, min_samples_leaf=5, max_depth=30)
    clf_rforest.fit(train_df[features], train_df['flag'])
    # Importance of features
    rforest_importances = clf_rforest.feature_importances_
    rforest_importances = dict(zip(features, rforest_importances))
    return clf_rforest, rforest_importances
time_start=time.time()
clf_rforest, rforest_importances=random_forest(train_df, 500)
time_taken_rf=time.time()-time_start
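#%% Optional inspection (illustrative, not part of the original pipeline):
# list the features the random forest relies on most, using the importance
# dictionary returned above.
top_features = sorted(rforest_importances.items(), key=lambda kv: kv[1], reverse=True)[:20]
print(top_features)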
#%% create test set containing 1000 sentences from each language
test_df=pd.concat([english_df_test,german_df_test,italian_df_test,spanish_df_test,
portuguese_df_test,french_df_test,dutch_df_test], ignore_index=True)
#%%
# Process of creating features for scoring dataset
def scoring_df(test_df):
    test_df=create_dataframe_rf(test_df)
    test_df=feature_creator(test_df)
    return test_df
#%%
time_start=time.time()
test_df=scoring_df(test_df)
time_taken_test_df=time.time()-time_start
#%%
time_start=time.time()
test_df['prediction']=le.inverse_transform(clf_rforest.predict(test_df[features]))
time_taken_score=time.time()-time_start
#%%
# Exporting Scored Results to CSV
test_df.to_csv(dirname+"preds_5k.csv")
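#%% Optional check (illustrative): overall accuracy on the 7,000 test sentences
print("Accuracy:", (test_df['language'] == test_df['prediction']).mean())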
#%% Print performance metrics: precision, recall, and F1 score per language.
# Pass labels= so target_names lines up with the label order in the report.
print(classification_report(test_df['language'], test_df['prediction'], labels=languages, target_names=languages))
#%% Plot confusion matrix
# pass labels= so the matrix rows/columns follow the same order as the tick labels
conf_mat = confusion_matrix(test_df['language'], test_df['prediction'], labels=languages)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=languages, yticklabels=languages)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
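#%% Optional helper (illustrative sketch, not part of the original script):
# classify a single new sentence with the trained model by building the same
# word / character-bigram / character-trigram count vector used for training.
# The function name and wrapper logic below are assumptions for demonstration;
# it relies on the 'features' list being in gensim token-id order, as above.
def predict_language(sentence):
    vec = np.zeros(len(features), dtype=int)
    # accumulate counts of any shortlisted word, bigram, or trigram in the sentence
    for extracted in (max_word_extracter([sentence]),
                      max_ngram_extracter([sentence], 2),
                      max_ngram_extracter([sentence], 3)):
        for token, count in extracted:
            if token in dct.token2id:
                vec[dct.token2id[token]] += count
    row = pd.DataFrame([vec], columns=features)
    flag = clf_rforest.predict(row)[0]
    return le.inverse_transform([flag])[0]
# example: print(predict_language("Dies ist ein kurzer deutscher Satz."))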