Building Heat Map.py
# -*- coding: utf-8 -*-
"""
Created on Mon May 17 20:20:22 2021
@author: buiqu
"""
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import weightedmedianfunc
import SVD_for_S
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import snowballstemmer
from scipy import sparse
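# The NLTK resources used below (stopwords, tokenizer models, WordNet) may need a
# one-time download on a fresh environment, e.g.:
# nltk.download('stopwords'); nltk.download('punkt'); nltk.download('wordnet')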
###############
#-----Text preprocessing helpers-----#
stop_words = set(stopwords.words('english'))

def RemoveStopWords(arrayList):
    # Drop English stop words and join the remaining tokens back into a string.
    newList = [w for w in arrayList if w not in stop_words]
    return ' '.join(newList)

stemmer = nltk.LancasterStemmer()

def StemmingWordList(arrayList):
    # Aggressive Lancaster stemming of each token.
    newList = [stemmer.stem(word) for word in arrayList]
    return ' '.join(newList)

snowball = snowballstemmer.stemmer('english')

def SnowballStemmer(arrayList):
    # Milder Snowball (Porter2) stemming of each token.
    words = snowball.stemWords(arrayList)
    return ' '.join(words)

wordNetLemmna = WordNetLemmatizer()

def WordNetLemma(arrayList):
    # WordNet lemmatization (default noun part of speech) of each token.
    newList = [wordNetLemmna.lemmatize(word) for word in arrayList]
    return ' '.join(newList)
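
# weightedmedianfunc is a local module that is not included in this file; the function
# below is only an illustrative sketch (an assumption, not the author's code) of what a
# distance-weighted median of neighbour scores could look like, using inverse-distance
# weights. It is defined here purely for reference and is not called by this script.
def _weighted_median_sketch(scores, distances, eps=1e-9):
    # Closer neighbours get larger weights; eps guards against division by zero.
    weights = [1.0 / (d + eps) for d in distances]
    # Walk the scores in ascending order and return the first one whose cumulative
    # weight reaches half of the total weight.
    pairs = sorted(zip(scores, weights))
    total = sum(weights)
    cumulative = 0.0
    for score, weight in pairs:
        cumulative += weight
        if cumulative >= total / 2.0:
            return score
    return pairs[-1][0]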
#-----Import data-----#
train = pd.read_excel('./Data/training_set_rel3_set1.xlsx')
test = pd.read_excel('./Data/valid_set_set1.xlsx')
# Note: set_index returns a new DataFrame; without reassignment these two calls leave
# the default integer index in place, which the positional lookups below rely on.
train.set_index('ID')
test.set_index('essay_id')
y_train = train['Score']
y_test = test['Score']
# Stack train and test so the text features are built on a shared vocabulary.
X = pd.concat([train, test])
# Simple surface features: sentence and word counts for every essay (train + test).
train_numberOfSentences = X['Essay Content'].apply(lambda x: len(x.split('.')))
train_numberOfWords = X['Essay Content'].apply(lambda x: len(x.split()))
# Clean the essay text: keep letters only, lowercase, lemmatize, drop stop words.
content = X['Essay Content']
content = content.apply(lambda x: re.sub('[^a-zA-Z]+', ' ', x))
content = content.apply(lambda x: x.lower())
content = content.apply(lambda x: WordNetLemma(word_tokenize(x)))
content = content.apply(lambda x: RemoveStopWords(word_tokenize(x)))
#content = content.apply(lambda x: SnowballStemmer(word_tokenize(x)))
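# For illustration only (not computed from the data): after this cleaning, a sentence such
# as "She scored 90 points!!" comes out roughly as "scored point" - digits and punctuation
# stripped, text lowercased, plural nouns lemmatized, stop words removed.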
sns.set_theme()
heatmapData = list()
# Grid search: number of nearest neighbours (rows) x SVD dimensions (columns),
# scored with quadratic weighted kappa on the validation essays.
for neighbors in range(3, 11, 1):
    accuracyOfDimension = list()
    for dimensions in range(80, 351, 10):
        svd = TruncatedSVD(n_components=dimensions)
        # TF-IDF over the cleaned essays (recomputed each iteration; it could be
        # hoisted outside the loops since it does not depend on the grid values).
        tfidf = TfidfVectorizer(min_df=0.01, max_df=0.85, stop_words='english')
        x_transform = tfidf.fit_transform(content)
        # Append the sentence and word counts as two extra feature columns.
        x_transform = sparse.hstack((x_transform, train_numberOfSentences.to_numpy()[:, None]))
        x_transform = sparse.hstack((x_transform, train_numberOfWords.to_numpy()[:, None]))
        #x_transform = SVD_for_S.SVD(x_transform.toarray(), dimensions)
        x_transform = svd.fit_transform(x_transform)
        x_train = x_transform[:len(train)]
        x_test = x_transform[len(train):]
        nearestNeighbors = NearestNeighbors(n_neighbors=neighbors)
        nearestNeighbors.fit(x_train)
        test_dist, test_ind = nearestNeighbors.kneighbors(x_test)
        #---Predict with a custom distance-weighted median of the neighbours' scores---#
        prediction_list = list()
        n = len(test_ind)
        for i in range(0, n):
            scores_list = list()
            dist_list = test_dist[i]
            for j in test_ind[i]:
                scores_list.append(y_train.iloc[j])
            prediction_list.append(round(weightedmedianfunc.weighted_median(scores_list, dist_list)))
        accuracy = cohen_kappa_score(y_test, prediction_list, weights='quadratic')
        accuracyOfDimension.append(accuracy)
    heatmapData.append(accuracyOfDimension)
df = pd.DataFrame(heatmapData, columns=list(range(80, 351, 10)), index=list(range(3, 11, 1)))
plt.figure(figsize=(16, 16))
sns.heatmap(df)
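# The script ends after drawing the heatmap; outside an interactive backend the figure
# may need to be shown or saved explicitly, e.g.:
# plt.show()
# plt.savefig('qwk_heatmap.png')  # hypothetical output file name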