# -*- coding: utf-8 -*-
"""
Spyder Editor
Run this script to execute the grading assistant.
"""
"""
Import necessary libraries
"""
import pandas as pd
import re
import nltk
import math
import weightedmedianfunc
import SVD_for_S
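# weightedmedianfunc and SVD_for_S are helper modules local to this repository
# (not PyPI packages); their assumed behaviour is noted in comments where they
# are used below.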
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import snowballstemmer
from scipy import sparse
###############
stop_words = set(stopwords.words('english'))
def RemoveStopWords(arrayList):
    newList = [w for w in arrayList if w not in stop_words]
    return ' '.join(newList)

stemmer = nltk.LancasterStemmer()
def StemmingWordList(arrayList):
    newList = [stemmer.stem(word) for word in arrayList]
    return ' '.join(newList)

snowball = snowballstemmer.stemmer('english')
def SnowballStemmer(arrayList):
    words = snowball.stemWords(arrayList)
    return ' '.join(words)

wordNetLemma = WordNetLemmatizer()
def WordNetLemma(arrayList):
    newList = [wordNetLemma.lemmatize(word) for word in arrayList]
    return ' '.join(newList)
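# Illustrative behaviour of the helpers above (WordNetLemmatizer defaults to
# noun lemmas, so verbs such as "running" are left unchanged):
#   WordNetLemma(word_tokenize("the cats were running"))    -> "the cat were running"
#   RemoveStopWords(word_tokenize("the cat were running"))  -> "cat running"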
#-----Import data-----#
train = pd.read_excel('./Data/training_set_rel3_set1.xlsx')
test = pd.read_excel('./Data/valid_set_set1.xlsx')
corpus = pd.concat([train,test], ignore_index = True)
Y = corpus['Score']
X = corpus["Essay Content"]
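# Train and test essays are pooled into one corpus; scores are then predicted
# leave-one-out style, from each essay's nearest neighbours within the pool.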
train_numberOfSentences = X.apply(lambda x: len(x.split('.')))
train_numberOfWords = X.apply(lambda x: len(x.split()))
content = X
content = content.apply(lambda x: re.sub('[^a-zA-Z]+', ' ', x))
content = content.apply(lambda x: x.lower())
content = content.apply(lambda x: WordNetLemma(word_tokenize(x)))
content = content.apply(lambda x: RemoveStopWords(word_tokenize(x)))
#content = content.apply(lambda x: SnowballStemmer(word_tokenize(x)))
dimensions = 100
neighbors = 5
#svd = TruncatedSVD(n_components=dimensions)
tfidf = TfidfVectorizer(min_df = 0.01, max_df=0.90, stop_words='english')
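# min_df=0.01 / max_df=0.90 keep only terms appearing in between 1% and 90% of
# the essays; stop_words='english' applies scikit-learn's stop list on top of
# the NLTK-based removal already done during preprocessing.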
x_transform = tfidf.fit_transform(content)
x_transform = sparse.hstack((x_transform, train_numberOfSentences.values[:, None]))
x_transform = sparse.hstack((x_transform, train_numberOfWords.values[:, None]))
x_transform = SVD_for_S.SVD(x_transform.toarray(), dimensions)
#x_transform = svd.fit_transform(x_transform)
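# SVD_for_S.SVD is a repository-local helper; it is assumed to project the
# feature matrix onto its top `dimensions` singular directions (a truncated
# SVD), roughly equivalent to the commented-out scikit-learn TruncatedSVD above:
#   U, s, Vt = numpy.linalg.svd(matrix, full_matrices=False)
#   reduced  = U[:, :dimensions] * s[:dimensions]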
nearestNeighbors = NearestNeighbors(n_neighbors=neighbors+1)
nearestNeighbors.fit(x_transform)
test_dist, test_ind = nearestNeighbors.kneighbors(x_transform)
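# The model is queried with the same matrix it was fit on, so the first column
# of test_ind/test_dist is each essay itself (distance 0). The aggregation
# loops below skip that self-match, leaving `neighbors` true neighbours each.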
#----Using regular median----#
prediction_list = list()
for item in test_ind:
    prediction_list.append(round((Y[item[math.floor(neighbors/2.0)+1]] + Y[item[math.ceil(neighbors/2.0)+1]]) / 2.0))
accuracy = cohen_kappa_score(Y, prediction_list, weights='quadratic')
print('Using vanilla median', accuracy)
#----Using mean score----#
prediction_list = list()
for val in test_ind:
    total = 0
    for i in val[1:]:
        total += Y[i]
    avg = round(total / (len(val)-1))
    prediction_list.append(avg)
accuracy = cohen_kappa_score(Y, prediction_list, weights='quadratic')
print('Using mean', accuracy)
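# weightedmedianfunc.weighted_median is a repository-local helper (not shown
# here). It is assumed to return a weighted median of the neighbour scores,
# with weights derived from the supplied neighbour distances. A minimal sketch
# under that assumption, using inverse-distance weights, would look like:
#   def weighted_median(scores, dists):
#       weights = [1.0 / (d + 1e-9) for d in dists]
#       pairs = sorted(zip(scores, weights))
#       cum, half = 0.0, sum(weights) / 2.0
#       for s, w in pairs:
#           cum += w
#           if cum >= half:
#               return s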
#----Using custom weighted median----#
prediction_list = list()
n = len(test_ind)
for i in range(0, n):
    scores_list = list()
    dist_list = test_dist[i][1:]
    for j in test_ind[i][1:]:
        scores_list.append(Y[j])
    prediction_list.append(round(weightedmedianfunc.weighted_median(scores_list, dist_list)))
accuracy = cohen_kappa_score(Y, prediction_list, weights='quadratic')
print('Using weighted median', accuracy)