-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclassification.py
110 lines (83 loc) · 4.08 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""classification.ipynb
Automatically generated by Colaboratory.
"""
## An inorganic ABX3 perovskite dataset for target property prediction and classification using machine learning
# AUTHOR - (1) * Ericsson Chenebuah, (1) Michel Nganbe and (2) Alain Tchagang
# 1: Department of Mechanical Engineering, University of Ottawa, 75 Laurier Ave. East, Ottawa, ON, K1N 6N5 Canada
# 2: Digital Technologies Research Centre, National Research Council of Canada, 1200 Montréal Road, Ottawa, ON, K1A 0R6 Canada
# * email: echen013@uottawa.ca
# (09-Feb-2022)
import pandas as pd
import pylab as pl
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# %matplotlib inline
import matplotlib.pyplot as plt
df = pd.read_csv('oqmd_data.csv')  # Reading the data

# Downsample majority classes to provide only 2089 samples.
# For each of the four crystal-system labels found in the `cs` column, draw
# `i` rows with a fixed seed (reproducible draw) and remember their index
# labels. Rows whose `cs` value is not one of these four labels are dropped
# by the final `.loc` selection.
# NOTE: `sample(n=i)` raises ValueError if a class holds fewer than `i` rows.
i = 2089
s1 = df.cs[df.cs.eq('cubic')].sample(n=i, random_state=9).index
s2 = df.cs[df.cs.eq('trigonal')].sample(n=i, random_state=9).index
s3 = df.cs[df.cs.eq('orthorhombic')].sample(n=i, random_state=9).index
s4 = df.cs[df.cs.eq('tetragonal')].sample(n=i, random_state=9).index

# Merge the four label sets with Index.union(). The `|` operator on Index
# objects is NOT a set union in pandas >= 2.0 (it is an element-wise OR), and
# since all four indexes have the same length it would not even fail loudly.
df = df.loc[s1.union(s2).union(s3).union(s4)]
# Train-test splitting strategy to produce only original samples for testing
# 'n' changes based on train-test split ratio. Each class provides 'n*100%'
# of original samples to the test set.
n = 0.3
s1 = df.cs[df.cs.eq('cubic')].sample(frac=n, random_state=9).index
s2 = df.cs[df.cs.eq('trigonal')].sample(frac=n, random_state=9).index
s3 = df.cs[df.cs.eq('orthorhombic')].sample(frac=n, random_state=9).index
s4 = df.cs[df.cs.eq('tetragonal')].sample(frac=n, random_state=9).index

# Test set: the union of the per-class samples. Index.union() is used because
# `|` between Index objects is an element-wise OR (not a set union) in
# pandas >= 2.0.
df2 = df.loc[s1.union(s2).union(s3).union(s4)]

# Training set: the rows of `df` that are not in `df2`.
# After concatenating `df` and `df2`, every test row occurs twice, so grouping
# on all columns and keeping only the groups of size one leaves the rows that
# were never sampled for testing.
# NOTE(review): this also removes any rows of `df` that are exact duplicates
# of each other, and groupby's default handling skips rows with NaN in a
# column -- confirm that is acceptable for this dataset.
df1 = pd.concat([df, df2])
df1 = df1.reset_index(drop=True)
df_gpby = df1.groupby(list(df1.columns))
idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
df1 = df1.reindex(idx)

# Targets: the `cs1` column cast to float64.
# (presumably the numeric encoding of `cs` -- verify against the CSV)
y_train = np.asarray(df1['cs1'].astype('float64'))
y_test = np.asarray(df2['cs1'].astype('float64'))

# Features: every column except identifiers, labels and the Ef/Eg columns.
# `columns=` replaces the positional `axis=1` argument, which pandas 2.0 no
# longer accepts.
non_feature_columns = ["name","entry_id","icsd_id","sg","cs","cs1","Ef","Eg"]
X_train = np.asarray(df1.drop(columns=non_feature_columns))
X_test = np.asarray(df2.drop(columns=non_feature_columns))
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# CLASSIFICATION EXPERIMENT 1: Support Vector machine Classifier (SVC)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Hyper-parameters of the RBF-kernel SVC used in this experiment.
svc_settings = dict(kernel='rbf', C=100.0, gamma='auto', probability=True)

# Standardise the features, then classify; the scaler is fitted on the
# training data only because it lives inside the pipeline.
feature_scaler = StandardScaler()
svc_classifier = SVC(**svc_settings)
model = make_pipeline(feature_scaler, svc_classifier)

# Train on the training split and predict the labels of the test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# UNCOMMENT AND REPEAT PROCESS FOR XGBOOST, LGBC AND RFC
# CLASSIFICATION EXPERIMENT 2: eXtreme Gradient Boosting (XGBoost)
#import xgboost as xgb
#model = xgb.XGBClassifier(n_estimators=500)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)
# CLASSIFICATION EXPERIMENT 3: Light Gradient Boosting Classifier (LGBC)
#import lightgbm as ltb
#model = ltb.LGBMClassifier(n_estimators=500)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)
# CLASSIFICATION EXPERIMENT 4: Random Forest Classifier (RFC)
#from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators=1000, criterion='gini')
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)
# Report overall accuracy (as a percentage) and the per-class
# precision/recall/F1 table for the test-set predictions.
print("--------------------------------------------------------------------")
print("ACCURACY EVALUATION")
accuracy_percent = accuracy_score(y_test, y_pred) * 100
print("percentage of correctly predicted classes: %.4f" % accuracy_percent)
# NOTE(review): the display names are matched to the sorted label values in
# y_test/y_pred -- confirm the `cs1` encoding follows this order.
class_display_names = ['Cubic', 'Trigonal', 'Orthorhombic', 'Tetragonal']
per_class_report = classification_report(
    y_test, y_pred, target_names=class_display_names
)
print(per_class_report)
# Illustrate classification result using confusion matrix
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Wrap the raw counts in a labelled DataFrame so the heatmap axes show class
# names.
# NOTE(review): assumes all four classes occur in y_test/y_pred and that the
# sorted label values follow this name order -- confirm against the `cs1`
# encoding.
cm_df = pd.DataFrame(cm,
                     index = ['Cubic','Trigonal','Orthorhombic','Tetragonal'],
                     columns = ['Cubic','Trigonal','Orthorhombic','Tetragonal'])

import seaborn as sns
sns.heatmap(cm_df, annot=True, fmt='g', annot_kws={"size": 18})
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
#plt.savefig('SVC.jpg', dpi=1000, bbox_inches = 'tight')
# `plt.show` without parentheses only references the function; call it so the
# figure is actually displayed when this runs as a script.
plt.show()