# classification.py

# -*- coding: utf-8 -*-
"""classification.ipynb

Automatically generated by Colaboratory.

"""

## An inorganic ABX3 perovskite dataset for target property prediction and classification using machine learning

# AUTHOR - (1) * Ericsson Chenebuah, (1) Michel Nganbe and (2) Alain Tchagang 
# 1: Department of Mechanical Engineering, University of Ottawa, 75 Laurier Ave. East, Ottawa, ON, K1N 6N5 Canada
# 2: Digital Technologies Research Centre, National Research Council of Canada, 1200 Montréal Road, Ottawa, ON, K1A 0R6 Canada
# * email: echen013@uottawa.ca 
# (09-Feb-2022)

import pandas as pd
import pylab as pl
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# %matplotlib inline 
import matplotlib.pyplot as plt

# Read the dataset (expected next to this script as 'oqmd_data.csv').
df = pd.read_csv('oqmd_data.csv')

# Balance the classes: draw the same number of rows (i) from each of the four
# crystal systems found in the 'cs' column. random_state is fixed so that the
# same rows are drawn on every run.
i = 2089
sampled_labels = [
    df.cs[df.cs.eq(crystal_system)].sample(n=i, random_state=9).index
    for crystal_system in ('cubic', 'trigonal', 'orthorhombic', 'tetragonal')
]

# Merge the per-class row labels with Index.union(). The `|` operator must not
# be used for this: as a set operation on Index objects it was deprecated, and
# from pandas 2.0 on it performs an element-wise logical OR instead.
keep_labels = sampled_labels[0]
for class_labels in sampled_labels[1:]:
    keep_labels = keep_labels.union(class_labels)

df = df.loc[keep_labels]

# Train-test splitting strategy to produce only original samples for testing:
# a fixed fraction of every class is held out as the test set (df2) and the
# remaining rows form the training set (df1).
n = 0.3  # 'n' changes based on train-test split ratio. Each class provides 'n*100%' of original samples to the test set

test_labels = None
for crystal_system in ('cubic', 'trigonal', 'orthorhombic', 'tetragonal'):
    picked = df.cs[df.cs.eq(crystal_system)].sample(frac=n, random_state=9).index
    # Index.union() is the supported set-union; `|` on Index objects is an
    # element-wise logical OR in pandas >= 2.0 and would select wrong rows.
    test_labels = picked if test_labels is None else test_labels.union(picked)

df2 = df.loc[test_labels]

# Build the training set as "rows of df that are not in df2": after stacking
# df and df2, every held-out row occurs twice, so only rows whose full-row
# group has exactly one member are kept.
# NOTE(review): groupby drops groups whose key contains NaN by default, so rows
# with a missing value in any column would also be left out of df1 - confirm
# that this is intended.
df1 = pd.concat([df, df2])
df1 = df1.reset_index(drop=True)
df_gpby = df1.groupby(list(df1.columns))
idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
df1 = df1.reindex(idx)

# Targets: the 'cs1' column cast to float64.
# NOTE(review): 'cs1' is presumably the numeric encoding of the crystal system
# in 'cs' - confirm against the dataset.
y_train = np.asarray(df1['cs1'].astype('float64'))
y_test = np.asarray(df2['cs1'].astype('float64'))

# Features: every remaining column once the ones listed below are removed.
# The columns are dropped by keyword because passing the axis positionally
# (drop(labels, 1)) was deprecated in pandas 1.3 and removed in pandas 2.0.
excluded_columns = ["name", "entry_id", "icsd_id", "sg", "cs", "cs1", "Ef", "Eg"]
X_train = np.asarray(df1.drop(columns=excluded_columns))
X_test = np.asarray(df2.drop(columns=excluded_columns))

from sklearn.metrics import accuracy_score, classification_report

# CLASSIFICATION EXPERIMENT 1: Support Vector machine Classifier (SVC)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardise the features first, then classify with an RBF-kernel SVC.
feature_scaler = StandardScaler()
svc_classifier = SVC(C=100.0, kernel='rbf', gamma='auto', probability=True)
model = make_pipeline(feature_scaler, svc_classifier)

# Fit on the training split and predict the held-out test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# TO RUN XGBOOST, LGBC OR RANDOM FOREST INSTEAD OF SVC, UNCOMMENT THE CORRESPONDING EXPERIMENT BELOW AND REPEAT THE PROCESS

# CLASSIFICATION EXPERIMENT 2: eXtreme Gradient Boosting (XGBoost)
#import xgboost as xgb
#model = xgb.XGBClassifier(n_estimators=500)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)

# CLASSIFICATION EXPERIMENT 3: Light Gradient Boosting Classifier (LGBC)
#import lightgbm as ltb
#model = ltb.LGBMClassifier(n_estimators=500)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)

# CLASSIFICATION EXPERIMENT 4: Random Forest Classifier (RFC)
#from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators=1000, criterion='gini')
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)

# Print the overall accuracy (as a percentage) followed by the per-class
# report produced by scikit-learn.
class_names = ['Cubic', 'Trigonal', 'Orthorhombic', 'Tetragonal']

print("--------------------------------------------------------------------")
print("ACCURACY EVALUATION")
accuracy_percent = accuracy_score(y_test, y_pred) * 100
print("percentage of correctly predicted classes: %.4f" % accuracy_percent)
report_text = classification_report(y_test, y_pred, target_names=class_names)
print(report_text)

# Illustrate classification result using confusion matrix

from sklearn.metrics import confusion_matrix

# Rows of the matrix are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Wrap the matrix in a DataFrame so the heatmap axes carry the class names.
cm_labels = ['Cubic', 'Trigonal', 'Orthorhombic', 'Tetragonal']
cm_df = pd.DataFrame(cm, index=cm_labels, columns=cm_labels)

import seaborn as sns
sns.heatmap(cm_df, annot=True, fmt='g', annot_kws={"size": 18})
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
#plt.savefig('SVC.jpg', dpi=1000, bbox_inches = 'tight')
# plt.show must be called; the bare attribute reference is a no-op, so the
# figure was never displayed when this file ran as a plain script.
plt.show()