-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFeatureEngine.py
147 lines (117 loc) · 5.08 KB
/
FeatureEngine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from numpy import mean, absolute
import pandas as pd
import re, os
from pymatgen.core.composition import Composition
class Features:
    """Turn a CSV of chemical formulas into numeric feature rows.

    The formula file is read without a header row: column 0 is the chemical
    formula, column 1 the target value and column 2 the crystal-system name.
    The atomic data file is also read without a header row: column 0 is the
    element symbol and every further column is one per-element descriptor.
    """

    # Position of each crystal system in the one-hot vector built by
    # get_encoded_sym(); the order is part of the feature layout.
    CRYSTAL_SYSTEMS = (
        'monoclinic',
        'triclinic',
        'orthorhombic',
        'trigonal',
        'hexagonal',
        'cubic',
        'tetragonal',
    )

    # Element symbol followed by an optional integer or decimal amount.
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    _ELEMENT_AMOUNT = re.compile(r'([A-Z][a-z]?)([0-9]*\.?[0-9]*)')

    def __init__(self, formula_file, atomic_data_file='DATA/atomic_data.csv'):
        """Remember the input paths; no file is opened here.

        Args:
            formula_file: Path of the header-less formula/target/symmetry CSV.
            atomic_data_file: Path of the header-less per-element descriptor
                CSV. Defaults to the location the class has always used.
        """
        self.formula_file = formula_file
        self.atomic_data_file = atomic_data_file

    @classmethod
    def _parse_formula(cls, formula):
        """Split a formula string into ``(symbol, amount)`` pairs.

        A missing amount counts as 1. Whole-number amounts stay ``int`` so
        integer formulas behave exactly as before; decimal amounts become
        ``float`` instead of raising or being truncated.
        """
        pairs = []
        for symbol, text in cls._ELEMENT_AMOUNT.findall(formula):
            if text == '':
                amount = 1
            elif text.isdigit():
                amount = int(text)
            else:
                amount = float(text)
            pairs.append((symbol, amount))
        return pairs

    @staticmethod
    def _descriptor_features(pairs, atomic_descriptors,
                             addAVG=True, addAAD=True, addMD=False):
        """Return the per-descriptor statistics for one parsed formula.

        For every descriptor dict, in order, the enabled values are appended:
        amount-weighted average, amount-weighted average absolute deviation
        from that average, and the largest difference between two elements.

        Raises:
            KeyError: If a descriptor has no entry for one of the elements.
        """
        total = sum(amount for _, amount in pairs)
        features = []
        for des in atomic_descriptors:
            weighted = [(des[symbol], amount) for symbol, amount in pairs]
            # Start from zero for every descriptor so that the mean of one
            # descriptor can never leak into the next one.
            avg = sum(value * amount for value, amount in weighted) / total
            if addAVG:
                features.append(avg)
            if addAAD:
                deviation = sum(abs(value - avg) * amount
                                for value, amount in weighted)
                features.append(deviation / total)
            if addMD:
                # The largest pairwise |a - b| is simply max - min.
                values = [value for value, _ in weighted]
                features.append(max(values) - min(values))
        return features

    def make_features(self, atomic_descriptors, formula_list, targets, encoded_sym,
                      addAVG=True, addAAD=True, addMD=False, addCV=False):
        """Build one feature row per formula.

        Each row is laid out as: target value, one-hot crystal system, then
        for every descriptor the enabled statistics (average, average absolute
        deviation, maximum difference), and finally, if requested, the
        element-ratio vector.

        Args:
            atomic_descriptors: List of ``{element symbol: value}`` dicts, one
                per descriptor.
            formula_list: Chemical formulas; each is normalised through
                ``Composition(formula).formula`` before parsing.
            targets: Target value for each formula, same order.
            encoded_sym: One-hot crystal-system list for each formula.
            addAVG: Append the amount-weighted average of each descriptor.
            addAAD: Append the average absolute deviation of each descriptor.
            addMD: Append the maximum element-to-element difference.
            addCV: Append the fraction of every known element, ordered like
                the keys of the first descriptor dict.

        Returns:
            A list with one feature list per formula.

        Raises:
            ValueError: If no element can be parsed from a formula.
            KeyError: If an element is missing from the descriptors.
        """
        print('--------Generating Features------------')
        # Column of every element inside the element-ratio vector; built once
        # instead of once per formula.
        symbol_index = (
            {symbol: pos for pos, symbol in enumerate(atomic_descriptors[0])}
            if atomic_descriptors else {}
        )
        all_feature_list = []
        for row, formula in enumerate(formula_list):
            pairs = self._parse_formula(Composition(formula).formula)
            if not pairs:
                raise ValueError(
                    f'No elements could be parsed from formula {formula!r}')
            feature_list = [targets[row], *encoded_sym[row]]
            feature_list.extend(self._descriptor_features(
                pairs, atomic_descriptors, addAVG, addAAD, addMD))
            if addCV:
                total = sum(amount for _, amount in pairs)
                comp_vector = [0] * len(symbol_index)
                for symbol, amount in pairs:
                    comp_vector[symbol_index[symbol]] = amount / total
                feature_list.extend(comp_vector)
            all_feature_list.append(feature_list)
        return all_feature_list

    def _read_formula_column(self, column):
        """Read the formula file and return one of its columns as a list."""
        df_mat = pd.read_csv(self.formula_file, header=None)
        return [record[column] for record in df_mat.values.tolist()]

    def get_formula_list(self):
        """Return column 0 of the formula file: the chemical formulas."""
        return self._read_formula_column(0)

    def get_encoded_sym(self):
        """Return a 7-element one-hot list per row for column 2 of the formula file.

        The position of the 1 follows ``CRYSTAL_SYSTEMS``.

        Raises:
            ValueError: If a row names a crystal system that is not in
                ``CRYSTAL_SYSTEMS``. Previously such a row silently reused the
                encoding of the row before it.
        """
        position = {name: pos for pos, name in enumerate(self.CRYSTAL_SYSTEMS)}
        encoded_sym = []
        for row, sym in enumerate(self._read_formula_column(2)):
            if sym not in position:
                raise ValueError(
                    f'Unknown crystal system {sym!r} in row {row} '
                    f'of {self.formula_file}')
            digits = [0] * len(self.CRYSTAL_SYSTEMS)
            digits[position[sym]] = 1
            encoded_sym.append(digits)
        return encoded_sym

    def get_targets(self):
        """Return column 1 of the formula file: the target values."""
        return self._read_formula_column(1)

    def get_atomic_descriptors(self):
        """Return one ``{element symbol: value}`` dict per descriptor column.

        Column 0 of the atomic data file supplies the symbols; columns 1..n
        each become one dict, in file order.
        """
        df_des = pd.read_csv(self.atomic_data_file, header=None)
        # Convert the frame once rather than once per descriptor column.
        records = df_des.values.tolist()
        elements = [record[0] for record in records]
        return [
            dict(zip(elements, (record[col] for record in records)))
            for col in range(1, len(df_des.columns))
        ]

    def get_features(self):
        """Load both input files and return the default feature rows.

        Uses the default flags of make_features (average and average absolute
        deviation on, maximum difference and element-ratio vector off).
        """
        return self.make_features(
            atomic_descriptors=self.get_atomic_descriptors(),
            encoded_sym=self.get_encoded_sym(),
            targets=self.get_targets(),
            formula_list=self.get_formula_list(),
        )