Skip to content

Commit

Permalink
More careful truncation of the suspicious training MPDS values
Browse files Browse the repository at this point in the history
  • Loading branch information
blokhin committed Mar 15, 2018
1 parent 355a58e commit 534e1cf
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 70 deletions.
56 changes: 38 additions & 18 deletions mpds_ml_labs/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,42 +6,62 @@
import numpy as np


# Registry of the physical properties handled by the ML models, keyed by
# a one-letter property id (the same letter is looked up in model and data
# file names such as "ml<id>_..." / "df<id>_...").
#
# Fields of every entry:
#   name     -- property name, also used verbatim as the MPDS "props" query
#   units    -- units string; MPDS rows given in other units are filtered out
#   symbol   -- HTML snippet used as the property symbol
#   rounding -- number of decimal digits passed to round() for value and MAE
#   interval -- [lower, upper] bounds of plausible values; training values
#               outside the open interval (lower, upper) are discarded
prop_semantics = {
    # NB disabled: band gap requires additional treatment for zero values
    #'w': {
    #    'name': 'band gap for direct transition',
    #    'units': 'eV',
    #    'symbol': 'e<sub>dir.</sub>',
    #    'rounding': 1,
    #    'interval': [0.01, 20]
    #},
    'z': {
        'name': 'isothermal bulk modulus',
        'units': 'GPa',
        'symbol': 'B',
        'rounding': 0,
        'interval': [0.5, 2000]
    },
    'y': {
        'name': 'enthalpy of formation',
        'units': 'kJ g-at.-1',
        'symbol': '&Delta;H',
        'rounding': 0,
        'interval': [-900, 200]
    },
    'x': {
        'name': 'heat capacity at constant pressure',
        'units': 'J K-1 g-at.-1',
        'symbol': 'C<sub>p</sub>',
        'rounding': 0,
        'interval': [0, 500]
    },
    'k': {
        'name': 'Seebeck coefficient',
        'units': 'muV K-1',
        'symbol': 'S',
        'rounding': 1,
        'interval': [-1000, 1000]
    },
    'm': {
        'name': 'temperature for congruent melting',
        'units': 'K',
        'symbol': 'T<sub>melt</sub>',
        'rounding': 0,
        'interval': [10, 5000]
    },
    'd': {
        'name': 'Debye temperature',
        'units': 'K',
        'symbol': '&Theta;<sub>D</sub>',
        'rounding': 0,
        'interval': [10, 2000]
    },
    't': {
        'name': 'linear thermal expansion coefficient',
        'units': 'K-1',
        # was '&Theta;<sub>D</sub>', a copy-paste of the Debye temperature
        # symbol; alpha is the conventional symbol for thermal expansion
        'symbol': '&alpha;',
        'rounding': 6,
        'interval': [-0.001, 0.001]
    }
}

# Backward-compatible alias for code still importing the pre-rename name
human_names = prop_semantics

Expand Down Expand Up @@ -112,9 +132,9 @@ def load_ml_model(prop_model_files):
continue

basename = file_name.split(os.sep)[-1]
if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in human_names:
if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in prop_semantics:
prop_id = basename[2:3]
print("Detected property %s in file %s" % (human_names[prop_id]['name'], basename))
print("Detected property %s in file %s" % (prop_semantics[prop_id]['name'], basename))
else:
prop_id = str(n)
print("No property name detected in file %s" % basename)
Expand All @@ -132,7 +152,7 @@ def load_ml_model(prop_model_files):
def get_legend(pred_dict):
legend = {}
for key in pred_dict.keys():
legend[key] = human_names.get(key, {
legend[key] = prop_semantics.get(key, {
'name': 'Unspecified property ' + str(key),
'units': 'arb.u.',
'symbol': 'P' + str(key),
Expand All @@ -147,7 +167,7 @@ def ase_to_ml_model(ase_obj, ml_model):
d_dim = len(descriptor)

if not ml_model: # testing
return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in human_names.keys()}, None
return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in prop_semantics.keys()}, None

for prop_id, regr in ml_model.items(): # production

Expand All @@ -164,8 +184,8 @@ def ase_to_ml_model(ase_obj, ml_model):
return None, str(e)

result[prop_id] = {
'value': round(prediction, human_names[prop_id]['rounding']),
'mae': round(regr.metadata['mae'], human_names[prop_id]['rounding']),
'value': round(prediction, prop_semantics[prop_id]['rounding']),
'mae': round(regr.metadata['mae'], prop_semantics[prop_id]['rounding']),
'r2': regr.metadata['r2']
}

Expand Down
39 changes: 13 additions & 26 deletions mpds_ml_labs/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,14 @@

from mpds_client import MPDSDataRetrieval, APIError

from prediction import human_names
from struct_utils import detect_format, poscar_to_ase, symmetrize, get_formula
from prediction import prop_semantics
from struct_utils import detect_format, poscar_to_ase, symmetrize, get_formula, sgn_to_crsystem
from cif_utils import cif_to_ase


req = httplib2.Http()
client = MPDSDataRetrieval()

def sgn_to_crsystem(number):
    """
    Map an international space group number to its crystal system name.

    number -- space group number, expected within 1-230
    Returns one of: 'triclinic', 'monoclinic', 'orthorhombic',
    'tetragonal', 'trigonal', 'hexagonal', 'cubic'.
    Raises ValueError for a number outside 1-230, which was previously
    reported as 'triclinic' without any warning.
    """
    if not 1 <= number <= 230:
        raise ValueError("Space group number must be within 1-230, got %r" % (number,))

    if number >= 195:
        return 'cubic'
    elif number >= 168:
        return 'hexagonal'
    elif number >= 143:
        return 'trigonal'
    elif number >= 75:
        return 'tetragonal'
    elif number >= 16:
        return 'orthorhombic'
    elif number >= 3:
        return 'monoclinic'
    else:
        # only 1 and 2 remain here
        return 'triclinic'

def make_request(address, data={}, httpverb='POST', headers={}):

address += '?' + urlencode(data)
Expand All @@ -47,6 +31,9 @@ def make_request(address, data={}, httpverb='POST', headers={}):

if __name__ == '__main__':

try: sys.argv[1]
except IndexError: sys.exit("Structure file must be given!")

structure = open(sys.argv[1]).read()
fmt = detect_format(structure)

Expand All @@ -72,29 +59,29 @@ def make_request(address, data={}, httpverb='POST', headers={}):
raise RuntimeError(answer['error'])

formulae_categ, lattices_categ = get_formula(ase_obj), sgn_to_crsystem(ase_obj.info['spacegroup'].no)
for prop_id, pdata in human_names.items():
for prop_id, pdata in prop_semantics.items():
try:
resp = client.get_dataframe({
'formulae': formulae_categ,
'lattices': lattices_categ,
'props': pdata['name']
})
except APIError as e:
human_names[prop_id]['factual'] = None
prop_semantics[prop_id]['factual'] = None
if e.code == 1:
continue
else:
raise

resp['Value'] = resp['Value'].astype('float64') # to treat values out of bounds given as str
resp = resp[resp['Units'] == pdata['units']]
human_names[prop_id]['factual'] = np.median(resp['Value'])
prop_semantics[prop_id]['factual'] = np.median(resp['Value'])

for prop_id, pdata in answer['prediction'].items():
print("{0:40} = {1:6}, factual {2:6} (MAE = {3:4}), {4}".format(
human_names[prop_id]['name'],
print("{0:40} = {1:6}, factual {2:8} (MAE = {3:4}), {4}".format(
prop_semantics[prop_id]['name'],
pdata['value'],
human_names[prop_id]['factual'] or 'absent',
abs(pdata['value'] - human_names[prop_id]['factual']) if human_names[prop_id]['factual'] else 'unknown',
human_names[prop_id]['units']
prop_semantics[prop_id]['factual'] or 'absent',
pdata['mae'],
prop_semantics[prop_id]['units']
))
6 changes: 3 additions & 3 deletions mpds_ml_labs/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from struct_utils import detect_format, poscar_to_ase, symmetrize
from cif_utils import cif_to_ase
from prediction import ase_to_ml_model, load_ml_model, human_names
from prediction import ase_to_ml_model, load_ml_model, prop_semantics
from common import ML_MODELS, DATA_PATH


Expand Down Expand Up @@ -57,8 +57,8 @@

for prop_id, pdata in prediction.items():
print("{0:40} = {1:6} (MAE = {2:4}), {3}".format(
human_names[prop_id]['name'],
prop_semantics[prop_id]['name'],
pdata['value'],
pdata['mae'],
human_names[prop_id]['units']
prop_semantics[prop_id]['units']
))
40 changes: 17 additions & 23 deletions train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from mpds_client import MPDSDataRetrieval, MPDSExport

from prediction import get_descriptor, human_names
from mpds_ml_labs.prediction import get_descriptor, prop_semantics


def get_regr(a=None, b=None):
Expand Down Expand Up @@ -57,13 +57,13 @@ def mpds_get_data(prop_id, descriptor_kappa):
Fetch, massage, and save dataframe from the MPDS
NB currently pressure is not taken into account!
"""
print("Getting %s with descriptor kappa = %s" % (human_names[prop_id]['name'], descriptor_kappa))
print("Getting %s with descriptor kappa = %s" % (prop_semantics[prop_id]['name'], descriptor_kappa))
starttime = time.time()

client = MPDSDataRetrieval()

props = client.get_dataframe(
{"props": human_names[prop_id]['name']},
{"props": prop_semantics[prop_id]['name']},
fields={'P': [
'sample.material.chemical_formula',
'sample.material.phase_id',
Expand All @@ -77,23 +77,17 @@ def mpds_get_data(prop_id, descriptor_kappa):
)
props['Value'] = props['Value'].astype('float64') # to treat values out of bounds given as str
props = props[np.isfinite(props['Phase'])]
props = props[props['Units'] == human_names[prop_id]['units']]

# filtering some abnormal values
# these should be corrected by LPF editors soon
if prop_id == 'z':
props = props[props['Value'] < 2000]
#elif prop_id == 'w': # NB this requires additional treatment for zero band gaps
# props = props[(props['Value'] > 0) & (props['Value'] < 20)]
elif prop_id == 'u':
props = props[props['Value'] > 0]

to_drop = props[
(props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
props = props[props['Units'] == prop_semantics[prop_id]['units']]
props = props[
(props['Value'] > prop_semantics[prop_id]['interval'][0]) & \
(props['Value'] < prop_semantics[prop_id]['interval'][1])
]

print("Rows to drop by criteria: %s" % len(to_drop))
props.drop(to_drop.index, inplace=True)
if prop_id not in ['m', 'd']:
to_drop = props[
(props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
]
print("Rows to neglect by temperature: %s" % len(to_drop))
props.drop(to_drop.index, inplace=True)

phases_compounds = dict(zip(props['Phase'], props['Compound'])) # keep the mapping for future
avgprops = props.groupby('Phase')['Value'].mean().to_frame().reset_index().rename(columns={'Value': 'Avgvalue'})
Expand Down Expand Up @@ -163,9 +157,9 @@ def tune_model(data_file):
Load saved data and perform simple regressor parameter tuning
"""
basename = data_file.split(os.sep)[-1]
if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in human_names:
if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in prop_semantics:
tag = basename[2:3]
print("Detected property %s" % human_names[tag]['name'])
print("Detected property %s" % prop_semantics[tag]['name'])
else:
tag = None
print("No property name detected")
Expand Down Expand Up @@ -213,14 +207,14 @@ def tune_model(data_file):
sys.exit(
"What to do?\n"
"Please, provide either a *prop_id* letter (%s) for a property data to be downloaded and fitted,\n"
"or a data *filename* for tuning the model." % ", ".join(human_names.keys())
"or a data *filename* for tuning the model." % ", ".join(prop_semantics.keys())
)
try:
descriptor_kappa = int(sys.argv[2])
except:
descriptor_kappa = None

if arg in human_names.keys():
if arg in prop_semantics.keys():

struct_props = mpds_get_data(arg, descriptor_kappa)

Expand Down

0 comments on commit 534e1cf

Please sign in to comment.