# featurize_custom.py

"""
Module to create chemical and symmetry-based features using custom functions

Author: Son Gyo Jung
Email: sgj13@cam.ac.uk
"""

import os                                                                                                                                                                                  
import pandas as pd
import numpy as np
import chemparse
import requests
import joblib
import pathlib

from pymatgen import MPRester 
from pymatgen.core import Composition, Element    
from lxml import html


class use_custom_descriptors():
    """
    Class to generate custom (chemical and symmetry-based) features

    On construction the chemical data file
    '<cwd>/retrieved_data/<name_of_parent_folder>/<name_of_parent_folder>.csv'
    is read and left-joined with the space group reference table loaded from
    'space_group_ref.pkl' (looked up relative to the current working directory).

    args: 
        (1) name_of_parent_folder (type:str) - must match name of the chemical data file generated by the 'retrieve_data' module 
        (2) csv (type:bool) - whether to save data as csv (in addition to the pkl file)

    return: 
        (1) pandas.Dataframe of custom features (pkl and/or csv)
    """

    # Element symbols for which '<symbol>_af' (atomic fraction) and
    # '<symbol>_wf' (weight fraction) columns are generated, in column order
    ELEMENT_SYMBOLS = (
        'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
        'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
        'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te',
        'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm',
        'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn',
        'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
        'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og',
    )

    # Element categories passed to Composition.contains_element_type;
    # one boolean column is generated per category, in this order
    ELEMENT_CATEGORIES = (
        'noble_gas', 'transition_metal', 'post_transition_metal', 'rare_earth_metal', 'metal', 'metalloid',
        'alkali', 'alkaline', 'halogen', 'chalcogen', 'lanthanoid', 'actinoid', 'quadrupolar', 's-block', 'p-block',
        'd-block', 'f-block',
    )

    def __init__(self, name_of_parent_folder, csv):
        self.name_of_parent_folder = name_of_parent_folder
        self.csv = csv

        self.cur_dir = pathlib.Path().resolve()
        self.directory = os.path.join(self.cur_dir, 'retrieved_data', self.name_of_parent_folder)

        # Filled in by featurize(); join() refuses to run while this is None
        self.df4 = None

        # Import chemical data
        self.df_chem = pd.read_csv(self._path('', '.csv'))

        # Import space group ref data
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code - only use a reference file from a trusted source.
        self.df_sg = joblib.load('space_group_ref.pkl')

        # Drop unwanted columns
        self.df_sg = self.df_sg.drop(columns=['Unnamed: 0', 'full_name', 'x', 'y', 'z'])

        # Join the two dataframes on the space group number
        self.df = pd.merge(
            self.df_chem,
            self.df_sg,
            how='left',
            left_on=['spacegroup.number'],
            right_on=['sg_no'],
        )


    def _path(self, prefix, extension):
        """
        Build '<directory>/<prefix><name_of_parent_folder><extension>'

        arg:
            (a) prefix (type:str) - text placed before the parent folder name
            (b) extension (type:str) - file extension including the leading dot

        return:
            (a) str - path inside self.directory
        """

        return os.path.join(self.directory, prefix + str(self.name_of_parent_folder) + extension)


    def movecol(self, dataframe, cols_to_move = None, ref_col = '', place = 'after'):
        """
        Function to rearrange columns

        arg: 
            (a) dataframe (pandas.Dataframe) - dataframe whose columns are rearranged
            (b) cols_to_move (list) - list of columns to move (defaults to no columns)
            (c) ref_col (type:str) - reference column 
            (d) place (type:str) - whether to move the specified columns 'before' or 'after' the reference column (set to 'after' as default)

        return:
            (a) pandas.Dataframe with the same columns in the new order

        raise:
            (a) ValueError - if 'place' is neither 'before' nor 'after', or if 'ref_col' is not a column of the dataframe
        """

        if place not in ('after', 'before'):
            raise ValueError("place must be 'before' or 'after', got: " + repr(place))

        moved = [] if cols_to_move is None else list(cols_to_move)
        cols = dataframe.columns.tolist()
        ref_pos = cols.index(ref_col)

        if place == 'after':
            head = cols[:ref_pos + 1]
        else:
            # The reference column itself has to follow the moved columns
            head = cols[:ref_pos]
            moved = moved + [ref_col]

        moved_set = set(moved)
        head = [c for c in head if c not in moved_set]

        # Everything not yet placed keeps its original relative order
        placed = moved_set.union(head)
        tail = [c for c in cols if c not in placed]

        return dataframe[head + moved + tail]


    def _composition_features(self, formulas):
        """
        Compute the composition-derived feature columns for a series of formulas

        arg:
            (a) formulas (pandas.Series) - chemical formulas (the 'pretty_formula' column)

        return:
            (a) pandas.Dataframe indexed like 'formulas' with, in order: '<symbol>_af' columns, 
                '<symbol>_wf' columns, 'weight', 'total_e', 'avg_electroneg' and one column per 
                entry of ELEMENT_CATEGORIES
        """

        # Parse every distinct formula only once; the resulting Composition
        # is reused for all features below
        parsed = {}

        def composition_of(formula):
            key = str(formula)
            if key not in parsed:
                parsed[key] = Composition(key)
            return parsed[key]

        columns = {}

        # The substring test is a cheap pre-filter that also avoids building
        # Element objects for symbols that never occur in the data. A false
        # positive (e.g. 'C' in 'CaO') still goes through pymatgen, which is
        # expected to report a zero fraction for an absent element.

        # Generate atomic fraction features (https://pymatgen.org/pymatgen.core.composition.html)
        for symbol in self.ELEMENT_SYMBOLS:
            columns[symbol + '_af'] = formulas.map(
                lambda f, symbol=symbol: composition_of(f).get_atomic_fraction(Element(symbol)) if symbol in f else 0
            )

        # Generate weight fraction features (https://pymatgen.org/pymatgen.core.composition.html)
        for symbol in self.ELEMENT_SYMBOLS:
            columns[symbol + '_wf'] = formulas.map(
                lambda f, symbol=symbol: composition_of(f).get_wt_fraction(Element(symbol)) if symbol in f else 0
            )

        # Total molecular weight, total electrons and average electronegativity
        columns['weight'] = formulas.map(lambda f: composition_of(f).weight)
        columns['total_e'] = formulas.map(lambda f: composition_of(f).total_electrons)
        columns['avg_electroneg'] = formulas.map(lambda f: composition_of(f).average_electroneg)

        # Check if Composition contains any elements matching a given category
        for category in self.ELEMENT_CATEGORIES:
            columns[category] = formulas.map(
                lambda f, category=category: composition_of(f).contains_element_type(category)
            )

        # Assemble all columns in a single step instead of growing a frame
        # column by column
        return pd.DataFrame(columns, index=formulas.index)


    def featurize(self):
        """
        Create custom features which includes: 

        (a) symmetry-based features, 
        (b) composition, 
        (c) atomic fraction, 
        (d) weight fraction, 
        (e) weight, 
        (f) total electrons,
        (g) electronegativity, 
        (h) noble_gas, 
        (i) transition_metal, 
        (j) post_transition_metal, 
        (k) rare_earth_metal, 
        (l) metal, metalloid,  
        (m) alkali, 
        (n) alkaline, 
        (o) halogen,
        (p) chalcogen, 
        (q) lanthanoid, 
        (r) actinoid, 
        (s) quadrupolar, 
        (t) s-block, 
        (u) p-block, 
        (v) d-block, 
        (w) f-block, 
        (x) magnetic order

        The result is stored in self.df4 and saved as 
        'custom_features_<name_of_parent_folder>.pkl' (and '.csv' if self.csv is set).
        """

        # Create composition column: list of the element symbols in the formula
        self.df['composition'] = self.df['pretty_formula'].apply(
            lambda formula: list(chemparse.parse_formula(formula).keys())
        )

        # Move column
        self.df = self.movecol(
            dataframe=self.df,
            cols_to_move=['composition'],
            ref_col='task_id',
            place='after',
        )

        # Append the composition-derived features to the chemical/symmetry data
        features = self._composition_features(self.df['pretty_formula'])
        self.df4 = pd.concat([self.df, features], axis=1, sort=False)

        # Save data as pkl
        joblib.dump(self.df4, self._path('custom_features_', '.pkl'))

        print('Successfully saved data as: ', 'custom_features_' +  str(self.name_of_parent_folder) + '.pkl')

        # Optionally save data as csv
        if self.csv:
            self.df4.to_csv(self._path('custom_features_', '.csv'))

            print('Successfully saved data as: ', 'custom_features_' +  str(self.name_of_parent_folder) + '.csv')


    def join(self):
        """
        Join features with CFID features

        The CFID features are read from 'CFID_features_<name_of_parent_folder>.pkl', 
        falling back to the '.csv' file of the same name if the pickle cannot be loaded. 
        The joined data is saved as 'cfid_and_custom_features_<name_of_parent_folder>.pkl' 
        (and '.csv' if self.csv is set).

        raise:
            (a) AttributeError - if featurize() has not been called yet
            (b) FileNotFoundError - if neither the pkl nor the csv file of CFID features exists
        """

        # getattr keeps this check valid even for instances created without __init__
        if getattr(self, 'df4', None) is None:
            raise AttributeError('Custom features are not available: call featurize() before join()')

        # Import CFID features
        try:
            df_cifd = joblib.load(self._path('CFID_features_', '.pkl'))

        except Exception:
            # Any failure to load the pickle falls back to the csv file
            print('Pickle of CFID features does not exist')
            print('Checking for csv file')

            try:
                df_cifd = pd.read_csv(self._path('CFID_features_', '.csv'))

            except FileNotFoundError:
                print('CFID feature file does not exist')
                raise

        # Join the two featurised dataframes
        df_joined = pd.merge(df_cifd, self.df4, how='left', left_on=['task_id'], right_on=['task_id'])

        df_joined = self.movecol(
            dataframe=df_joined,
            cols_to_move=['composition', 'pretty_formula'],
            ref_col='task_id',
            place='after',
        )

        # Save data as pkl
        joblib.dump(df_joined, self._path('cfid_and_custom_features_', '.pkl'))

        print('Data saved as: "cfid_and_custom_features_' +  str(self.name_of_parent_folder) + '.pkl"')

        # Optionally save data as csv
        if self.csv:
            df_joined.to_csv(self._path('cfid_and_custom_features_', '.csv'))

            print('Data saved as: "cfid_and_custom_features_' +  str(self.name_of_parent_folder) + '.csv"')