janosh · janosh · Nov 28, 2023 · Nov 28, 2023 · Nov 28, 2023 · Nov 28, 2023
diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,7 @@ models/**/checkpoints
 data/**/*.json*
 data/**/*.gz*
 data/**/*.zip*
+!data/**/*umap-projections.csv.bz2
 
 # slurm + Weights and Biases logs
 wandb/

diff --git a/data/wbm/umap/2d-umap-projections.csv.bz2 b/data/wbm/umap/2d-umap-projections.csv.bz2
diff --git a/pyproject.toml b/pyproject.toml
@@ -65,8 +65,8 @@ running-models = [
   "megnet",
 ]
 3d-structures = ["crystaltoolkit"]
-df-to-pdf = ["jinja2"]
-fetch-data = ["gdown"]
+fetch-wbm-data = ["gdown"]
+make-wbm-umap = ["umap-learn"]
 
 [tool.setuptools.packages.find]
 include = ["matbench_discovery*"]

diff --git a/scripts/wbm_umap_projection.py b/scripts/wbm_umap_projection.py
@@ -0,0 +1,215 @@
+"""This script computes the UMAP projections of matminer features for the WBM and MP
+datasets. The resulting projections are saved to a .npz file.
+
+Needs pip install umap-learn.
+
+Uses Material Structure and Composition Featurizer
+"A critical examination of robustness and generalizability of machine learning
+prediction of materials properties"
+https://www.nature.com/articles/s41524-023-01012-9
+"""
+
+
+# %%
+from __future__ import annotations
+
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import umap
+from pymatgen.core import Structure
+from pymatviz.io import save_fig
+from tqdm import tqdm
+
+from matbench_discovery import DATA_DIR, PDF_FIGS, id_col
+from matbench_discovery.data import DATA_FILES
+
+__author__ = "Philipp Benner, Janosh Riebesell"
+__date__ = "2023-11-28"
+
+
+# %%
+mp_matminer_feat_path = f"{DATA_DIR}/mp/mp-final-structures-matminer-features.csv.bz2"
+wbm_matminer_feat_path = (
+    f"{DATA_DIR}/wbm/wbm-initial-structures-matminer-features.csv.bz2"
+)
+# default n_umap_neighbors=15 results resemble PCA, i.e. a single blob of points.
+# The larger n_neighbors, the more MP point clusters move apart. Those islands may
+# correspond to some material classes (but we didn't test this).
+n_umap_neighbors = 750
+
+
+# %%
+def featurize_dataframe(
+    df_in: pd.DataFrame | pd.Series,
+    struct_col: str = "structure",
+    ignore_errors: bool = True,
+    chunk_size: int = 30,
+) -> pd.DataFrame:
+    """Featurize a dataframe of structures using matminer Featurizers.
+
+    Args:
+        df_in (pd.DataFrame): DataFrame with a column named "structure"
+        struct_col (str, optional): Name of column containing structures.
+            Defaults to "structure".
+        ignore_errors (bool, optional): Whether to ignore errors. Defaults to True.
+        chunk_size (int, optional): Chunk size for parallelization. Defaults to 30.
+
+    Returns:
+        pd.DataFrame: DataFrame with 273 columns containing matminer features
+    """
+    # For featurization
+    from matminer.featurizers.base import MultipleFeaturizer
+    from matminer.featurizers.composition import (
+        ElementProperty,
+        IonProperty,
+        Stoichiometry,
+        ValenceOrbital,
+    )
+    from matminer.featurizers.structure import (
+        ChemicalOrdering,
+        MaximumPackingEfficiency,
+        SiteStatsFingerprint,
+        StructuralHeterogeneity,
+        StructureComposition,
+    )
+
+    df = df_in.to_frame() if isinstance(df_in, pd.Series) else df_in
+    for struct in df[struct_col]:
+        for site in struct:
+            site.to_unit_cell(in_place=True)
+
+    # 128 structural feature
+    struct_feat = [
+        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
+        SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
+        StructuralHeterogeneity(),
+        MaximumPackingEfficiency(),
+        ChemicalOrdering(),
+    ]
+    # 145 compositional features
+    comp_feat = [
+        StructureComposition(Stoichiometry()),
+        StructureComposition(ElementProperty.from_preset("magpie")),
+        StructureComposition(ValenceOrbital(props=["frac"])),
+        StructureComposition(IonProperty(fast=True)),
+    ]
+    featurizer = MultipleFeaturizer([*struct_feat, *comp_feat])
+    # Set the chunk size used for Pool.map parallelization
+    featurizer.set_chunksize(chunksize=chunk_size)
+    featurizer.fit(df[struct_col])
+    matminer_features = featurizer.featurize_dataframe(
+        df=df, col_id=struct_col, ignore_errors=ignore_errors
+    )
+    # check failed entries
+    print("Featurization completed.")
+    failed = np.any(pd.isna(matminer_features.iloc[:, df.shape[1] :]), axis=1)
+    if sum(failed) > 0:
+        print(f"Number failed: {sum(failed)}/{len(failed)}")
+    return matminer_features
+
+
+# %%
+def featurize_file(
+    filename: str, struct_col: str = "initial_structure"
+) -> pd.DataFrame:
+    """Featurize pymatgen Structures in a file with matminer."""
+    df_in = pd.read_json(filename).set_index(id_col)
+
+    # ComputedStructureEntry dicts have a "structure" key, if that's missing it's a
+    # Structure dict
+    df_in[struct_col] = [
+        Structure.from_dict(x.get("structure", x))
+        for x in tqdm(df_in[struct_col], leave=False, desc="Hydrate structures")
+    ]
+
+    df_features = featurize_dataframe(df_in[struct_col], struct_col=struct_col)
+
+    return df_features.drop(columns=struct_col)
+
+
+# %%
+def features_to_drop(df_in: pd.DataFrame, threshold: float = 0.95) -> list[str]:
+    """Get column names of features with correlation greater than threshold to drop."""
+    corr_matrix = df_in.corr(method="pearson", numeric_only=False).abs()
+
+    # select upper triangle of correlation matrix
+    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # return features with correlation greater than the threshold
+    return [col for col in upper_tri if any(upper_tri[col] > threshold)]
+
+
+# %% Compute features and export to CSV
+if not os.path.isfile(mp_matminer_feat_path):
+    print("Computing matminer features for MP...")
+
+    df_mp_feats = featurize_file(
+        DATA_FILES.mp_computed_structure_entries, struct_col="entry"
+    )
+    df_mp_feats.to_csv(mp_matminer_feat_path)
+
+if not os.path.isfile(wbm_matminer_feat_path):
+    print("Computing matminer features for WBM...")
+
+    df_wbm_feats = featurize_file(DATA_FILES.wbm_initial_structures)
+    df_wbm_feats.to_csv(wbm_matminer_feat_path)
+
+
+# %%
+umap_out_path = f"{DATA_DIR}/wbm/umap/2d-umap-projections.csv.bz2"
+if not os.path.isfile(umap_out_path):
+    print("Computing UMAP...")
+
+    df_mp = pd.read_csv(mp_matminer_feat_path).set_index(id_col)
+    df_wbm = pd.read_csv(wbm_matminer_feat_path).set_index(id_col)
+
+    # Drop all rows containing NaN values
+    df_wbm = df_wbm.dropna(axis=0)
+    df_mp = df_mp.dropna(axis=0)
+
+    # Drop highly correlated features
+    cols_to_drop = features_to_drop(df_mp, threshold=0.95)
+    df_mp = df_mp.drop(columns=cols_to_drop)
+    df_wbm = df_wbm.drop(columns=cols_to_drop)
+
+    # Combined data frame
+    df_all = pd.concat((df_wbm, df_mp))
+    df_all.index = df_all.index.str.replace(r"^(mp|mvc)-", "wbm-0")
+    df_all["wbm_step"] = df_all.index.str.split("-").str[1].astype(int)
+
+    # train only on MP data
+    reducer = umap.UMAP(
+        n_neighbors=n_umap_neighbors, n_components=2, random_state=42, low_memory=False
+    )
+    reducer.fit(df_mp)
+
+    # transform everything
+    umap_points = reducer.transform(df_all)
+    cols = [f"UMAP {idx + 1}" for idx in range(umap_points.shape[1])]
+    df_umap = pd.DataFrame(umap_points, index=df_all.index, columns=cols)
+
+    df_umap.to_csv(umap_out_path)
+
+
+# %% Plot UMAP 2d projection
+df_umap = pd.read_csv(umap_out_path).set_index("wbm_step")
+
+assert (cols := list(df_umap)) == ["UMAP 1", "UMAP 2"]
+min_step, max_step = df_umap.index.min(), df_umap.index.max()
+ax = df_umap.plot.scatter(
+    *cols, c=df_umap.index, cmap="Spectral", s=5, figsize=(6, 4), colorbar=False
+)
+cbar = ax.figure.colorbar(
+    ax.collections[0],
+    boundaries=np.arange(min_step, max_step + 2) - 0.5,
+    ticks=range(min_step, max_step + 1),
+)
+cbar.ax.set_title("WBM step (0 = MP)", rotation=90, y=0.5, x=3, va="center")
+
+
+# %%
+plt.tight_layout()
+save_fig(ax, f"{PDF_FIGS}/wbm-final-struct-matminer-features-2d-umap.png", dpi=300)