link MPtrj dataset from /contribute page "Direct Download" section

janosh · janosh · commit 9d1e77ea3607 · 2023-11-13T13:05:49.000-08:00
update MACE readme for 16M MPtrj checkpoint from pbenner
define formula_col to ensure consistency across code base
diff --git a/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py b/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
@@ -10,7 +10,7 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, id_col, today
+from matbench_discovery import ROOT, formula_col, id_col, today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.plots import plt
@@ -68,7 +68,9 @@
 
 
 # %%
-df_wbm["chem_sys"] = df_wbm.formula.str.replace("[0-9]+", "", regex=True).str.split()
+df_wbm["chem_sys"] = (
+    df_wbm[formula_col].str.replace("[0-9]+", "", regex=True).str.split()
+)
 df_wbm["anion"] = None
 df_wbm["anion"][df_wbm.chem_sys.astype(str).str.contains("'O'")] = "oxide"
 df_wbm["anion"][df_wbm.chem_sys.astype(str).str.contains("'S'")] = "sulfide"
diff --git a/data/wbm/eda.py b/data/wbm/eda.py
@@ -12,7 +12,14 @@
 )
 from pymatviz.io import save_fig
 
-from matbench_discovery import PDF_FIGS, ROOT, SITE_FIGS, STABILITY_THRESHOLD, id_col
+from matbench_discovery import (
+    PDF_FIGS,
+    ROOT,
+    SITE_FIGS,
+    STABILITY_THRESHOLD,
+    formula_col,
+    id_col,
+)
 from matbench_discovery import plots as plots
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.energy import mp_elem_reference_entries
@@ -35,8 +42,10 @@
 
 
 # %%
-wbm_occu_counts = count_elements(df_wbm.formula, count_mode="occurrence").astype(int)
-wbm_comp_counts = count_elements(df_wbm.formula, count_mode="composition")
+wbm_occu_counts = count_elements(df_wbm[formula_col], count_mode="occurrence").astype(
+    int
+)
+wbm_comp_counts = count_elements(df_wbm[formula_col], count_mode="composition")
 
 mp_occu_counts = count_elements(df_mp.formula_pretty, count_mode="occurrence").astype(
     int
@@ -60,16 +69,16 @@
 df_wbm["step"] = df_wbm.index.str.split("-").str[1].astype(int)
 assert df_wbm.step.between(1, 5).all()
 for batch in range(1, 6):
-    count_elements(df_wbm[df_wbm.step == batch].formula).to_json(
+    count_elements(df_wbm[df_wbm.step == batch][formula_col]).to_json(
         f"{data_page}/wbm-element-counts-{batch=}.json"
     )
 
 # export element counts by arity (how many elements in the formula)
 comp_col = "composition"
-df_wbm[comp_col] = df_wbm.formula.map(Composition)
+df_wbm[comp_col] = df_wbm[formula_col].map(Composition)
 
 for arity, df_mp in df_wbm.groupby(df_wbm[comp_col].map(len)):
-    count_elements(df_mp.formula).to_json(
+    count_elements(df_mp[formula_col]).to_json(
         f"{data_page}/wbm-element-counts-{arity=}.json"
     )
 
@@ -206,7 +215,7 @@
     y="2d t-SNE 2",
     color=color_col,
     hover_name=id_col,
-    hover_data=("formula", each_true_col),
+    hover_data=(formula_col, each_true_col),
     range_color=(0, clr_range_max),
 )
 fig.show()
@@ -219,7 +228,7 @@
     y="3d t-SNE 2",
     z="3d t-SNE 3",
     color=color_col,
-    custom_data=[id_col, "formula", each_true_col, color_col],
+    custom_data=[id_col, formula_col, each_true_col, color_col],
     range_color=(0, clr_range_max),
 )
 fig.data[0].hovertemplate = (
diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
@@ -18,7 +18,7 @@
 from pymatviz.io import save_fig
 from tqdm import tqdm
 
-from matbench_discovery import SITE_FIGS, id_col, today
+from matbench_discovery import SITE_FIGS, formula_col, id_col, today
 from matbench_discovery.data import DATA_FILES
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.plots import pio
@@ -289,7 +289,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 # %%
 col_map = {
-    "# comp": "formula",
+    "# comp": formula_col,
     "nsites": "n_sites",
     "vol": "volume",
     "e": "uncorrected_energy",
@@ -319,7 +319,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 assert sum(no_id_mask := df_summary.index.isna()) == 6, f"{sum(no_id_mask)=}"
 # the 'None' materials have 0 volume, energy, n_sites, bandgap, etc.
-assert all(df_summary[no_id_mask].drop(columns=["formula"]) == 0)
+assert all(df_summary[no_id_mask].drop(columns=[formula_col]) == 0)
 assert len(df_summary.query("volume > 0")) == len(df_wbm) + len(nan_init_structs_ids)
 # make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
 # listed in bad_struct_ids above
@@ -378,13 +378,13 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 
 # sort formulas alphabetically
 df_summary["alph_formula"] = [
-    Composition(x).alphabetical_formula for x in df_summary.formula
+    Composition(x).alphabetical_formula for x in df_summary[formula_col]
 ]
 # alphabetical formula and original formula differ due to spaces, number 1 after element
 # symbols (FeO vs Fe1 O1), and element order (FeO vs OFe)
-assert sum(df_summary.alph_formula != df_summary.formula) == 257_483
+assert sum(df_summary.alph_formula != df_summary[formula_col]) == 257_483
 
-df_summary["formula"] = df_summary.pop("alph_formula")
+df_summary[formula_col] = df_summary.pop("alph_formula")
 
 
 # %% write initial structures and computed structure entries to compressed json
@@ -404,10 +404,10 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 # df_summary and df_wbm formulas differ because summary formulas are reduced while
 # df_wbm formulas are not (e.g. Ac6 U2 vs Ac3 U1 in summary). unreduced is more
 # informative so we use it.
-assert sum(df_summary.formula != df_wbm.formula_from_cse) == 114_273
-assert sum(df_summary.formula == df_wbm.formula_from_cse) == 143_214
+assert sum(df_summary[formula_col] != df_wbm.formula_from_cse) == 114_273
+assert sum(df_summary[formula_col] == df_wbm.formula_from_cse) == 143_214
 
-df_summary.formula = df_wbm.formula_from_cse
+df_summary[formula_col] = df_wbm.formula_from_cse
 
 
 # fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
diff --git a/matbench_discovery/__init__.py b/matbench_discovery/__init__.py
@@ -37,3 +37,4 @@
 init_struct_col = "initial_structure"
 struct_col = "structure"
 e_form_col = "formation_energy_per_atom"
+formula_col = "formula"
diff --git a/models/chgnet/analyze_chgnet.py b/models/chgnet/analyze_chgnet.py
@@ -10,7 +10,7 @@
 from pymatviz import density_scatter, plot_structure_2d, ptable_heatmap_plotly
 from pymatviz.io import save_fig
 
-from matbench_discovery import PDF_FIGS, id_col
+from matbench_discovery import PDF_FIGS, formula_col, id_col
 from matbench_discovery import plots as plots
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.preds import PRED_FILES
@@ -26,7 +26,7 @@
 df_chgnet_v020 = pd.read_csv(
     f"{module_dir}/2023-03-06-chgnet-0.2.0-wbm-IS2RE.csv.gz", index_col=id_col
 )
-df_chgnet["formula"] = df_wbm.formula
+df_chgnet[formula_col] = df_wbm[formula_col]
 
 e_form_2000 = "e_form_per_atom_chgnet_relax_steps_2000"
 e_form_500 = "e_form_per_atom_chgnet_relax_steps_500"
@@ -51,15 +51,15 @@
     x=e_form_500,
     y=e_form_2000,
     hover_name=id_col,
-    hover_data=["formula"],
+    hover_data=[formula_col],
     backend="plotly",
     title=f"{len(df_diff)} structures have > {min_e_diff} eV/atom energy diff after "
     "longer relaxation",
 )
 
 
 # %%
-fig = ptable_heatmap_plotly(df_bad.formula)
+fig = ptable_heatmap_plotly(df_bad[formula_col])
 title = "structures with larger error<br>after longer relaxation"
 fig.layout.title.update(text=f"{len(df_diff)} {title}", x=0.4, y=0.9)
 fig.show()
diff --git a/models/chgnet/ctk_structure_viewer.py b/models/chgnet/ctk_structure_viewer.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from crystal_toolkit.helpers.utils import hook_up_fig_with_struct_viewer
 
-from matbench_discovery import id_col
+from matbench_discovery import formula_col, id_col
 from matbench_discovery.preds import PRED_FILES
 
 __author__ = "Janosh Riebesell"
@@ -47,7 +47,7 @@
     y=e_form_2000,
     backend="plotly",
     hover_name=id_col,
-    hover_data=["formula"],
+    hover_data=[formula_col],
     labels=plot_labels,
     size=e_form_abs_diff,
     color=e_form_abs_diff,
diff --git a/models/chgnet/join_chgnet_results.py b/models/chgnet/join_chgnet_results.py
@@ -13,7 +13,7 @@
 from pymatviz import density_scatter
 from tqdm import tqdm
 
-from matbench_discovery import id_col
+from matbench_discovery import formula_col, id_col
 from matbench_discovery.data import as_dict_handler
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.preds import df_preds, e_form_col
@@ -54,11 +54,11 @@
 
 # %% compute corrected formation energies
 e_form_chgnet_col = "e_form_per_atom_chgnet"
-df_chgnet["formula"] = df_preds.formula
+df_chgnet[formula_col] = df_preds[formula_col]
 df_chgnet[e_form_chgnet_col] = [
     get_e_form_per_atom(dict(energy=ene, composition=formula))
     for formula, ene in tqdm(
-        df_chgnet.set_index("formula").chgnet_energy.items(), total=len(df_chgnet)
+        df_chgnet.set_index(formula_col).chgnet_energy.items(), total=len(df_chgnet)
     )
 ]
 df_preds[e_form_chgnet_col] = df_chgnet[e_form_chgnet_col]
diff --git a/models/chgnet/test_chgnet.py b/models/chgnet/test_chgnet.py
@@ -21,7 +21,7 @@
 from pymatgen.core import Structure
 from tqdm import tqdm
 
-from matbench_discovery import id_col, timestamp, today
+from matbench_discovery import formula_col, id_col, timestamp, today
 from matbench_discovery.data import DATA_FILES, as_dict_handler, df_wbm
 from matbench_discovery.plots import wandb_scatter
 from matbench_discovery.slurm import slurm_submit
@@ -125,7 +125,7 @@
 df_wbm[e_pred_col] = df_out[e_pred_col]
 table = wandb.Table(
     dataframe=df_wbm.dropna()[
-        ["uncorrected_energy", e_pred_col, "formula"]
+        ["uncorrected_energy", e_pred_col, formula_col]
     ].reset_index()
 )
 
diff --git a/models/mace/analyze_mace.py b/models/mace/analyze_mace.py
@@ -8,7 +8,7 @@
 from pymatviz import density_scatter, ptable_heatmap_plotly, spacegroup_sunburst
 from pymatviz.io import save_fig
 
-from matbench_discovery import id_col
+from matbench_discovery import formula_col, id_col
 from matbench_discovery import plots as plots
 from matbench_discovery.data import df_wbm
 from matbench_discovery.preds import PRED_FILES
@@ -44,7 +44,7 @@
 
 
 # %%
-fig = ptable_heatmap_plotly(df_low.formula)
+fig = ptable_heatmap_plotly(df_low[formula_col])
 title = f"Elements in {len(df_low):,} MACE severe energy underpredictions"
 fig.layout.title.update(text=title, x=0.4, y=0.95)
 fig.show()
diff --git a/models/mace/join_mace_results.py b/models/mace/join_mace_results.py
@@ -16,7 +16,7 @@
 from pymatviz import density_scatter
 from tqdm import tqdm
 
-from matbench_discovery import id_col
+from matbench_discovery import formula_col, id_col
 from matbench_discovery.data import DATA_FILES, as_dict_handler, df_wbm
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.preds import e_form_col
@@ -80,11 +80,11 @@
 
 # %% compute corrected formation energies
 e_form_mace_col = "e_form_per_atom_mace"
-df_mace["formula"] = df_wbm.formula
+df_mace[formula_col] = df_wbm[formula_col]
 df_mace[e_form_mace_col] = [
     get_e_form_per_atom(dict(energy=cse.energy, composition=formula))
     for formula, cse in tqdm(
-        df_mace.set_index("formula")[entry_col].items(), total=len(df_mace)
+        df_mace.set_index(formula_col)[entry_col].items(), total=len(df_mace)
     )
 ]
 df_wbm[e_form_mace_col] = df_mace[e_form_mace_col]
@@ -106,6 +106,6 @@
 df_bad[e_form_col] = df_wbm[e_form_col]
 df_bad.to_csv(f"{out_path}-bad.csv")
 
-# in_path = f"{module_dir}/2023-08-14-mace-wbm-IS2RE-FIRE"
+# in_path = f"{module_dir}/2023-11-02-mace-wbm-IS2RE-FIRE"
 # df_mace = pd.read_csv(f"{in_path}.csv.gz").set_index(id_col)
 # df_mace = pd.read_json(f"{in_path}.json.gz").set_index(id_col)
diff --git a/models/mace/json_to_extxyz.py b/models/mace/json_to_extxyz.py
@@ -1,6 +1,5 @@
-"""This script converts the MPTrj relaxation trajectories from JSON to
-extended XYZ format. The JSON data was downloaded from
-https://figshare.com/articles/dataset/23713842.
+"""This script converts the MPtrj relaxation trajectories downloaded from
+https://figshare.com/articles/dataset/23713842 from JSON to extended XYZ format.
 """
 
 import json
diff --git a/models/mace/readme.md b/models/mace/readme.md
@@ -1,8 +1,10 @@
 ## MACE formation energy predictions on WBM test set
 
-This submission uses the [`2023-08-14-mace-yuan-trained-mptrj-04.model`](https://figshare.com/ndownloader/files/42374049) checkpoint trained by Yuan Chiang on the [MPtrj dataset](https://figshare.com/articles/dataset/23713842).
+The original MACE submission used the 2M parameter checkpoint [`2023-08-14-mace-yuan-trained-mptrj-04.model`](https://figshare.com/ndownloader/files/42374049) trained by Yuan Chiang on the [MPtrj dataset](https://figshare.com/articles/dataset/23713842).
 We initially tested the `2023-07-14-mace-universal-2-big-128-6.model` checkpoint trained on the much smaller [original M3GNet training set](https://figshare.com/articles/dataset/MPF_2021_2_8/19470599) which we received directly from Ilyes Batatia. MPtrj-trained MACE performed better and was used for the Matbench Discovery v1 submission.
 
+In late October 2023 (checkpoint received 2023-10-29), Philipp Benner trained a much larger 16M-parameter MACE for over 100 epochs on MPtrj, which achieved an (at the time SOTA) F1 score of 0.64 and DAF of 3.13.
+
 ### Convergence criteria
 
 MACE relaxed each test set structure until the maximum force in the training set dropped below 0.05 eV/Å or 500 optimization steps were reached, whichever occurred first.
diff --git a/models/mace/test_mace.py b/models/mace/test_mace.py
@@ -18,7 +18,7 @@
 from pymatgen.io.ase import AseAtomsAdaptor
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, id_col, timestamp, today
+from matbench_discovery import ROOT, formula_col, id_col, timestamp, today
 from matbench_discovery.data import DATA_FILES, as_dict_handler, df_wbm
 from matbench_discovery.plots import wandb_scatter
 from matbench_discovery.slurm import slurm_submit
@@ -164,7 +164,7 @@
 df_wbm[e_pred_col] = df_out[e_pred_col]
 table = wandb.Table(
     dataframe=df_wbm.dropna()[
-        ["uncorrected_energy", e_pred_col, "formula"]
+        ["uncorrected_energy", e_pred_col, formula_col]
     ].reset_index()
 )
 
diff --git a/models/voronoi/train_test_voronoi_rf.py b/models/voronoi/train_test_voronoi_rf.py
@@ -13,7 +13,7 @@
 from sklearn.metrics import r2_score
 from sklearn.pipeline import Pipeline
 
-from matbench_discovery import ROOT, id_col, today
+from matbench_discovery import ROOT, formula_col, id_col, today
 from matbench_discovery.data import DATA_FILES, df_wbm, glob_to_df
 from matbench_discovery.plots import wandb_scatter
 from matbench_discovery.preds import e_form_col as test_e_form_col
@@ -123,7 +123,7 @@
 df_wbm[pred_col].round(4).to_csv(out_path)
 
 table = wandb.Table(
-    dataframe=df_wbm[["formula", test_e_form_col, pred_col]].reset_index()
+    dataframe=df_wbm[[formula_col, test_e_form_col, pred_col]].reset_index()
 )
 
 df_wbm[pred_col].isna().sum()
diff --git a/models/wrenformer/analyze_wrenformer.py b/models/wrenformer/analyze_wrenformer.py
@@ -10,7 +10,7 @@
 from pymatviz.ptable import ptable_heatmap_plotly
 from pymatviz.utils import add_identity_line, bin_df_cols
 
-from matbench_discovery import PDF_FIGS, SITE_FIGS, id_col
+from matbench_discovery import PDF_FIGS, SITE_FIGS, formula_col, id_col
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.preds import df_each_pred, df_preds, each_true_col
 
@@ -85,7 +85,7 @@
 
 
 # %%
-fig = ptable_heatmap_plotly(df_bad.formula)
+fig = ptable_heatmap_plotly(df_bad[formula_col])
 fig.layout.title = f"Elements in {title}"
 fig.layout.margin = dict(l=0, r=0, t=50, b=0)
 fig.show()
diff --git a/scripts/analyze_model_failure_cases.py b/scripts/analyze_model_failure_cases.py
diff --git a/scripts/model_figs/analyze_model_disagreement.py b/scripts/model_figs/analyze_model_disagreement.py
diff --git a/scripts/model_figs/hist_classified_stable_vs_hull_dist_models.py b/scripts/model_figs/hist_classified_stable_vs_hull_dist_models.py
diff --git a/scripts/model_figs/per_element_errors.py b/scripts/model_figs/per_element_errors.py
diff --git a/scripts/model_figs/scatter_hull_dist_models.py b/scripts/model_figs/scatter_hull_dist_models.py
diff --git a/site/src/routes/contribute/+page.md b/site/src/routes/contribute/+page.md
diff --git a/tests/test_data.py b/tests/test_data.py