Update job_name and out_dir to match actual new pred_file locations in `test_<model>_discovery.py` scripts (#204)

* rename Model.chgnet to Model.chgnet_030 and Model.m3gnet to Model.m3gnet_ms

- update job_name and out_dir to match actual new pred_file locations in test_discovery.py scripts
- restore and test __repr__ and __str__ methods on Files enum

* fix tests/test_cli.py and add site/tests/table-column-toggle-menu.test.ts

delete accidentally committed site/src/lib/ScatterPlot.svelte (originally from elementari)

* fix MetricsTable.svelte rendering missing metrics as `undefined` when it should show 'n/a'
janosh authored Feb 9, 2025
1 parent 2f7ea53 commit 9fa5baa
Showing 39 changed files with 269 additions and 377 deletions.
18 changes: 12 additions & 6 deletions matbench_discovery/enums.py
@@ -216,6 +216,14 @@ def __new__(cls, val: str, file_path: str) -> Self:
obj.__dict__ |= dict(file_path=file_path)
return obj

def __repr__(self) -> str:
"""String representation of the file."""
return f"{type(self).__name__}.{self.name}"

def __str__(self) -> str:
"""String representation of the file."""
return self.name

@property
@abc.abstractmethod
def url(self) -> str:
@@ -252,15 +260,13 @@ class Model(Files, base_dir=f"{ROOT}/models"):
"""

alignn = auto(), "alignn/alignn.yml"
# alignn_pretrained = auto(), "alignn/alignn.yml"
# alignn_ff = auto(), "alignn/alignn-ff.yml"

# BOWSR optimizer coupled with original megnet
bowsr_megnet = auto(), "bowsr/bowsr.yml"

# default CHGNet model from publication with 400,438 params
chgnet = auto(), "chgnet/chgnet.yml"
# chgnet_no_relax = auto(), None, "CHGNet No Relax"
chgnet_030 = auto(), "chgnet/chgnet-0.3.0.yml"

# CGCNN 10-member ensemble
cgcnn = auto(), "cgcnn/cgcnn.yml"
@@ -273,9 +279,9 @@ class Model(Files, base_dir=f"{ROOT}/models"):
dpa3_v1_openlam = auto(), "deepmd/dpa3-v1-openlam.yml"

# original M3GNet straight from publication, not re-trained
m3gnet = auto(), "m3gnet/m3gnet.yml"
# m3gnet_direct = auto(), None, "M3GNet DIRECT"
# m3gnet_ms = auto(), None, "M3GNet MS"
m3gnet_ms = auto(), "m3gnet/m3gnet.yml"
# m3gnet_direct = auto(), "M3GNet DIRECT"
# m3gnet_ms = auto(), "M3GNet MS"

# MACE-MP-0 medium as published in https://arxiv.org/abs/2401.00096 trained on MPtrj
mace_mp_0 = auto(), "mace/mace-mp-0.yml"
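Note: the restored dunder methods make enum members render as their bare member name in f-strings, which the updated `job_name` definitions in the scripts below rely on. A minimal sketch of the resulting behavior (inferred from the two methods above, not part of the diff):

from matbench_discovery.enums import Model

print(repr(Model.alignn))  # "Model.alignn" -- __repr__ is f"{type(self).__name__}.{self.name}"
print(str(Model.alignn))  # "alignn" -- __str__ returns self.name
print(f"{Model.alignn}-mp-e_form")  # "alignn-mp-e_form" -- f-strings call __str__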
Binary file removed models/alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz
54 changes: 0 additions & 54 deletions models/alignn/alignn-config.json

This file was deleted.

File renamed without changes.
8 changes: 4 additions & 4 deletions models/alignn/alignn.yml
@@ -43,14 +43,14 @@ model_params: 4_026_753 # pre-trained 'mp_e_form_alignn' and our custom MBD chec
n_estimators: 1
pr_url: https://github.com/janosh/matbench-discovery/pull/85

# model trained from specifically for MBD
# model trained on Materials Project energies specifically for this submission
training_set: [MP 2022]

metrics:
phonons: not available # model doesn't predict forces
geo_opt: not available
phonons: not applicable # reason: ALIGNN does not predict forces
geo_opt: not applicable # reason: ALIGNN does not predict forces
discovery:
pred_file: models/alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz
pred_file: models/alignn/alignn-mp22/2023-06-02-wbm-IS2RE.csv.gz
pred_file_url: https://figshare.com/ndownloader/files/51607262
pred_col: e_form_per_atom_alignn
full_test_set:
2 changes: 1 addition & 1 deletion models/alignn/readme.md
@@ -21,4 +21,4 @@ Replace `/path/to/` with the actual path to the patch file.
The directory contains the following files, which must be executed in the given order to reproduce the results:

1. [`train_alignn.py`](train_alignn.py): Train an ALIGNN model on all 154k MP computed structure entries. The resulting model checkpoint is saved to the `out_dir` variable in that script and also uploaded to `wandb` from where it is publicly available for 3rd party reproducibility.
1. [`test_alignn.py`](test_alignn.py): Test a trained ALIGNN model on the WBM data. Generated `2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz`.
1. [`test_alignn_discovery.py`](test_alignn_discovery.py): Test a trained ALIGNN model on the WBM data. Generated `2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz`.
8 changes: 4 additions & 4 deletions models/alignn/test_alignn_discovery.py
@@ -18,7 +18,7 @@

from matbench_discovery import today
from matbench_discovery.data import df_wbm
from matbench_discovery.enums import DataFiles, MbdKey, Task
from matbench_discovery.enums import DataFiles, MbdKey, Model, Task
from matbench_discovery.hpc import slurm_submit
from matbench_discovery.plots import wandb_scatter

@@ -30,13 +30,13 @@

# %%
# model_name = "mp_e_form_alignn" # pre-trained by NIST (not used for MBD submission)
model_name = DataFiles.alignn_checkpoint.path # trained by Philipp Benner
model_name = Model.alignn # trained by Philipp Benner
task_type = Task.IS2RE
target_col = MbdKey.e_form_dft
input_col = Key.init_struct
device = "cuda" if torch.cuda.is_available() else "cpu"
job_name = f"{model_name}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
job_name = f"{model_name}/{today}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")


if model_name in all_models: # load pre-trained model
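Note: with the new `__str__`, the reworked `job_name`/`out_dir` put predictions in a per-model subdirectory whose name matches the updated `pred_file` entries in the model YAML files. A short illustration (the date and `module_dir` values are placeholders; the real script uses `matbench_discovery.today` and `os.path.dirname(__file__)`):

import os

from matbench_discovery.enums import Model, Task

today = "2023-06-02"  # placeholder date
module_dir = "models/alignn"  # placeholder for os.path.dirname(__file__)

job_name = f"{Model.alignn}/{today}-wbm-{Task.IS2RE}"  # -> "alignn/2023-06-02-wbm-IS2RE"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")
print(out_dir)  # models/alignn/alignn/2023-06-02-wbm-IS2RE, unless SBATCH_OUTPUT is set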
10 changes: 4 additions & 6 deletions models/alignn/train_alignn.py
@@ -18,7 +18,7 @@
from tqdm import tqdm

from matbench_discovery import today
from matbench_discovery.enums import DataFiles
from matbench_discovery.enums import DataFiles, Model
from matbench_discovery.hpc import slurm_submit

__author__ = "Philipp Benner, Janosh Riebesell"
@@ -28,20 +28,18 @@


# %%
model_name = "alignn-mp-e_form"
model_name = f"{Model.alignn}-mp-e_form"
target_col = Key.form_energy
input_col = "atoms"
device = "cuda" if torch.cuda.is_available() else "cpu"
job_name = f"train-{model_name}"
job_name = f"{today}-train-{model_name}"


pred_col = "e_form_per_atom_alignn"
with open(f"{module_dir}/alignn-config.json") as file:
config = TrainingConfig(**json.load(file))

config.output_dir = out_dir = os.getenv(
"SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}"
)
config.output_dir = out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")

slurm_vars = slurm_submit(
job_name=job_name,
4 changes: 2 additions & 2 deletions models/alignn_ff/readme.md
@@ -35,5 +35,5 @@ OSError: [Errno 24] Too many open files

## Scripts

1. `alignn_ff_relax.py`: Relax WBM test set structures. Set the variable `n_splits` to the number of GPU compute nodes. On each compute node, set the environment variable `TASK_ID` to a value in the range 1-`n_splits`. Set the variable `n_processes_per_task` to the number of processes on a single node. For 48 CPU cores with 4 GPUs a good setting is to use 10 processes.
2. `test_alignn_ff.py`: Read the relaxed structures from `alignn_ff_relax.py` and make formation energy predictions. Set the variable `n_splits` accordingly.
1. [`alignn_ff_relax.py`](alignn_ff_relax.py): Relax WBM test set structures. Set the variable `n_splits` to the number of GPU compute nodes. On each compute node, set the environment variable `TASK_ID` to a value in the range 1-`n_splits`. Set the variable `n_processes_per_task` to the number of processes on a single node. For 48 CPU cores with 4 GPUs a good setting is to use 10 processes.
2. [`test_alignn_ff_discovery.py`](test_alignn_ff_discovery.py): Read the relaxed structures from `alignn_ff_relax.py` and make formation energy predictions. Set the variable `n_splits` accordingly.
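A hypothetical sketch of the per-node split selection described above -- `TASK_ID`, `n_splits`, and the process count come from the readme, but the selection logic shown is assumed, not taken from the scripts:

import os

n_splits = 4  # number of GPU compute nodes, per the readme
task_id = int(os.environ["TASK_ID"])  # set to a value in 1..n_splits on each node
assert 1 <= task_id <= n_splits, f"{task_id=} out of range 1..{n_splits}"

structures = list(range(1000))  # stand-in for the WBM test set structures
node_share = structures[task_id - 1 :: n_splits]  # one plausible way to shard work across nodes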
18 changes: 5 additions & 13 deletions models/alignn_ff/test_alignn_ff_discovery.py
@@ -19,7 +19,7 @@

from matbench_discovery import today
from matbench_discovery.data import df_wbm
from matbench_discovery.enums import DataFiles, MbdKey, Task
from matbench_discovery.enums import DataFiles, MbdKey, Model, Task
from matbench_discovery.plots import wandb_scatter

__author__ = "Philipp Benner, Janosh Riebesell"
@@ -33,10 +33,9 @@
# model_name = "mp_e_form_alignnn" # pre-trained by NIST
task_type = Task.IS2RE
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = f"alignn-ff-wbm-{task_type}"
job_name = f"{model_name}-relaxed-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
in_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
model_name = f"{Model.alignn}-ff"
job_name = f"{model_name}/{today}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")


if model_name in all_models: # load pre-trained model
@@ -101,14 +100,7 @@
df_wbm[pred_col] -= df_wbm.e_correction_per_atom_mp_legacy
df_wbm[pred_col] += df_wbm.e_correction_per_atom_mp2020

if model_name in all_models:
df_wbm[pred_col].round(4).to_csv(
f"{module_dir}/{today}-{model_name}-relaxed-wbm-IS2RE.csv.gz"
)
else:
df_wbm[pred_col].round(4).to_csv(
f"{module_dir}/{today}-alignn-relaxed-wbm-IS2RE.csv.gz"
)
df_wbm[pred_col].round(4).to_csv(f"{out_dir}/wbm-IS2RE.csv.gz")


# %%
16 changes: 8 additions & 8 deletions models/bowsr/test_bowsr_discovery.py
@@ -1,3 +1,8 @@
"""To slurm submit this file: python path/to/file.py slurm-submit
Requires MEGNet and MAML installation: pip install megnet maml
https://github.com/materialsvirtuallab/maml
"""

# %%
import contextlib
import os
@@ -21,11 +26,6 @@
__author__ = "Janosh Riebesell"
__date__ = "2022-08-15"

"""
To slurm submit this file: python path/to/file.py slurm-submit
Requires MEGNet and MAML installation: pip install megnet maml
https://github.com/materialsvirtuallab/maml
"""

task_type = Task.IS2RE
module_dir = os.path.dirname(__file__)
@@ -34,9 +34,9 @@
# see https://stackoverflow.com/a/55431306 for how to change array throttling
# post submission
slurm_max_parallel = 100
energy_model = Model.megnet.label.lower()
job_name = f"bowsr-{energy_model}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
energy_model = Model.megnet
job_name = f"bowsr-{energy_model}/{today}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")

data_path = {
Task.IS2RE: DataFiles.wbm_initial_structures.path,
18 changes: 10 additions & 8 deletions models/cgcnn/test_cgcnn_discovery.py
@@ -1,3 +1,9 @@
"""
Download WandB checkpoints for an ensemble of CGCNN models trained on all MP
formation energies, then make predictions on some dataset, prints ensemble metrics and
saves predictions to CSV.
"""

# %%
import os
from importlib.metadata import version
@@ -15,24 +21,20 @@

from matbench_discovery import CHECKPOINT_DIR, WANDB_PATH, WBM_DIR, today
from matbench_discovery.data import df_wbm
from matbench_discovery.enums import DataFiles, MbdKey, Task
from matbench_discovery.enums import DataFiles, MbdKey, Model, Task
from matbench_discovery.hpc import slurm_submit
from matbench_discovery.plots import wandb_scatter

__author__ = "Janosh Riebesell"
__date__ = "2022-08-15"

"""
Download WandB checkpoints for an ensemble of CGCNN models trained on all MP
formation energies, then make predictions on some dataset, prints ensemble metrics and
saves predictions to CSV.
"""

task_type = Task.IS2RE
debug = False
job_name = f"test-cgcnn-wbm-{task_type}"
model_name = Model.cgcnn # or Model.cgcnn_p
job_name = f"{model_name}/{today}-wbm-{task_type}"
module_dir = os.path.dirname(__file__)
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")

slurm_vars = slurm_submit(
job_name=job_name,
10 changes: 4 additions & 6 deletions models/cgcnn/train_cgcnn.py
@@ -1,3 +1,5 @@
"""Train a CGCNN ensemble on target_col of data_path."""

# %%
import os
from importlib.metadata import version
@@ -17,10 +19,6 @@
from matbench_discovery.hpc import slurm_submit
from matbench_discovery.structure import perturb_structure

"""
Train a CGCNN ensemble on target_col of data_path.
"""

__author__ = "Janosh Riebesell"
__date__ = "2022-06-13"

@@ -32,12 +30,12 @@
# 0 for no perturbation, n>1 means train on n perturbations of each crystal
# in the training set all assigned the same original target energy
n_perturb = 0
job_name = f"train-cgcnn-robust-{n_perturb=}"
job_name = f"{today}-train-cgcnn-robust-{n_perturb=}"
print(f"{job_name=}")
robust = "robust" in job_name.lower()
ensemble_size = 10
module_dir = os.path.dirname(__file__)
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")

slurm_vars = slurm_submit(
job_name=job_name,
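For context on the `n_perturb` flag above, a hypothetical sketch of the augmentation it controls -- `perturb_structure` is the real import from `matbench_discovery.structure`, but its signature, the loop, and whether the unperturbed original is kept are all assumed:

from matbench_discovery.structure import perturb_structure

structures, targets = [], []  # placeholders for training crystals and their energies
n_perturb = 2  # each crystal contributes n_perturb perturbed copies

train_samples = []
for struct, target in zip(structures, targets):
    train_samples.append((struct, target))  # keep the original (assumption)
    for _ in range(n_perturb):
        # perturbed copies are assigned the same original target energy, per the comment above
        train_samples.append((perturb_structure(struct), target))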
2 changes: 1 addition & 1 deletion models/chgnet/analyze_chgnet.py
@@ -20,7 +20,7 @@


# %%
df_chgnet = df_chgnet_v030 = pd.read_csv(Model.chgnet.discovery_path)
df_chgnet = df_chgnet_v030 = pd.read_csv(Model.chgnet_030.discovery_path)
df_chgnet_v020 = pd.read_csv(
f"{module_dir}/2023-03-06-chgnet-0.2.0-wbm-IS2RE.csv.gz", index_col=Key.mat_id
)
File renamed without changes.
6 changes: 3 additions & 3 deletions models/chgnet/ctk_structure_viewer.py
@@ -18,14 +18,14 @@
e_form_2000 = "e_form_per_atom_chgnet_2000"
e_form_500 = "e_form_per_atom_chgnet_500"

df_chgnet = pd.read_json(Model.chgnet.geo_opt_path)
df_chgnet = pd.read_json(Model.chgnet_030.geo_opt_path)
df_chgnet = df_chgnet.set_index(Key.mat_id)

df_chgnet_2000 = pd.read_csv(Model.chgnet.discovery_path)
df_chgnet_2000 = pd.read_csv(Model.chgnet_030.discovery_path)
df_chgnet_2000 = df_chgnet_2000.set_index(Key.mat_id).add_suffix("_2000")
df_chgnet[list(df_chgnet_2000)] = df_chgnet_2000

df_chgnet_500 = pd.read_csv(Model.chgnet.discovery_path.replace("-06", "-04"))
df_chgnet_500 = pd.read_csv(Model.chgnet_030.discovery_path.replace("-06", "-04"))
df_chgnet_500 = df_chgnet_500.set_index(Key.mat_id).add_suffix("_500")
df_chgnet[list(df_chgnet_500)] = df_chgnet_500

8 changes: 4 additions & 4 deletions models/chgnet/test_chgnet_discovery.py
@@ -21,7 +21,7 @@

from matbench_discovery import timestamp, today
from matbench_discovery.data import as_dict_handler, df_wbm
from matbench_discovery.enums import DataFiles, MbdKey, Task
from matbench_discovery.enums import DataFiles, MbdKey, Model, Task
from matbench_discovery.hpc import slurm_submit
from matbench_discovery.plots import wandb_scatter

@@ -33,9 +33,9 @@
# set large job array size for smaller data splits and faster testing/debugging
slurm_array_task_count = 50
device = "cuda" if torch.cuda.is_available() else "cpu"
chgnet = StructOptimizer(use_device=device) # load default pre-trained CHGNnet model
job_name = f"chgnet-{chgnet.version}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
chgnet = StructOptimizer(use_device=device) # load default pre-trained CHGNet model
job_name = f"{Model.chgnet_030}/{today}-wbm-{task_type}"
out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{job_name}")

slurm_vars = slurm_submit(
job_name=job_name,
4 changes: 2 additions & 2 deletions models/deepmd/join_dpa3_preds.py
@@ -16,11 +16,11 @@

from matbench_discovery.data import df_wbm
from matbench_discovery.energy import get_e_form_per_atom, mp_elemental_ref_energies
from matbench_discovery.enums import DataFiles
from matbench_discovery.enums import DataFiles, Model

e_form_dp_col = "e_form_per_atom_dp"
results = "./results"
model_name = "dpa3"
model_name = Model.dpa3_v1_mptrj # or Model.dpa3_v1_openlam
module_dir = os.path.dirname(__file__)
out_path = f"{module_dir}/{model_name}"
files = sorted(glob(f"{results}/{model_name}-*.json.gz"))
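Since `model_name` is now an enum member, the glob pattern above resolves through the new `__str__` to the bare member name. A small illustration (the per-split file naming scheme is assumed):

from glob import glob

from matbench_discovery.enums import Model

model_name = Model.dpa3_v1_mptrj  # renders as "dpa3_v1_mptrj" in f-strings
files = sorted(glob(f"./results/{model_name}-*.json.gz"))
# would match files like ./results/dpa3_v1_mptrj-000.json.gz (naming assumed)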
File renamed without changes.