add stress unit conversion in json_to_extxyz.py

janosh · Nov 22, 2023 · 7ffaf4e
2 parents 603bcb4 + db999fd
commit 7ffaf4e
Show file tree

Hide file tree

Showing 138 changed files with 1,377 additions and 2,141 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,6 +15,7 @@ __pycache__
 data/**/raw
 data/**/tsne
 !data/mp/2023-02-07-mp-elemental-reference-entries.json.gz
+models/**/checkpoints
 
 # slurm + Weights and Biases logs
 wandb/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -7,23 +7,19 @@ default_install_hook_types: [pre-commit, commit-msg]
 
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.290
+    rev: v0.1.5
     hooks:
       - id: ruff
         args: [--fix]
-
-  - repo: https://github.com/psf/black
-    rev: 23.9.1
-    hooks:
-      - id: black
+      - id: ruff-format
 
   - repo: https://github.com/janosh/format-ipy-cells
     rev: v0.1.10
     hooks:
       - id: format-ipy-cells
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
       - id: check-case-conflict
       - id: check-symlinks
@@ -34,19 +30,19 @@ repos:
       - id: trailing-whitespace
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.5.1
+    rev: v1.7.0
     hooks:
       - id: mypy
         additional_dependencies: [types-pyyaml, types-requests]
 
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.5
+    rev: v2.2.6
     hooks:
       - id: codespell
         stages: [commit, commit-msg]
         exclude_types: [csv, json, svg]
         exclude: ^(.+references.yaml|site/src/figs/.+)$
-        args: [--ignore-words-list, "nd,te,fpr"]
+        args: [--ignore-words-list, "nd,te,fpr", --check-filenames]
 
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v3.0.3
@@ -60,7 +56,7 @@ repos:
         exclude: ^(site/src/figs/.+\.svelte|data/wbm/20.+\..+|site/src/routes/.+\.(yaml|json)|changelog.md)$
 
   - repo: https://github.com/pre-commit/mirrors-eslint
-    rev: v8.49.0
+    rev: v8.53.0
     hooks:
       - id: eslint
         types: [file]

diff --git a/changelog.md b/changelog.md
diff --git a/citation.cff b/citation.cff
@@ -1,7 +1,7 @@
 # cff-version: 1.2.0
 message: If you use this software, please cite it as below.
 title: Matbench Discovery
-subtitle: Can machine learning identify stable crystals?
+subtitle: An evaluation framework for machine learning crystal stability prediction
 authors:
   - given-names: Janosh
     family-names: Riebesell

diff --git a/data/figshare/1.0.0.json b/data/figshare/1.0.0.json
@@ -1,42 +1,49 @@
 {
-  "alignn_checkpoint": [
-    "https://figshare.com/ndownloader/files/41233560",
-    "2023-06-02-pbenner-best-alignn-model.pth.zip"
-  ],
-  "mace_checkpoint": [
-    "https://figshare.com/ndownloader/files/42374049",
-    "2023-08-14-mace-yuan-trained-mptrj-04.model"
-  ],
-  "mp_computed_structure_entries": [
-    "https://figshare.com/ndownloader/files/40344436",
-    "2023-02-07-mp-computed-structure-entries.json.gz"
-  ],
-  "mp_elemental_ref_entries": [
-    "https://figshare.com/ndownloader/files/40387775",
-    "2023-02-07-mp-elemental-reference-entries.json.gz"
-  ],
-  "mp_energies": [
-    "https://figshare.com/ndownloader/files/41296875",
-    "2023-01-10-mp-energies.csv.gz"
-  ],
-  "mp_patched_phase_diagram": [
-    "https://figshare.com/ndownloader/files/40344451",
-    "2023-02-07-ppd-mp.pkl.gz"
-  ],
-  "wbm_computed_structure_entries": [
-    "https://figshare.com/ndownloader/files/40344463",
-    "2022-10-19-wbm-computed-structure-entries.json.bz2"
-  ],
-  "wbm_initial_structures": [
-    "https://figshare.com/ndownloader/files/40344466",
-    "2022-10-19-wbm-init-structs.json.bz2"
-  ],
-  "wbm_cses_plus_init_structs": [
-    "https://figshare.com/ndownloader/files/40344469",
-    "2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
-  ],
-  "wbm_summary": [
-    "https://figshare.com/ndownloader/files/41296866",
-    "2022-10-19-wbm-summary.csv.gz"
-  ]
+  "files": {
+    "alignn_checkpoint": [
+      "https://figshare.com/ndownloader/files/41233560",
+      "2023-06-02-pbenner-best-alignn-model.pth.zip"
+    ],
+    "mace_checkpoint_1": [
+      "https://figshare.com/ndownloader/files/42374049",
+      "2023-08-14-mace-yuan-trained-mptrj-04.model"
+    ],
+    "mace_checkpoint_2": [
+      "https://figshare.com/ndownloader/files/43117273",
+      "2023-10-29-mace-16M-pbenner-mptrj-no-conditional-loss.model"
+    ],
+    "mp_computed_structure_entries": [
+      "https://figshare.com/ndownloader/files/40344436",
+      "2023-02-07-mp-computed-structure-entries.json.gz"
+    ],
+    "mp_elemental_ref_entries": [
+      "https://figshare.com/ndownloader/files/40387775",
+      "2023-02-07-mp-elemental-reference-entries.json.gz"
+    ],
+    "mp_energies": [
+      "https://figshare.com/ndownloader/files/41296875",
+      "2023-01-10-mp-energies.csv.gz"
+    ],
+    "mp_patched_phase_diagram": [
+      "https://figshare.com/ndownloader/files/40344451",
+      "2023-02-07-ppd-mp.pkl.gz"
+    ],
+    "wbm_computed_structure_entries": [
+      "https://figshare.com/ndownloader/files/40344463",
+      "2022-10-19-wbm-computed-structure-entries.json.bz2"
+    ],
+    "wbm_initial_structures": [
+      "https://figshare.com/ndownloader/files/40344466",
+      "2022-10-19-wbm-init-structs.json.bz2"
+    ],
+    "wbm_cses_plus_init_structs": [
+      "https://figshare.com/ndownloader/files/40344469",
+      "2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
+    ],
+    "wbm_summary": [
+      "https://figshare.com/ndownloader/files/41296866",
+      "2022-10-19-wbm-summary.csv.gz"
+    ]
+  },
+  "article": "https://figshare.com/articles/dataset/22715158"
 }
diff --git a/data/mp/build_phase_diagram.py b/data/mp/build_phase_diagram.py
@@ -17,7 +17,7 @@
 from pymatgen.ext.matproj import MPRester
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, today
+from matbench_discovery import ROOT, id_col, today
 from matbench_discovery.data import DATA_FILES
 from matbench_discovery.energy import get_e_form_per_atom, get_elemental_ref_entries
 
@@ -30,7 +30,7 @@
 # save all ComputedStructureEntries to disk
 # mp-15590 appears twice so we drop_duplicates()
 df = pd.DataFrame(all_mp_computed_structure_entries, columns=["entry"])
-df.index.name = "material_id"
+df.index.name = id_col
 df.index = [e.entry_id for e in df.entry]
 df.reset_index().to_json(
     f"{module_dir}/{today}-mp-computed-structure-entries.json.gz",
@@ -40,7 +40,7 @@
 
 # %%
 data_path = f"{module_dir}/2023-02-07-mp-computed-structure-entries.json.gz"
-df = pd.read_json(data_path).set_index("material_id")
+df = pd.read_json(data_path).set_index(id_col)
 
 # drop the structure, just load ComputedEntry, makes the PPD faster to build and load
 mp_computed_entries = [ComputedEntry.from_dict(dct) for dct in tqdm(df.entry)]
@@ -63,9 +63,7 @@
 
 
 # %% build phase diagram with both MP entries + WBM entries
-df_wbm = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
-    "material_id"
-)
+df_wbm = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(id_col)
 
 # using ComputedStructureEntry vs ComputedEntry here is important as CSEs receive
 # more accurate energy corrections that take into account peroxide/superoxide nature
@@ -104,7 +102,7 @@
     json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())
 
 
-df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")
+df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)
 
 
 # %%

diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -8,7 +8,7 @@
 from pymatviz.utils import annotate_metrics
 from tqdm import tqdm
 
-from matbench_discovery import STABILITY_THRESHOLD, today
+from matbench_discovery import STABILITY_THRESHOLD, id_col, today
 from matbench_discovery.data import DATA_FILES
 
 """
@@ -26,7 +26,7 @@
 
 # %%
 fields = {
-    "material_id",
+    id_col,
     "formula_pretty",
     "formation_energy_per_atom",
     "energy_per_atom",
@@ -46,7 +46,7 @@
 
 
 # %%
-df = pd.DataFrame(docs).set_index("material_id")
+df = pd.DataFrame(docs).set_index(id_col)
 
 df_spg = pd.json_normalize(df.pop("symmetry"))[["number", "symbol"]]
 df["spacegroup_symbol"] = df_spg.symbol.to_numpy()
@@ -56,7 +56,7 @@
 
 
 # %%
-df_cse = pd.read_json(DATA_FILES.mp_computed_structure_entries).set_index("material_id")
+df_cse = pd.read_json(DATA_FILES.mp_computed_structure_entries).set_index(id_col)
 
 struct_col = "structure"
 df_cse[struct_col] = [
@@ -76,7 +76,7 @@
 assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()
 
 df.to_csv(DATA_FILES.mp_energies)
-# df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")
+# df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)
 
 
 # %% reproduce fig. 1b from https://arxiv.org/abs/2001.10591 (as data consistency check)
@@ -104,14 +104,14 @@
 ax = df.plot.scatter(
     x="decomposition_enthalpy",
     y="energy_above_hull",
-    color=mask_above_line.map({True: "red", False: "blue"})
+    color=mask_above_line.map({True: "red", False: "blue"}),
     # backend="plotly",
     # hover_data=["index", "formula_pretty", "formation_energy_per_atom"],
 )
 # most points lie on line y=x for x > 0 and y = 0 for x < 0.
 n_above_line = sum(mask_above_line)
 ax.set(
-    title=f"{n_above_line:,} / {len(df):,} = {n_above_line/len(df):.1%} "
+    title=f"{n_above_line:,} / {len(df):,} = {n_above_line / len(df):.1%} "
     "MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
 )
 # ax.figure.savefig(f"{module_dir}/mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
diff --git a/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py b/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
@@ -1,7 +1,6 @@
 # %%
 import gzip
 import json
-import warnings
 
 import pandas as pd
 from pymatgen.entries.compatibility import (
@@ -11,7 +10,7 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, today
+from matbench_discovery import ROOT, formula_col, id_col, today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.plots import plt
@@ -23,9 +22,7 @@
 """
 
 
-df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
-    "material_id"
-)
+df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(id_col)
 
 cses = [
     ComputedStructureEntry.from_dict(dct)
@@ -35,14 +32,11 @@
 ces = [ComputedEntry.from_dict(dct) for dct in tqdm(df_cse.computed_structure_entry)]
 
 
-warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")
-
-
 # %%
-out = MaterialsProject2020Compatibility().process_entries(cses, verbose=True)
-assert len(out) == len(df_cse)
-out = MaterialsProject2020Compatibility().process_entries(ces, verbose=True)
-assert len(out) == len(df_cse)
+processed = MaterialsProject2020Compatibility().process_entries(cses, verbose=True)
+assert len(processed) == len(df_cse)
+processed = MaterialsProject2020Compatibility().process_entries(ces, verbose=True)
+assert len(processed) == len(df_cse)
 
 df_wbm["e_form_per_atom_mp2020_from_ce"] = [
     get_e_form_per_atom(entry) for entry in tqdm(ces)
@@ -58,10 +52,10 @@
 
 
 # %%
-out = MaterialsProjectCompatibility().process_entries(cses, verbose=True)
-assert len(out) == len(df_cse)
-out = MaterialsProjectCompatibility().process_entries(ces, verbose=True)
-assert len(out) == len(df_cse)
+processed = MaterialsProjectCompatibility().process_entries(cses, verbose=True)
+assert len(processed) == len(df_cse)
+processed = MaterialsProjectCompatibility().process_entries(ces, verbose=True)
+assert len(processed) == len(df_cse)
 
 df_wbm["e_form_per_atom_legacy_from_ce"] = [
     get_e_form_per_atom(entry) for entry in tqdm(ces)
@@ -74,7 +68,9 @@
 
 
 # %%
-df_wbm["chem_sys"] = df_wbm.formula.str.replace("[0-9]+", "", regex=True).str.split()
+df_wbm["chem_sys"] = (
+    df_wbm[formula_col].str.replace("[0-9]+", "", regex=True).str.split()
+)
 df_wbm["anion"] = None
 df_wbm["anion"][df_wbm.chem_sys.astype(str).str.contains("'O'")] = "oxide"
 df_wbm["anion"][df_wbm.chem_sys.astype(str).str.contains("'S'")] = "sulfide"