Skip to content

Commit

Permalink
add stress unit conversion in json_to_extxyz.py
Browse files Browse the repository at this point in the history
  • Loading branch information
chiang-yuan committed Nov 22, 2023
2 parents 603bcb4 + db999fd commit 7ffaf4e
Show file tree
Hide file tree
Showing 138 changed files with 1,377 additions and 2,141 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ __pycache__
data/**/raw
data/**/tsne
!data/mp/2023-02-07-mp-elemental-reference-entries.json.gz
models/**/checkpoints

# slurm + Weights and Biases logs
wandb/
Expand Down
18 changes: 7 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,19 @@ default_install_hook_types: [pre-commit, commit-msg]

repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.290
rev: v0.1.5
hooks:
- id: ruff
args: [--fix]

- repo: https://github.com/psf/black
rev: 23.9.1
hooks:
- id: black
- id: ruff-format

- repo: https://github.com/janosh/format-ipy-cells
rev: v0.1.10
hooks:
- id: format-ipy-cells

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-case-conflict
- id: check-symlinks
Expand All @@ -34,19 +30,19 @@ repos:
- id: trailing-whitespace

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
rev: v1.7.0
hooks:
- id: mypy
additional_dependencies: [types-pyyaml, types-requests]

- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
rev: v2.2.6
hooks:
- id: codespell
stages: [commit, commit-msg]
exclude_types: [csv, json, svg]
exclude: ^(.+references.yaml|site/src/figs/.+)$
args: [--ignore-words-list, "nd,te,fpr"]
args: [--ignore-words-list, "nd,te,fpr", --check-filenames]

- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.3
Expand All @@ -60,7 +56,7 @@ repos:
exclude: ^(site/src/figs/.+\.svelte|data/wbm/20.+\..+|site/src/routes/.+\.(yaml|json)|changelog.md)$

- repo: https://github.com/pre-commit/mirrors-eslint
rev: v8.49.0
rev: v8.53.0
hooks:
- id: eslint
types: [file]
Expand Down
34 changes: 17 additions & 17 deletions changelog.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion citation.cff
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# cff-version: 1.2.0
message: If you use this software, please cite it as below.
title: Matbench Discovery
subtitle: Can machine learning identify stable crystals?
subtitle: An evaluation framework for machine learning crystal stability prediction
authors:
- given-names: Janosh
family-names: Riebesell
Expand Down
87 changes: 47 additions & 40 deletions data/figshare/1.0.0.json
Original file line number Diff line number Diff line change
@@ -1,42 +1,49 @@
{
"alignn_checkpoint": [
"https://figshare.com/ndownloader/files/41233560",
"2023-06-02-pbenner-best-alignn-model.pth.zip"
],
"mace_checkpoint": [
"https://figshare.com/ndownloader/files/42374049",
"2023-08-14-mace-yuan-trained-mptrj-04.model"
],
"mp_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344436",
"2023-02-07-mp-computed-structure-entries.json.gz"
],
"mp_elemental_ref_entries": [
"https://figshare.com/ndownloader/files/40387775",
"2023-02-07-mp-elemental-reference-entries.json.gz"
],
"mp_energies": [
"https://figshare.com/ndownloader/files/41296875",
"2023-01-10-mp-energies.csv.gz"
],
"mp_patched_phase_diagram": [
"https://figshare.com/ndownloader/files/40344451",
"2023-02-07-ppd-mp.pkl.gz"
],
"wbm_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344463",
"2022-10-19-wbm-computed-structure-entries.json.bz2"
],
"wbm_initial_structures": [
"https://figshare.com/ndownloader/files/40344466",
"2022-10-19-wbm-init-structs.json.bz2"
],
"wbm_cses_plus_init_structs": [
"https://figshare.com/ndownloader/files/40344469",
"2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
],
"wbm_summary": [
"https://figshare.com/ndownloader/files/41296866",
"2022-10-19-wbm-summary.csv.gz"
]
"files": {
"alignn_checkpoint": [
"https://figshare.com/ndownloader/files/41233560",
"2023-06-02-pbenner-best-alignn-model.pth.zip"
],
"mace_checkpoint_1": [
"https://figshare.com/ndownloader/files/42374049",
"2023-08-14-mace-yuan-trained-mptrj-04.model"
],
"mace_checkpoint_2": [
"https://figshare.com/ndownloader/files/43117273",
"2023-10-29-mace-16M-pbenner-mptrj-no-conditional-loss.model"
],
"mp_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344436",
"2023-02-07-mp-computed-structure-entries.json.gz"
],
"mp_elemental_ref_entries": [
"https://figshare.com/ndownloader/files/40387775",
"2023-02-07-mp-elemental-reference-entries.json.gz"
],
"mp_energies": [
"https://figshare.com/ndownloader/files/41296875",
"2023-01-10-mp-energies.csv.gz"
],
"mp_patched_phase_diagram": [
"https://figshare.com/ndownloader/files/40344451",
"2023-02-07-ppd-mp.pkl.gz"
],
"wbm_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344463",
"2022-10-19-wbm-computed-structure-entries.json.bz2"
],
"wbm_initial_structures": [
"https://figshare.com/ndownloader/files/40344466",
"2022-10-19-wbm-init-structs.json.bz2"
],
"wbm_cses_plus_init_structs": [
"https://figshare.com/ndownloader/files/40344469",
"2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
],
"wbm_summary": [
"https://figshare.com/ndownloader/files/41296866",
"2022-10-19-wbm-summary.csv.gz"
]
},
"article": "https://figshare.com/articles/dataset/22715158"
}
12 changes: 5 additions & 7 deletions data/mp/build_phase_diagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pymatgen.ext.matproj import MPRester
from tqdm import tqdm

from matbench_discovery import ROOT, today
from matbench_discovery import ROOT, id_col, today
from matbench_discovery.data import DATA_FILES
from matbench_discovery.energy import get_e_form_per_atom, get_elemental_ref_entries

Expand All @@ -30,7 +30,7 @@
# save all ComputedStructureEntries to disk
# mp-15590 appears twice so we drop_duplicates()
df = pd.DataFrame(all_mp_computed_structure_entries, columns=["entry"])
df.index.name = "material_id"
df.index.name = id_col
df.index = [e.entry_id for e in df.entry]
df.reset_index().to_json(
f"{module_dir}/{today}-mp-computed-structure-entries.json.gz",
Expand All @@ -40,7 +40,7 @@

# %%
data_path = f"{module_dir}/2023-02-07-mp-computed-structure-entries.json.gz"
df = pd.read_json(data_path).set_index("material_id")
df = pd.read_json(data_path).set_index(id_col)

# drop the structure and just load ComputedEntry so the PPD builds and loads faster
mp_computed_entries = [ComputedEntry.from_dict(dct) for dct in tqdm(df.entry)]
Expand All @@ -63,9 +63,7 @@


# %% build phase diagram with both MP entries + WBM entries
df_wbm = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
"material_id"
)
df_wbm = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(id_col)

# using ComputedStructureEntry vs ComputedEntry here is important as CSEs receive
# more accurate energy corrections that take into account peroxide/superoxide nature
Expand Down Expand Up @@ -104,7 +102,7 @@
json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())


df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")
df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)


# %%
Expand Down
14 changes: 7 additions & 7 deletions data/mp/get_mp_energies.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pymatviz.utils import annotate_metrics
from tqdm import tqdm

from matbench_discovery import STABILITY_THRESHOLD, today
from matbench_discovery import STABILITY_THRESHOLD, id_col, today
from matbench_discovery.data import DATA_FILES

"""
Expand All @@ -26,7 +26,7 @@

# %%
fields = {
"material_id",
id_col,
"formula_pretty",
"formation_energy_per_atom",
"energy_per_atom",
Expand All @@ -46,7 +46,7 @@


# %%
df = pd.DataFrame(docs).set_index("material_id")
df = pd.DataFrame(docs).set_index(id_col)

df_spg = pd.json_normalize(df.pop("symmetry"))[["number", "symbol"]]
df["spacegroup_symbol"] = df_spg.symbol.to_numpy()
Expand All @@ -56,7 +56,7 @@


# %%
df_cse = pd.read_json(DATA_FILES.mp_computed_structure_entries).set_index("material_id")
df_cse = pd.read_json(DATA_FILES.mp_computed_structure_entries).set_index(id_col)

struct_col = "structure"
df_cse[struct_col] = [
Expand All @@ -76,7 +76,7 @@
assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()

df.to_csv(DATA_FILES.mp_energies)
# df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")
# df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)


# %% reproduce fig. 1b from https://arxiv.org/abs/2001.10591 (as data consistency check)
Expand Down Expand Up @@ -104,14 +104,14 @@
ax = df.plot.scatter(
x="decomposition_enthalpy",
y="energy_above_hull",
color=mask_above_line.map({True: "red", False: "blue"})
color=mask_above_line.map({True: "red", False: "blue"}),
# backend="plotly",
# hover_data=["index", "formula_pretty", "formation_energy_per_atom"],
)
# most points lie on the line y = x for x > 0 and on y = 0 for x < 0.
n_above_line = sum(mask_above_line)
ax.set(
title=f"{n_above_line:,} / {len(df):,} = {n_above_line/len(df):.1%} "
title=f"{n_above_line:,} / {len(df):,} = {n_above_line / len(df):.1%} "
"MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
)
# ax.figure.savefig(f"{module_dir}/mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
30 changes: 13 additions & 17 deletions data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# %%
import gzip
import json
import warnings

import pandas as pd
from pymatgen.entries.compatibility import (
Expand All @@ -11,7 +10,7 @@
from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
from tqdm import tqdm

from matbench_discovery import ROOT, today
from matbench_discovery import ROOT, formula_col, id_col, today
from matbench_discovery.data import DATA_FILES, df_wbm
from matbench_discovery.energy import get_e_form_per_atom
from matbench_discovery.plots import plt
Expand All @@ -23,9 +22,7 @@
"""


df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
"material_id"
)
df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(id_col)

cses = [
ComputedStructureEntry.from_dict(dct)
Expand All @@ -35,14 +32,11 @@
ces = [ComputedEntry.from_dict(dct) for dct in tqdm(df_cse.computed_structure_entry)]


warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")


# %%
out = MaterialsProject2020Compatibility().process_entries(cses, verbose=True)
assert len(out) == len(df_cse)
out = MaterialsProject2020Compatibility().process_entries(ces, verbose=True)
assert len(out) == len(df_cse)
processed = MaterialsProject2020Compatibility().process_entries(cses, verbose=True)
assert len(processed) == len(df_cse)
processed = MaterialsProject2020Compatibility().process_entries(ces, verbose=True)
assert len(processed) == len(df_cse)

df_wbm["e_form_per_atom_mp2020_from_ce"] = [
get_e_form_per_atom(entry) for entry in tqdm(ces)
Expand All @@ -58,10 +52,10 @@


# %%
out = MaterialsProjectCompatibility().process_entries(cses, verbose=True)
assert len(out) == len(df_cse)
out = MaterialsProjectCompatibility().process_entries(ces, verbose=True)
assert len(out) == len(df_cse)
processed = MaterialsProjectCompatibility().process_entries(cses, verbose=True)
assert len(processed) == len(df_cse)
processed = MaterialsProjectCompatibility().process_entries(ces, verbose=True)
assert len(processed) == len(df_cse)

df_wbm["e_form_per_atom_legacy_from_ce"] = [
get_e_form_per_atom(entry) for entry in tqdm(ces)
Expand All @@ -74,7 +68,9 @@


# %%
df_wbm["chem_sys"] = df_wbm.formula.str.replace("[0-9]+", "", regex=True).str.split()
df_wbm["chem_sys"] = (
df_wbm[formula_col].str.replace("[0-9]+", "", regex=True).str.split()
)
df_wbm["anion"] = None
df_wbm["anion"][df_wbm.chem_sys.astype(str).str.contains("'O'")] = "oxide"
df_wbm["anion"][df_wbm.chem_sys.astype(str).str.contains("'S'")] = "sulfide"
Expand Down
Loading

0 comments on commit 7ffaf4e

Please sign in to comment.