Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make CSV ID column more flexible and add simple CLI #43

Merged
merged 13 commits into from
Dec 9, 2023
9 changes: 9 additions & 0 deletions src/mc_optimade/mc_optimade/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import argparse
from pathlib import Path
from mc_optimade.convert import convert_archive

def main():
    """Command-line entry point for converting an archive.

    Takes a single positional argument, ``archive_path``, parsed from the
    process command line, wraps it in a ``Path`` and hands it to
    ``convert_archive``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "archive_path",
        help="The path to the archive to ingest.",
    )
    archive = Path(cli.parse_args().archive_path)
    convert_archive(archive)
18 changes: 12 additions & 6 deletions src/mc_optimade/mc_optimade/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ def pybtex_to_optimade(bib_entry: Any, properties=None) -> EntryResource:
raise NotImplementedError


def load_csv_file(p: Path) -> dict[str, dict[str, Any]]:
def load_csv_file(p: Path, id_key: str = "id") -> dict[str, dict[str, Any]]:
"""Parses a CSV file found at path `p` and returns a dictionary
of properties keyed by ID.

Requires the `id` column to be present in the CSV file, which will
Will use the first column that contains the substring "id", which will
be matched with the generated IDs.

Returns:
Expand All @@ -30,10 +30,16 @@ def load_csv_file(p: Path) -> dict[str, dict[str, Any]]:
"""
df = pandas.read_csv(p)
if "id" not in df:
raise RuntimeError(
"CSV file {p} must have an 'id' column: not just {df.columns}"
)

id_keys = [f for f in df.columns if "id" in f.lower()]
if not id_keys:
raise RuntimeError(
f"CSV file {p} must have a column containing 'id' : not just {df.columns}"
)
id_key = id_keys[0]

# Copy found ID key and rename it to 'id'
if id_key != "id":
df["id"] = df[id_key]
df = df.set_index("id")

return df.to_dict(orient="index")
Expand Down
3 changes: 3 additions & 0 deletions src/mc_optimade/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ follow_imports = "skip"
[tool.isort]
known_first_party = "mc_optimade"
profile = "black"

[project.scripts]
optimake = "mc_optimade.cli:main"