Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dandi export fixes #1095

Merged
merged 20 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dj.FreeTable(dj.conn(), "common_session.session_group").drop()
- Enforce match between ingested nwb probe geometry and existing table entry
#1074
- Update DataJoint install and password instructions #1131
- Fix dandi upload process for NWB files with video or linked objects #1095

### Pipelines

Expand Down
70 changes: 61 additions & 9 deletions src/spyglass/common/common_dandi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import shutil
from pathlib import Path
from typing import Optional

import datajoint as dj
import fsspec
Expand All @@ -9,7 +10,7 @@
from fsspec.implementations.cached import CachingFileSystem

from spyglass.common.common_usage import Export, ExportSelection
from spyglass.settings import export_dir
from spyglass.settings import export_dir, raw_dir
from spyglass.utils import SpyglassMixin, logger
from spyglass.utils.sql_helper_fn import SQLDumpHelper

Expand All @@ -21,7 +22,8 @@
from dandi.consts import known_instances
from dandi.dandiapi import DandiAPIClient
from dandi.metadata.nwb import get_metadata
from dandi.organize import OrganizeInvalid
from dandi.organize import CopyMode, FileOperationMode, OrganizeInvalid
from dandi.pynwb_utils import nwb_has_external_links
from dandi.validate_types import Severity

except (ImportError, ModuleNotFoundError) as e:
Expand All @@ -31,8 +33,11 @@
DandiAPIClient,
get_metadata,
OrganizeInvalid,
CopyMode,
FileOperationMode,
Severity,
) = [None] * 6
nwb_has_external_links,
) = [None] * 9
logger.warning(e)


Expand Down Expand Up @@ -87,6 +92,7 @@ def compile_dandiset(
dandiset_id: str,
dandi_api_key: str = None,
dandi_instance: str = "dandi",
skip_raw_files: bool = False,
):
"""Compile a Dandiset from the export.
Parameters
Expand All @@ -100,6 +106,8 @@ def compile_dandiset(
DANDI_API_KEY is set.
dandi_instance : str, optional
What instance of Dandi the dandiset is on. Defaults to dev server.
skip_raw_files : bool, optional
Dev tool to skip raw files in the export. Defaults to False.
"""
key = (Export & key).fetch1("KEY")
paper_id = (Export & key).fetch1("paper_id")
Expand Down Expand Up @@ -137,9 +145,14 @@ def compile_dandiset(

os.makedirs(destination_dir, exist_ok=False)
for file in source_files:
if not os.path.exists(
f"{destination_dir}/{os.path.basename(file)}"
):
if os.path.exists(f"{destination_dir}/{os.path.basename(file)}"):
continue
if skip_raw_files and raw_dir in file:
continue
# copy the file if it has external links so can be safely edited
if nwb_has_external_links(file):
shutil.copy(file, f"{destination_dir}/{os.path.basename(file)}")
else:
os.symlink(file, f"{destination_dir}/{os.path.basename(file)}")

# validate the dandiset
Expand All @@ -154,11 +167,16 @@ def compile_dandiset(

# organize the files in the dandiset directory
dandi.organize.organize(
destination_dir, dandiset_dir, invalid=OrganizeInvalid.WARN
destination_dir,
dandiset_dir,
update_external_file_paths=True,
invalid=OrganizeInvalid.FAIL,
media_files_mode=CopyMode.SYMLINK,
files_mode=FileOperationMode.COPY,
)

# get the dandi name translations
translations = translate_name_to_dandi(destination_dir)
translations = lookup_dandi_translation(destination_dir, dandiset_dir)

# upload the dandiset to the dandi server
if dandi_api_key:
Expand Down Expand Up @@ -200,7 +218,7 @@ def write_mysqldump(self, export_key: dict):
docker_id=None,
spyglass_version=spyglass_version,
)
sql_dump.write_mysqldump(self & key, file_suffix="_dandi")
sql_dump.write_mysqldump([self & key], file_suffix="_dandi")


def _get_metadata(path):
Expand Down Expand Up @@ -229,6 +247,7 @@ def translate_name_to_dandi(folder):
dict
dictionary of filename to dandi_path translations
"""

files = Path(folder).glob("*")
metadata = list(map(_get_metadata, files))
metadata, skip_invalid = dandi.organize.filter_invalid_metadata_rows(
Expand All @@ -243,6 +262,39 @@ def translate_name_to_dandi(folder):
]


def lookup_dandi_translation(source_dir: str, dandiset_dir: str):
    """Map each nwb file in ``source_dir`` to its path within the
    organized dandiset directory.

    Matching is done by NWB file ``object_id`` rather than by filename,
    because ``dandi.organize`` renames files when building the dandiset.

    Parameters
    ----------
    source_dir : str
        location of the source files
    dandiset_dir : str
        location of the organized dandiset directory

    Returns
    -------
    dict
        dictionary of filename to dandi_path translations

    Raises
    ------
    KeyError
        if a source nwb file's object_id has no match in the dandiset
    """
    # get the obj_id and dandipath for each nwb file in the dandiset
    dandi_name_dict = {}
    for dandi_file in Path(dandiset_dir).rglob("*.nwb"):
        dandi_path = dandi_file.relative_to(dandiset_dir).as_posix()
        with pynwb.NWBHDF5IO(dandi_file, "r") as io:
            nwb = io.read()
            dandi_name_dict[nwb.object_id] = dandi_path
    # for each nwb file in the source_dir, look up the dandipath by obj_id.
    # Restrict to *.nwb (was glob("*")) for symmetry with the dandiset loop:
    # non-nwb files cannot be opened by pynwb and could never match an
    # entry in dandi_name_dict anyway.
    name_translation = {}
    for file in Path(source_dir).glob("*.nwb"):
        with pynwb.NWBHDF5IO(file, "r") as io:
            nwb = io.read()
            obj_id = nwb.object_id
            if obj_id not in dandi_name_dict:
                # fail with context instead of a bare KeyError on obj_id
                raise KeyError(
                    f"No organized dandiset entry found for {file.name}"
                )
            name_translation[file.name] = dandi_name_dict[obj_id]
    return name_translation


def validate_dandiset(
folder, min_severity="ERROR", ignore_external_files=False
):
Expand Down
17 changes: 17 additions & 0 deletions src/spyglass/common/common_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
unique_dicts,
update_analysis_for_dandi_standard,
)
from spyglass.utils.nwb_helper_fn import get_linked_nwbs
from spyglass.utils.sql_helper_fn import SQLDumpHelper

schema = dj.schema("common_usage")
Expand Down Expand Up @@ -236,6 +237,7 @@ class File(SpyglassMixin, dj.Part):

def populate_paper(self, paper_id: Union[str, dict]):
"""Populate Export for a given paper_id."""
self.load_shared_schemas()
if isinstance(paper_id, dict):
paper_id = paper_id.get("paper_id")
self.populate(ExportSelection().paper_export_id(paper_id))
Expand Down Expand Up @@ -272,6 +274,21 @@ def make(self, key):
query.list_file_paths(paper_key) + restr_graph.file_paths
)

# Check for linked nwb objects and add them to the export
unlinked_files = set()
for file in file_paths:
if not (links := get_linked_nwbs(file["file_path"])):
unlinked_files.add(file)
continue
logger.warning(
"Dandi not yet supported for linked nwb objects "
+ f"excluding {file['file_path']} from export "
+ f" and including {links} instead"
)
for link in links:
unlinked_files.add(link)
file_paths = {"file_path": link for link in unlinked_files}

table_inserts = [
{**key, **rd, "table_id": i}
for i, rd in enumerate(restr_graph.as_dict)
Expand Down
15 changes: 12 additions & 3 deletions src/spyglass/utils/dj_helper_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ def get_child_tables(table):
def update_analysis_for_dandi_standard(
filepath: str,
age: str = "P4M/P8M",
resolve_external_table: bool = True,
):
"""Function to resolve common nwb file format errors within the database

Expand All @@ -364,6 +365,9 @@ def update_analysis_for_dandi_standard(
abs path to the file to edit
age : str, optional
age to assign animal if missing, by default "P4M/P8M"
resolve_external_table : bool, optional
whether to update the external table. Set False if editing file
outside the database, by default True
"""
from spyglass.common import LabMember

Expand Down Expand Up @@ -394,7 +398,7 @@ def update_analysis_for_dandi_standard(
)
file["/general/subject/species"][()] = new_species_value

if not (
elif not (
len(species_value.split(" ")) == 2 or "NCBITaxon" in species_value
):
raise ValueError(
Expand Down Expand Up @@ -427,7 +431,9 @@ def update_analysis_for_dandi_standard(
file["/general/experimenter"][:] = new_experimenter_value

# update the datajoint external store table to reflect the changes
_resolve_external_table(filepath, file_name)
if resolve_external_table:
location = "raw" if filepath.endswith("_.nwb") else "analysis"
_resolve_external_table(filepath, file_name, location)


def dandi_format_names(experimenter: List) -> List:
Expand Down Expand Up @@ -510,7 +516,10 @@ def make_file_obj_id_unique(nwb_path: str):
new_id = str(uuid4())
with h5py.File(nwb_path, "a") as f:
f.attrs["object_id"] = new_id
_resolve_external_table(nwb_path, nwb_path.split("/")[-1])
location = "raw" if nwb_path.endswith("_.nwb") else "analysis"
_resolve_external_table(
nwb_path, nwb_path.split("/")[-1], location=location
)
return new_id


Expand Down
18 changes: 18 additions & 0 deletions src/spyglass/utils/nwb_helper_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ def get_nwb_file(nwb_file_path):
from ..common.common_dandi import DandiPath

dandi_key = {"filename": os.path.basename(nwb_file_path)}
if not DandiPath() & dandi_key:
# Check if non-copied raw file is in Dandi
dandi_key = {
"filename": Path(nwb_file_path).name.replace(
"_.nwb", ".nwb"
)
}

if not DandiPath & dandi_key:
# If not in Dandi, then we can't find the file
raise FileNotFoundError(
Expand Down Expand Up @@ -101,6 +109,16 @@ def file_from_dandi(filepath):
return False


def get_linked_nwbs(path):
    """Return a list of paths to NWB files that are linked by objects in
    the file at the given path.

    NOTE(review): relies on pynwb's private ``_HDF5IO__built`` registry of
    file handles opened during the read (including externally linked
    files) — confirm this attribute against the pinned pynwb version.
    """
    with pynwb.NWBHDF5IO(path, "r") as io:
        # Reading forces pynwb to open any externally linked files,
        # registering them alongside the file itself.
        io.read()
        return [linked for linked in io._HDF5IO__built if linked != path]
edeno marked this conversation as resolved.
Show resolved Hide resolved


def get_config(nwb_file_path, calling_table=None):
"""Return a dictionary of config settings for the given NWB file.
If the file does not exist, return an empty dict.
Expand Down