From 380206811eaba182f61d8bbcc8af4c76d34ebe7f Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Mon, 9 Dec 2024 01:09:26 -0500
Subject: [PATCH 01/27] update proc status file schema

- match column names to
  https://nipoppy.readthedocs.io/en/latest/schemas/index.html#imaging-bagel-file
- make pipeline_starttime optional
- replace "PHASE__" & "STAGE__" cols with "pipeline_step" & "status" cols
- simplify column descriptions
---
 schemas/bagel_schema.json | 40 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/schemas/bagel_schema.json b/schemas/bagel_schema.json
index ae9f3b7..c7c85fa 100644
--- a/schemas/bagel_schema.json
+++ b/schemas/bagel_schema.json
@@ -1,23 +1,29 @@
 {
   "GLOBAL_COLUMNS": {
     "participant_id": {
-      "Description": "Participant identifier within a given dataset.",
+      "Description": "Participant identifier.",
       "dtype": "str",
       "IsRequired": true,
       "IsPrefixedColumn": false
     },
-    "bids_id": {
-      "Description": "BIDS dataset identifier for a participant, if available/different from the participant_id.",
+    "bids_participant_id": {
+      "Description": "BIDS-compliant participant identifier.",
       "dtype": "str",
       "IsRequired": false,
       "IsPrefixedColumn": false
     },
-    "session": {
-      "Description": "Participant session ID.",
+    "session_id": {
+      "Description": "Participant session identifier.",
       "dtype": "str",
       "IsRequired": true,
       "IsPrefixedColumn": false
     },
+    "bids_session_id": {
+      "Description": "BIDS-compliant session identifier.",
+      "dtype": "str",
+      "IsRequired": false,
+      "IsPrefixedColumn": false
+    },
     "has_mri_data": {
       "Description": "Whether or not participant had MRI data acquired in a given session.",
       "dtype": "bool",
       "IsRequired": false,
       "Range": [true, false],
       "IsPrefixedColumn": false
     },
     "HAS_DATATYPE__": {
       "Description": "Whether or not participant session has specified raw BIDS datatype. Column suffix should correspond to a specific BIDS subdirectory. e.g., 'HAS_DATATYPE__anat'",
       "dtype": "bool",
       "IsRequired": false,
       "Range": [true, false],
       "IsPrefixedColumn": true
     },
     "HAS_IMAGE__": {
       "Description": "Whether or not participant session has specified imaging file. Column suffix should correspond to a BIDS file suffix. e.g. 'HAS_IMAGE__T1w'",
       "dtype": "bool",
       "IsRequired": false,
       "Range": [true, false],
       "IsPrefixedColumn": true
     },
     "pipeline_name": {
-      "Description": "Name of a pipeline that was run for the participant, if applicable. Example value: 'freesurfer'",
+      "Description": "Name of the pipeline that was run.",
       "dtype": "str",
       "IsRequired": true,
       "MissingValue": "UNAVAILABLE",
       "IsPrefixedColumn": false
     },
     "pipeline_version": {
-      "description": "Version of pipeline that was run. Must have a value if the value for 'pipeline_name' is not 'UNAVAILABLE'. Example value: '7.3.0'",
+      "description": "Version of the pipeline that was run. Must have a value if the value for 'pipeline_name' is not 'UNAVAILABLE'.",
       "dtype": "str",
       "IsRequired": true,
       "MissingValue": "UNAVAILABLE",
       "IsPrefixedColumn": false
     },
     "pipeline_starttime": {
       "Description": "Date/time that pipeline run was started. In format of 'YYYY-MM-DD HH:MM:SS'.",
       "dtype": "str",
-      "IsRequired": true,
+      "IsRequired": false,
       "MissingValue": "UNAVAILABLE",
       "IsPrefixedColumn": false
     },
     "pipeline_endtime": {
       "Description": "Date/time that pipeline run ended. In format of 'YYYY-MM-DD HH:MM:SS'.",
       "dtype": "str",
       "IsRequired": false,
       "MissingValue": "UNAVAILABLE",
       "IsPrefixedColumn": false
     }
   },
   "PIPELINE_STATUS_COLUMNS": {
-    "pipeline_complete": {
-      "Description": "Status of pipeline run. 'SUCCESS': All expected pipeline output files (as configured by pipeline tracker) are present. 'FAIL': At least one expected pipeline output is missing. 'INCOMPLETE': Pipeline has not been run for the subject session (output directory missing). 'UNAVAILABLE': Relevant MRI modality for pipeline not available for subject session.",
+    "pipeline_step": {
+      "Description": "Name of a specific step in a pipeline.",
       "dtype": "str",
       "IsRequired": true,
-      "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE"],
       "IsPrefixedColumn": false
     },
-    "PHASE__": {
-      "Description": "Completion status of tracker-specified phase/stream of a pipeline. To be grouped to the relevant pipeline, a second prefix denoting {pipeline_name}-{pipeline_version} must be included, e.g., 'PHASE__fmriprep-20.2.7__func'. 
Each phase may correspond to a specific output subdirectory or a set of related outputs. If phase and stage columns are both present, each phase is expected to correspond to >= 1 stage. 'SUCCESS': All output files corresponding to phase are present. 'FAIL': At least one output file of phase is missing. This status may be used to indicate that the phase crashed. 'INCOMPLETE': Parent pipeline has not been run for the subject session. 'UNAVAILABLE': Relevant MRI modality for pipeline not available for subject session. '' (no value): Specified phase not in pipeline described by current row/record.", + "status": { + "Description": "Completion status of the pipeline run or step for the subject-session pair. 'SUCCESS': All output files are present. 'FAIL': At least one output file is missing. 'INCOMPLETE': Parent pipeline has not been run for the subject session. 'UNAVAILABLE': Relevant MRI modality for pipeline not available for subject session.", "dtype": "str", - "IsRequired": false, - "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE", ""], - "IsPrefixedColumn": true - }, - "STAGE__": { - "Description": "Completion status of tracker-specified stage of a pipeline. To be grouped to the relevant pipeline, a second prefix denoting {pipeline_name}-{pipeline_version} must be included, e.g., 'STAGE__fmriprep-20.2.7__space-MNI152Lin_res-1'. Each stage may correspond to a single output file, subdirectory, or a few related output files. If phase and stage columns are both present, each phase is expected to correspond to >= 1 stage. 'SUCCESS': All output files corresponding to stage are present. 'FAIL': At least one output file of stage is missing. This status may be used to indicate that the stage crashed. 'INCOMPLETE': Parent pipeline has not been run for the subject session. 'UNAVAILABLE': Relevant MRI modality for pipeline not available for subject session. 
'' (no value): Specified stage not in pipeline described by current row/record.", - "dtype": "str", - "IsRequired": false, + "IsRequired": true, "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE", ""], "IsPrefixedColumn": true } From 38bff3a6f57edf67caf439f46c6f9493618e0e5d Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 9 Dec 2024 01:32:28 -0500 Subject: [PATCH 02/27] remove columns about raw imaging data and IsPrefixedColumn property - "HAS_DATATYPE__", "HAS_IMAGE__" --- schemas/bagel_schema.json | 51 ++++++++------------------------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/schemas/bagel_schema.json b/schemas/bagel_schema.json index c7c85fa..4e3dccc 100644 --- a/schemas/bagel_schema.json +++ b/schemas/bagel_schema.json @@ -3,90 +3,59 @@ "participant_id": { "Description": "Participant identifier.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true }, "bids_participant_id": { "Description": "BIDS-compliant participant identifier.", "dtype": "str", - "IsRequired": false, - "IsPrefixedColumn": false + "IsRequired": false }, "session_id": { "Description": "Participant session identifier.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true }, "bids_session_id": { "Description": "BIDS-compliant session identifier.", "dtype": "str", - "IsRequired": false, - "IsPrefixedColumn": false - }, - "has_mri_data": { - "Description": "Whether or not participant had MRI data acquired in a given session.", - "dtype": "bool", - "IsRequired": false, - "Range": [true, false], - "IsPrefixedColumn": false - }, - "HAS_DATATYPE__": { - "Description": "Whether or not participant session has specified raw BIDS datatype. Column suffix should correspond to a specific BIDS subdirectory. e.g., 'HAS_DATATYPE__anat'", - "dtype": "bool", - "IsRequired": false, - "Range": [true, false], - "IsPrefixedColumn": true - }, - "HAS_IMAGE__": { - "Description": "Whether or not participant session has specified imaging file. Column suffix should correspond to a BIDS file suffix. e.g. 'HAS_IMAGE__T1w'", - "dtype": "bool", - "IsRequired": false, - "Range": [true, false], - "IsPrefixedColumn": true + "IsRequired": false }, "pipeline_name": { "Description": "Name of the pipeline that was run.", "dtype": "str", "IsRequired": true, - "MissingValue": "UNAVAILABLE", - "IsPrefixedColumn": false + "MissingValue": "UNAVAILABLE" }, "pipeline_version": { "description": "Version of the pipeline that was run. Must have a value if the value for 'pipeline_name' is not 'UNAVAILABLE'.", "dtype": "str", "IsRequired": true, - "MissingValue": "UNAVAILABLE", - "IsPrefixedColumn": false + "MissingValue": "UNAVAILABLE" }, "pipeline_starttime": { "Description": "Date/time that pipeline run was started. In format of 'YYYY-MM-DD HH:MM:SS'.", "dtype": "str", "IsRequired": false, - "MissingValue": "UNAVAILABLE", - "IsPrefixedColumn": false + "MissingValue": "UNAVAILABLE" }, "pipeline_endtime": { "Description": "Date/time that pipeline run ended. In format of 'YYYY-MM-DD HH:MM:SS'.", "dtype": "str", "IsRequired": false, - "MissingValue": "UNAVAILABLE", - "IsPrefixedColumn": false + "MissingValue": "UNAVAILABLE" } }, "PIPELINE_STATUS_COLUMNS": { "pipeline_step": { "Description": "Name of a specific step in a pipeline.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true }, "status": { "Description": "Completion status of the pipeline run or step for the subject-session pair. 'SUCCESS': All output files are present. 
'FAIL': At least one output file is missing. 'INCOMPLETE': Parent pipeline has not been run for the subject session. 'UNAVAILABLE': Relevant MRI modality for pipeline not available for subject session.",
       "dtype": "str",
       "IsRequired": true,
-      "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE", ""],
-      "IsPrefixedColumn": true
+      "Range": ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE", ""]
     }
   }
 }
\ No newline at end of file

From 4dd3d1f6c9df6840dd7a6b3f618cc9b0f6d79624 Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Mon, 9 Dec 2024 02:01:01 -0500
Subject: [PATCH 03/27] require TSV inputs instead of CSVs

---
 digest/app.py     |  2 +-
 digest/layout.py  |  4 ++--
 digest/utility.py | 20 ++++++++++----------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/digest/app.py b/digest/app.py
index a4b7911..5a1de47 100644
--- a/digest/app.py
+++ b/digest/app.py
@@ -188,7 +188,7 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames):
         {"type": schema, "data": overview_df.to_dict("records")},
         pipelines_dict,
         None,
-        "csv",
+        "csv",  # NOTE: "tsv" does not appear to be an option for export_format
     )

diff --git a/digest/layout.py b/digest/layout.py
index 2917789..a950d7c 100644
--- a/digest/layout.py
+++ b/digest/layout.py
@@ -101,7 +101,7 @@ def upload_buttons() -> list:
     upload_imaging = dcc.Upload(
         id={"type": "upload-data", "index": "imaging", "btn_idx": 0},
         children=dbc.Button(
-            "Select imaging CSV file...",
+            "Select imaging TSV file...",
             color="light",
         ),
         multiple=False,
@@ -110,7 +110,7 @@ def upload_buttons() -> list:
     upload_phenotypic = dcc.Upload(
         id={"type": "upload-data", "index": "phenotypic", "btn_idx": 1},
         children=dbc.Button(
-            "Select phenotypic CSV file...",
+            "Select phenotypic TSV file...",
             color="light",
         ),
         multiple=False,

diff --git a/digest/utility.py b/digest/utility.py
index 7aeed50..bdf4031 100644
--- a/digest/utility.py
+++ b/digest/utility.py
@@ -11,7 +11,7 @@
 BAGEL_CONFIG = {
     "imaging": {
         "schema_file": "bagel_schema.json",
-        "overview_col": "pipeline_complete",
+        "overview_col": "status",
     },
     "phenotypic": {
         "schema_file": "bagel_schema_pheno.json",
@@ -244,31 +244,31 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame:
 def load_file_from_path(
     file_path: Path,
 ) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
-    """Reads in a CSV file (if it exists) and returns it as a dataframe."""
+    """Reads in a TSV file (if it exists) and returns it as a dataframe."""
     if not file_path.exists():
         return None, "File not found."

-    bagel = pd.read_csv(file_path)
+    bagel = pd.read_csv(file_path, sep="\t")
     return bagel, None


 def load_file_from_contents(
     filename: str, contents: str
 ) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
-    """Returns contents of an uploaded CSV file as a dataframe."""
-    if not filename.endswith(".csv"):
-        return None, "Invalid file type. Please upload a .csv file."
+    """Returns contents of an uploaded TSV file as a dataframe."""
+    if not filename.endswith(".tsv"):
+        return None, "Invalid file type. Please upload a .tsv file."
     content_type, content_string = contents.split(",")
     decoded = base64.b64decode(content_string)
-    bagel = pd.read_csv(io.StringIO(decoded.decode("utf-8")))
+    bagel = pd.read_csv(io.StringIO(decoded.decode("utf-8")), sep="\t")
     return bagel, None


 def get_schema_validation_errors(
     bagel: pd.DataFrame, schema: str
 ) -> Optional[str]:
-    """Checks that the input CSV adheres to the schema for the selected bagel type. 
If not, returns an informative error message as a string.""" + """Checks that the input file adheres to the schema for the selected bagel type. If not, returns an informative error message as a string.""" error_msg = None # Get the columns that uniquely identify a participant-session's value for an event, @@ -287,7 +287,7 @@ def get_schema_validation_errors( ) > 0 ): - error_msg = f"The selected CSV is missing the following required {schema} metadata columns: {missing_req_cols}. Please try again." + error_msg = f"The selected TSV is missing the following required {schema} metadata columns: {missing_req_cols}. Please try again." elif ( get_duplicate_entries( data=bagel, subset=unique_value_id_columns @@ -295,7 +295,7 @@ def get_schema_validation_errors( > 0 ): # TODO: Switch to warning once alerts are implemented for errors? - error_msg = f"The selected CSV contains duplicate entries in the combination of: {unique_value_id_columns}. Please double check your input." + error_msg = f"The selected TSV contains duplicate entries in the combination of: {unique_value_id_columns}. Please double check your input." return error_msg From 1b14759a0cd600926994487815ffdfe91eaaeca1 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 9 Dec 2024 03:20:07 -0500 Subject: [PATCH 04/27] align phenotypic file schema with proc status file changes - remove "IsPrefixedColumn" - rename ID columns & update their descriptions --- schemas/bagel_schema_pheno.json | 34 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/schemas/bagel_schema_pheno.json b/schemas/bagel_schema_pheno.json index 9f5633e..140c345 100644 --- a/schemas/bagel_schema_pheno.json +++ b/schemas/bagel_schema_pheno.json @@ -1,55 +1,47 @@ { "GLOBAL_COLUMNS": { "participant_id": { - "Description": "Participant identifier within a given dataset.", + "Description": "Participant identifier.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true }, - "bids_id": { - "Description": "BIDS dataset identifier for a participant, if available/different from the participant_id.", + "bids_participant_id": { + "Description": "BIDS-compliant participant identifier.", "dtype": "str", - "IsRequired": false, - "IsPrefixedColumn": false + "IsRequired": false }, - "session": { - "Description": "Participant session ID.", + "session_id": { + "Description": "Participant session identifier.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true }, "has_mri_data": { "Description": "Whether or not participant had MRI data acquired in a given session.", "dtype": "bool", "IsRequired": false, - "Range": [true, false], - "IsPrefixedColumn": false + "Range": [true, false] }, "assessment_name": { "Description": "Name of an assessment or subscale completed by the participant.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true }, "assessment_version": { "Description": "Version of assessment. Should be defined if more than one assessment of the same name is present.", "dtype": "str", - "IsRequired": false, - "IsPrefixedColumn": false + "IsRequired": false }, "assessment_datetime": { "Description": "Date/time that assessment was completed. 
In format of 'YYYY-MM-DD HH:MM:SS'.", "dtype": "str", - "IsRequired": false, - "IsPrefixedColumn": false + "IsRequired": false } }, "ASSESSMENT_SPECIFIC_COLUMNS": { "assessment_score": { "Description": "Score of a given participant corresponding to an assessment.", "dtype": "str", - "IsRequired": true, - "IsPrefixedColumn": false + "IsRequired": true } } } \ No newline at end of file From 669adda4e1f8b69be1d02efbf388f12aced975f3 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 9 Dec 2024 03:34:15 -0500 Subject: [PATCH 05/27] update id column references and handling to reflect new names - refactor out primary session column name into a variable --- digest/app.py | 11 ++++++----- digest/plotting.py | 9 +++++---- digest/utility.py | 43 ++++++++++++++++++++++++++++--------------- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/digest/app.py b/digest/app.py index 5a1de47..55c93eb 100644 --- a/digest/app.py +++ b/digest/app.py @@ -7,6 +7,7 @@ import pandas as pd from dash import ALL, Dash, ctx, dcc, html from dash.dependencies import Input, Output, State +from utility import PRIMARY_SESSION from . import plotting as plot from . import utility as util @@ -156,9 +157,9 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames): # TODO: Any existing NaNs will currently be turned into "nan". (See open issue https://github.com/pandas-dev/pandas/issues/25353) # Another side effect of allowing NaN sessions is that if this column has integer values, they will be read in as floats # (before being converted to str) if there are NaNs in the column. - # This should not be a problem after we disallow NaNs value in "participant_id" and "session" columns, https://github.com/neurobagel/digest/issues/20 - bagel["session"] = bagel["session"].astype(str) - session_list = bagel["session"].unique().tolist() + # This should not be a problem after we disallow NaNs value in "participant_id" and "session_id" columns, https://github.com/neurobagel/digest/issues/20 + bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) + session_list = bagel[PRIMARY_SESSION].unique().tolist() overview_df = util.get_pipelines_overview( bagel=bagel, schema=schema @@ -512,7 +513,7 @@ def display_phenotypic_column_dropdown(parsed_data): # exclude unique participant identifier columns from visualization if column not in [ "participant_id", - "bids_id", + "bids_participant_id", ]: # TODO: Consider storing these column names in a constant column_options.append({"label": column, "value": column}) @@ -552,7 +553,7 @@ def plot_phenotypic_column( data_to_plot = virtual_data if session_switch_value: - color = "session" + color = PRIMARY_SESSION else: color = None diff --git a/digest/plotting.py b/digest/plotting.py index 96fa876..b355fdd 100644 --- a/digest/plotting.py +++ b/digest/plotting.py @@ -5,6 +5,7 @@ import pandas as pd import plotly.express as px import plotly.graph_objects as go +from utility import PRIMARY_SESSION from . 
import utility as util @@ -60,28 +61,28 @@ def plot_pipeline_status_by_participants( ) -> go.Figure: status_counts = ( transform_active_data_to_long(data) - .groupby(["pipeline_name", "pipeline_complete", "session"]) + .groupby(["pipeline_name", "pipeline_complete", PRIMARY_SESSION]) .size() .reset_index(name="participants") ) fig = px.bar( status_counts, - x="session", + x=PRIMARY_SESSION, y="participants", color="pipeline_complete", text_auto=True, facet_col="pipeline_name", category_orders={ "pipeline_complete": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), - "session": session_list, + PRIMARY_SESSION: session_list, }, color_discrete_map=STATUS_COLORS, labels={ "pipeline_name": "Pipeline", "participants": "Participants (n)", "pipeline_complete": "Processing status", - "session": "Session", + PRIMARY_SESSION: "Session", }, title="All participant pipeline statuses by session", ) diff --git a/digest/utility.py b/digest/utility.py index bdf4031..88bc18a 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -18,12 +18,14 @@ "overview_col": "assessment_score", }, } +# TODO: Update PIPE_COMPLETE_STATUS_SHORT_DESC = { "SUCCESS": "All expected output files of pipeline are present.", "FAIL": "At least one expected output of pipeline is missing.", "INCOMPLETE": "Pipeline has not yet been run (output directory not available).", "UNAVAILABLE": "Relevant MRI modality for pipeline not available.", } +PRIMARY_SESSION = "session_id" # TODO: # Could also use URLs for "imaging" or "phenotypic" locations if fetching from a remote repo doesn't slow things down too much. @@ -59,7 +61,7 @@ def reset_column_dtypes(data: pd.DataFrame) -> pd.DataFrame: stream.close() # Just in case, convert session labels back to strings (will avoid sessions being undesirably treated as continuous data in e.g., plots) - data_retyped["session"] = data_retyped["session"].astype(str) + data_retyped[PRIMARY_SESSION] = data_retyped[PRIMARY_SESSION].astype(str) return data_retyped @@ -91,7 +93,7 @@ def construct_summary_str(data: pd.DataFrame) -> str: """Creates summary of key counts for dataset.""" return f"""Total number of participants: {count_unique_subjects(data)} Total number of unique records (participant-session pairs): {count_unique_records(data)} -Total number of unique sessions: {data["session"].nunique()}""" +Total number of unique sessions: {data[PRIMARY_SESSION].nunique()}""" def get_required_bagel_columns(schema_file: str) -> list: @@ -126,7 +128,7 @@ def get_event_id_columns( When there is only one relevant column, we return a string instead of a list to avoid grouper problems when the column name is used in pandas groupby. """ if schema == "imaging": - return ["pipeline_name", "pipeline_version"] + return ["pipeline_name", "pipeline_version", "pipeline_step"] if schema == "phenotypic": return ( ["assessment_name", "assessment_version"] @@ -136,6 +138,7 @@ def get_event_id_columns( return None +# TODO: Generalize function name to include both assessments and pipelines (e.g., extract_measures or extract_modules)? 
def extract_pipelines(bagel: pd.DataFrame, schema: str) -> dict: """Get data for each unique pipeline in the aggregate input as an individual labelled dataframe.""" pipelines_dict = {} @@ -148,8 +151,9 @@ def extract_pipelines(bagel: pd.DataFrame, schema: str) -> dict: pipelines = bagel.groupby(by=groupby, sort=sort) if isinstance(groupby, list): - for (name, version), pipeline in pipelines: - label = f"{name}-{version}" + for pipeline_ids, pipeline in pipelines: + # Construct a unique identifier for the pipeline/assessment + label = "-".join(pipeline_ids) # per pipeline, sort by participant_id (not sorting by session_id here to avoid disrupting chronological order) pipelines_dict[label] = ( pipeline.sort_values(["participant_id"]) @@ -169,13 +173,17 @@ def extract_pipelines(bagel: pd.DataFrame, schema: str) -> dict: return pipelines_dict +# TODO: Revisit if we need to consider all the ID columns here, and if the order matters def get_id_columns(data: pd.DataFrame) -> list: """Returns names of columns which identify a given participant record""" - return ( - ["participant_id", "bids_id", "session"] - if "bids_id" in data.columns - else ["participant_id", "session"] - ) + id_columns = ["participant_id", "session_id"] + + if "bids_participant_id" in data.columns: + id_columns.append("bids_participant_id") + if "bids_session_id" in data.columns: + id_columns.append("bids_session_id") + + return id_columns def get_duplicate_entries(data: pd.DataFrame, subset: list) -> pd.DataFrame: @@ -193,11 +201,16 @@ def count_unique_subjects(data: pd.DataFrame) -> int: def count_unique_records(data: pd.DataFrame) -> int: """Returns number of unique participant-session pairs.""" - if set(["participant_id", "session"]).issubset(data.columns): - return data[["participant_id", "session"]].drop_duplicates().shape[0] + if set(["participant_id", PRIMARY_SESSION]).issubset(data.columns): + return ( + data[["participant_id", PRIMARY_SESSION]] + .drop_duplicates() + .shape[0] + ) return 0 +# TODO: Generalize function name to include both assessments and pipelines (e.g., extract_measures or extract_modules)? 
def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame: """ Constructs a wide format dataframe from the long format input file, @@ -229,9 +242,9 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame: pipeline_complete_df = ( # Enforce original order of sessions as they appear in input (pivot automatically sorts them) # NOTE: .reindex only works correctly when there are no NaN values in the index level - # (Here, the entire "session" column should have already been cast to a string) + # (Here, the entire "session_id" column should have already been cast to a string) pipeline_complete_df.reindex( - index=bagel["session"].unique(), level="session" + index=bagel[PRIMARY_SESSION].unique(), level=PRIMARY_SESSION ) .reindex(col_order, axis=1) # reorder assessments/pipelines if needed .reset_index() @@ -329,7 +342,7 @@ def filter_records( matching_subs = [] for sub_id, sub in data.groupby("participant_id"): if all( - session in sub["session"].unique() + session in sub[PRIMARY_SESSION].unique() for session in session_values ): if all( From 58803614e60f9d88e7157d9bd06b154a8b6df1ab Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 9 Dec 2024 03:37:12 -0500 Subject: [PATCH 06/27] update comment --- digest/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digest/app.py b/digest/app.py index 55c93eb..4c3eb68 100644 --- a/digest/app.py +++ b/digest/app.py @@ -189,7 +189,7 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames): {"type": schema, "data": overview_df.to_dict("records")}, pipelines_dict, None, - "csv", # NOTE: "tsv" does not appear to be an option for export_format + "csv", # NOTE: "tsv" is not an option for export_format ) From 84dda4a78cb8de4f2ecfc0efb4c5c0741914d7b1 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 9 Dec 2024 03:43:32 -0500 Subject: [PATCH 07/27] fix imports --- digest/app.py | 2 +- digest/plotting.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/digest/app.py b/digest/app.py index 4c3eb68..961aa87 100644 --- a/digest/app.py +++ b/digest/app.py @@ -7,11 +7,11 @@ import pandas as pd from dash import ALL, Dash, ctx, dcc, html from dash.dependencies import Input, Output, State -from utility import PRIMARY_SESSION from . import plotting as plot from . import utility as util from .layout import DEFAULT_DATASET_NAME, construct_layout, upload_buttons +from .utility import PRIMARY_SESSION EMPTY_FIGURE_PROPS = {"data": [], "layout": {}, "frames": []} diff --git a/digest/plotting.py b/digest/plotting.py index b355fdd..253fcf4 100644 --- a/digest/plotting.py +++ b/digest/plotting.py @@ -5,9 +5,9 @@ import pandas as pd import plotly.express as px import plotly.graph_objects as go -from utility import PRIMARY_SESSION from . 
import utility as util +from .utility import PRIMARY_SESSION CMAP = px.colors.qualitative.Bold STATUS_COLORS = { From 403ed1fe01631f72f840f1452fe0bcd3a62a8274 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 9 Dec 2024 13:49:41 -0500 Subject: [PATCH 08/27] update references to renamed 'pipeline-complete' column --- digest/app.py | 10 ++++++---- digest/plotting.py | 18 +++++++++--------- digest/utility.py | 1 + 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/digest/app.py b/digest/app.py index 961aa87..bc8d473 100644 --- a/digest/app.py +++ b/digest/app.py @@ -437,8 +437,10 @@ def reset_selections(filename): ) def generate_overview_status_fig_for_participants(parsed_data, session_list): """ - If new dataset uploaded, generate stacked bar plot of pipeline_complete statuses per session, - grouped by pipeline. Provides overview of the number of participants with each status in a given session, + When a new dataset is uploaded, generate stacked bar plots of pipeline statuses per session, + grouped in subplots corresponding to each pipeline. + + Provides overview of the number of participants with each status in a given session, per processing pipeline. """ if parsed_data is not None and parsed_data.get("type") != "phenotypic": @@ -468,7 +470,7 @@ def generate_overview_status_fig_for_participants(parsed_data, session_list): def update_overview_status_fig_for_records(data, pipelines_dict, parsed_data): """ When visible data in the overview datatable is updated (excluding built-in frontend datatable filtering - but including custom component filtering), generate stacked bar plot of pipeline_complete statuses aggregated + but including custom component filtering), generate stacked bar plot of pipeline statuses aggregated by pipeline. Counts of statuses in plot thus correspond to unique records (unique participant-session combinations). 
""" @@ -480,7 +482,7 @@ def update_overview_status_fig_for_records(data, pipelines_dict, parsed_data): if not data_df.empty: status_counts = ( plot.transform_active_data_to_long(data_df) - .groupby(["pipeline_name", "pipeline_complete"]) + .groupby(["pipeline_name", "status"]) .size() .reset_index(name="records") ) diff --git a/digest/plotting.py b/digest/plotting.py index 253fcf4..5b306f7 100644 --- a/digest/plotting.py +++ b/digest/plotting.py @@ -38,7 +38,7 @@ def transform_active_data_to_long(data: pd.DataFrame) -> pd.DataFrame: data, id_vars=util.get_id_columns(data), var_name="pipeline_name", - value_name="pipeline_complete", + value_name="status", ) @@ -61,7 +61,7 @@ def plot_pipeline_status_by_participants( ) -> go.Figure: status_counts = ( transform_active_data_to_long(data) - .groupby(["pipeline_name", "pipeline_complete", PRIMARY_SESSION]) + .groupby(["pipeline_name", "status", PRIMARY_SESSION]) .size() .reset_index(name="participants") ) @@ -70,18 +70,18 @@ def plot_pipeline_status_by_participants( status_counts, x=PRIMARY_SESSION, y="participants", - color="pipeline_complete", + color="status", text_auto=True, facet_col="pipeline_name", category_orders={ - "pipeline_complete": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), + "status": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), PRIMARY_SESSION: session_list, }, color_discrete_map=STATUS_COLORS, labels={ "pipeline_name": "Pipeline", "participants": "Participants (n)", - "pipeline_complete": "Processing status", + "status": "Processing status", PRIMARY_SESSION: "Session", }, title="All participant pipeline statuses by session", @@ -98,10 +98,10 @@ def plot_pipeline_status_by_records(status_counts: pd.DataFrame) -> go.Figure: status_counts, x="pipeline_name", y="records", - color="pipeline_complete", + color="status", text_auto=True, category_orders={ - "pipeline_complete": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), + "status": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), "pipeline_name": status_counts["pipeline_name"] .drop_duplicates() .sort_values(), @@ -110,7 +110,7 @@ def plot_pipeline_status_by_records(status_counts: pd.DataFrame) -> go.Figure: labels={ "pipeline_name": "Pipeline", "records": "Records (n)", - "pipeline_complete": "Processing status", + "status": "Processing status", }, title="Pipeline statuses of records matching filter (default: all)", ) @@ -125,7 +125,7 @@ def populate_empty_records_pipeline_status_plot( """Returns dataframe of counts representing 0 matching records in the datatable, i.e., 0 records with each pipeline status.""" status_counts = pd.DataFrame( list(product(pipelines, statuses)), - columns=["pipeline_name", "pipeline_complete"], + columns=["pipeline_name", "status"], ) status_counts["records"] = 0 diff --git a/digest/utility.py b/digest/utility.py index 88bc18a..13b7cd7 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -223,6 +223,7 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame: # Related issues: # https://github.com/pandas-dev/pandas/issues/21969 # https://github.com/pandas-dev/pandas/issues/17595 + # TODO: Rename variable to reflect renaming of "pipeline_complete" column pipeline_complete_df = bagel.pivot( index=get_id_columns(bagel), columns=get_event_id_columns(bagel, schema), From e7df57dd068cf5d1d95e76c9d014f0500b8bc461 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 11 Dec 2024 16:19:21 -0500 Subject: [PATCH 09/27] remove MissingValue column property from schema --- schemas/bagel_schema.json | 12 ++++-------- 1 file changed, 4 
insertions(+), 8 deletions(-) diff --git a/schemas/bagel_schema.json b/schemas/bagel_schema.json index 4e3dccc..ebe2b44 100644 --- a/schemas/bagel_schema.json +++ b/schemas/bagel_schema.json @@ -23,26 +23,22 @@ "pipeline_name": { "Description": "Name of the pipeline that was run.", "dtype": "str", - "IsRequired": true, - "MissingValue": "UNAVAILABLE" + "IsRequired": true }, "pipeline_version": { "description": "Version of the pipeline that was run. Must have a value if the value for 'pipeline_name' is not 'UNAVAILABLE'.", "dtype": "str", - "IsRequired": true, - "MissingValue": "UNAVAILABLE" + "IsRequired": true }, "pipeline_starttime": { "Description": "Date/time that pipeline run was started. In format of 'YYYY-MM-DD HH:MM:SS'.", "dtype": "str", - "IsRequired": false, - "MissingValue": "UNAVAILABLE" + "IsRequired": false }, "pipeline_endtime": { "Description": "Date/time that pipeline run ended. In format of 'YYYY-MM-DD HH:MM:SS'.", "dtype": "str", - "IsRequired": false, - "MissingValue": "UNAVAILABLE" + "IsRequired": false } }, "PIPELINE_STATUS_COLUMNS": { From 53047f05e2b5aeb1a238859eded8b15a3b827b68 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 11 Dec 2024 16:54:07 -0500 Subject: [PATCH 10/27] refactor id column extraction --- digest/utility.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/digest/utility.py b/digest/utility.py index 13b7cd7..cf7ebe4 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -38,12 +38,12 @@ / "nipoppy-qpn" / "nipoppy" / "digest" - / "qpn_imaging_availability_digest.csv", + / "qpn_imaging_availability_digest.tsv", "phenotypic": Path(__file__).absolute().parents[2] / "nipoppy-qpn" / "nipoppy" / "digest" - / "qpn_tabular_availability_digest.csv", + / "qpn_tabular_availability_digest.tsv", } } @@ -173,17 +173,20 @@ def extract_pipelines(bagel: pd.DataFrame, schema: str) -> dict: return pipelines_dict -# TODO: Revisit if we need to consider all the ID columns here, and if the order matters def get_id_columns(data: pd.DataFrame) -> list: - """Returns names of columns which identify a given participant record""" - id_columns = ["participant_id", "session_id"] - - if "bids_participant_id" in data.columns: - id_columns.append("bids_participant_id") - if "bids_session_id" in data.columns: - id_columns.append("bids_session_id") + """Returns names of columns found in the uploaded data which identify a given participant record.""" + reference_id_cols = [ + "participant_id", + "bids_participant_id", + "session_id", + "bids_session_id", + ] + # Preserve order of appearance in the original tabular data + recognized_id_cols = [ + col for col in data.columns if col in reference_id_cols + ] - return id_columns + return recognized_id_cols def get_duplicate_entries(data: pd.DataFrame, subset: list) -> pd.DataFrame: From 4745d7f8de8ae998198a96ab9bbbb5fca74c8b56 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 11 Dec 2024 18:01:12 -0500 Subject: [PATCH 11/27] update docstrings --- digest/utility.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/digest/utility.py b/digest/utility.py index cf7ebe4..bc74beb 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -138,12 +138,11 @@ def get_event_id_columns( return None -# TODO: Generalize function name to include both assessments and pipelines (e.g., extract_measures or extract_modules)? +# TODO: Generalize function and variable names to include both assessments and pipelines (e.g., extract_events?) 
def extract_pipelines(bagel: pd.DataFrame, schema: str) -> dict: - """Get data for each unique pipeline in the aggregate input as an individual labelled dataframe.""" + """Get rows corresponding to each unique data "event" (i.e., pipeline or assessment) in the input file as an individual labelled dataframe.""" pipelines_dict = {} - - # TODO: Possibly temporary fix - to avoid related assessment columns from being out of order + # Avoid related assessment columns from being out of order sort = bool(schema == "imaging") groupby = get_event_id_columns(bagel, schema) @@ -213,7 +212,7 @@ def count_unique_records(data: pd.DataFrame) -> int: return 0 -# TODO: Generalize function name to include both assessments and pipelines (e.g., extract_measures or extract_modules)? +# TODO: Generalize function and variable names to include both assessments and pipelines def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame: """ Constructs a wide format dataframe from the long format input file, From 07c11d37bcba0897da8676a9e0f0fb2197fbc671 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Thu, 12 Dec 2024 20:27:11 -0500 Subject: [PATCH 12/27] update README --- README.md | 57 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index c844e66..5b867da 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,47 @@ -# Dashboard for neuroimaging and phenotypic dataset exploration +# Descriptive & neuroImaging data Graphical Explorer for Subject Tracking -- [Overview](#overview) +- [Overview](#overview) - [Preview](#preview) +- [Quickstart](#quickstart) - [Input schema](#input-schema) -- [Creating a dashboard-ready dataset file](#creating-a-dataset-file-for-the-dashboard-bagelcsv) +- [Creating a dashboard-ready "digest" file](#creating-a-dashboard-ready-digest-file) +- [Running in a Docker container](#running-in-a-docker-container) - [Local development](#local-development) ## Overview -`digest` is a web dashboard that provides interactive visual summaries and subject-level querying based on neuroimaging derivatives and phenotypic variables available for a dataset. +`digest` is a web dashboard for exploring subject-level availability of pipeline derivatives and phenotypic variables in a neuroimaging dataset. +It provides user-friendly options for querying data availability, along with interactive visual summaries. -A `digest` dashboard can be generated for any tabular dataset file that follows a data modality-specific [schema](/schemas/), which we refer to as a "bagel" file. -The dashboard is compatible with the processing status `bagel.csv` files automatically generated by the [Nipoppy framework for neuroimaging dataset organization and processing](https://github.com/neurodatascience/nipoppy). - -For more information on how to use `digest` with the Nipoppy project, also see the official [Nipoppy documentation](https://neurobagel.org/nipoppy/overview/). - -**Quickstart**: https://digest.neurobagel.org/ +`digest` supports any tabular dataset file that follows a data modality-specific [schema](/schemas/), referred to here as a "digest" file. +`digest` is also compatible with the processing status files generated by [Nipoppy](https://nipoppy.readthedocs.io/en/stable/), a framework for organization and processing of neuroimaging-clinical datasets. 
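[Editor's note: for orientation, here is a minimal sketch of what an imaging digest TSV looks like and how it can be loaded. The rows are invented for illustration; the column names, the `default` step label, and the status values are taken from the schema and test fixtures changed in this PR.]

```python
# Sketch only (not part of this PR's changes): a minimal imaging digest
# using the column names and status values from the updated schema.
import io

import pandas as pd

digest_tsv = (
    "participant_id\tsession_id\tpipeline_name\tpipeline_version\tpipeline_step\tstatus\n"
    "sub-01\t1\tfreesurfer\t6.0.1\tdefault\tSUCCESS\n"
    "sub-01\t2\tfreesurfer\t6.0.1\tdefault\tFAIL\n"
)

# Digest files are TSVs, so pandas needs an explicit sep="\t"
# (pandas has no read_tsv function, hence pd.read_csv with a tab separator)
bagel = pd.read_csv(io.StringIO(digest_tsv), sep="\t")
print(bagel)
```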
## Preview ![alt text](img/ui_overview_table.png?raw=true) ![alt text](img/ui_overview_plots.png?raw=true) +## Quickstart +`digest` is publicly available at https://digest.neurobagel.org/! + +You can find correctly formatted example input files [here](/example_bagels/) to test out dashboard functionality. + ## Input schema -Input files to the dashboard contain long format data that must be formatted according to the [bagel schema](/schemas/) (see also the schemas [README](https://github.com/neurobagel/digest/tree/main/schemas#readme) for more info). A single file is expected to correspond to one dataset, but may contain status information for multiple processing pipelines for that dataset. - -### Try it out -You can view and download correctly formatted, minimal input tabular files from [here](/example_bagels/) to test out dashboard functionality. - -## Creating a dashboard-ready dataset file (`bagel.csv`) -While `digest` works on any input CSV compliant with a [bagel schema](/schemas/), the easiest way to generate a dashboard-ready file for a dataset's neuroimaging processing info is to follow the [Nipoppy](https://neurobagel.org/nipoppy/overview/) standard structure for organizing raw MRI data and processed outputs (data derivatives). -`Nipoppy` offers scripts that can use this standardized dataset organization to automatically extract info about the raw imaging files and any pipelines that have been run, which is then stored in a dashboard-ready `bagel.csv`. - -Detailed instructions to get started using `Nipoppy` can be found in their [documentation](https://neurobagel.org/nipoppy/overview/). -In brief, generating a `bagel.csv` for your dataset can be as simple as: -1. Installing `Nipoppy` to generate a dataset directory tree for your dataset (see [Installation](https://neurobagel.org/nipoppy/installation/) section of docs) that you can populate with your existing data -2. Update `Nipoppy` configuration to reflect the pipeline versions you are using (for tracking purposes), and augment your participant spreadsheet according to `Nipoppy` requirements (see [Configs](https://neurobagel.org/nipoppy/configs/) section of docs) -3. Run the tracker ([run_tracker.py](https://github.com/neurodatascience/nipoppy/blob/main/trackers/run_tracker.py)) for the relevant pipeline(s) for your dataset to generate a comprehensive `bagel.csv` - - To see help text for this script: `python run_tracker.py --help` - - This step can be repeated as needed to update the `bagel.csv` with newly processed subjects +`digest` supports long format TSVs that contain the columns specified in the [bagel schema](/schemas/) (see also the schemas [README](https://github.com/neurobagel/digest/tree/main/schemas#readme) for more info). + +At the moment, each digest file is expected to correspond to one dataset. + +## Creating a dashboard-ready "digest" file +While `digest` can be used with any TSV compliant with one of the [digest schemas](/schemas/), the easiest way to obtain dashboard-ready files for pipeline derivative availability is to use the [Nipoppy](https://neurobagel.org/nipoppy/overview/) specification for your neuroimaging dataset organization. +Nipoppy provides dataset [trackers](https://nipoppy.readthedocs.io/en/stable/user_guide/tracking.html) that can automatically extract subjects' imaging data and pipeline output availability, producing processing status files that are directly `digest` compatible. 
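[Editor's note: as a rough sketch of what "digest compatible" means mechanically, the dashboard validates uploads against the required columns declared in the JSON schema (compare `get_required_bagel_columns` / `get_schema_validation_errors` in `digest/utility.py`). The snippet below is an approximation of that check, not the app's exact code, and the file paths are illustrative assumptions.]

```python
# Approximate re-implementation of the dashboard's required-column check.
# The schema path and the input TSV path below are placeholder assumptions.
import json

import pandas as pd

with open("schemas/bagel_schema.json") as f:
    schema = json.load(f)

# Each schema maps column categories (e.g., "GLOBAL_COLUMNS") to column
# definitions, and each definition carries an "IsRequired" property.
required_cols = {
    column
    for category in schema.values()
    for column, props in category.items()
    if props.get("IsRequired")
}

bagel = pd.read_csv("proc_status.tsv", sep="\t")
missing = sorted(required_cols - set(bagel.columns))
if missing:
    print(f"Missing required columns: {missing}")
```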
+
+For detailed instructions to get started using Nipoppy, see the [documentation](https://nipoppy.readthedocs.io/en/stable/).
+In brief, the (mostly automated!) Nipoppy steps to generate a processing status file can be as simple as:
+1. Initializing an empty, Nipoppy-compliant dataset directory tree for your dataset
+2. Updating your Nipoppy configuration with the pipeline versions you are using, and creating a manifest spreadsheet of all available participants and sessions
+3. Populating the directory tree with any existing data and pipeline outputs *
+4. Running the tracker for the relevant pipeline(s) for your dataset to generate a processing status file
+   - This step can be repeated as needed to update the file with newly processed subjects
+
+*Nipoppy also provides a protocol for running processing pipelines from raw imaging data.

 ## Running in a Docker container

From e53c76d7b5d08ffb68a3f326f960c79500320628 Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Thu, 12 Dec 2024 20:34:16 -0500
Subject: [PATCH 13/27] update README

---
 README.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 5b867da..f4a22b9 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 `digest` is a web dashboard for exploring subject-level availability of pipeline derivatives and phenotypic variables in a neuroimaging dataset.
 It provides user-friendly options for querying data availability, along with interactive visual summaries.

-`digest` supports any tabular dataset file that follows a data modality-specific [schema](/schemas/), referred to here as a "digest" file.
+`digest` supports any dataset TSV file that follows a data modality-specific [schema](/schemas/) (called a "digest" file).
 `digest` is also compatible with the processing status files generated by [Nipoppy](https://nipoppy.readthedocs.io/en/stable/), a framework for organization and processing of neuroimaging-clinical datasets.

 ## Preview
 ![alt text](img/ui_overview_table.png?raw=true)
 ![alt text](img/ui_overview_plots.png?raw=true)

@@ -25,23 +25,22 @@ It provides user-friendly options for querying data availability, along with int
 ## Quickstart
 `digest` is publicly available at https://digest.neurobagel.org/!

 You can find correctly formatted example input files [here](/example_bagels/) to test out dashboard functionality.

 ## Input schema
-`digest` supports long format TSVs that contain the columns specified in the [bagel schema](/schemas/) (see also the schemas [README](https://github.com/neurobagel/digest/tree/main/schemas#readme) for more info).
-
+`digest` supports long format TSVs that contain the columns specified in the [digest schema](/schemas/) (see also the schema [README](https://github.com/neurobagel/digest/tree/main/schemas#readme)).
 At the moment, each digest file is expected to correspond to one dataset.

 ## Creating a dashboard-ready "digest" file
-While `digest` can be used with any TSV compliant with one of the [digest schemas](/schemas/), the easiest way to obtain dashboard-ready files for pipeline derivative availability is to use the [Nipoppy](https://neurobagel.org/nipoppy/overview/) specification for your neuroimaging dataset organization.
-Nipoppy provides dataset [trackers](https://nipoppy.readthedocs.io/en/stable/user_guide/tracking.html) that can automatically extract subjects' imaging data and pipeline output availability, producing processing status files that are directly `digest` compatible. 
+While `digest` accepts any TSV compliant with one of the [digest schemas](/schemas/), the easiest way to obtain dashboard-ready files for pipeline derivative availability is to use the [Nipoppy](https://neurobagel.org/nipoppy/overview/) specification for your neuroimaging dataset organization.
+Nipoppy provides dataset [trackers](https://nipoppy.readthedocs.io/en/stable/user_guide/tracking.html) that can automatically extract subjects' imaging data and pipeline output availability, producing `digest`-compatible processing status files.

 For detailed instructions to get started using Nipoppy, see the [documentation](https://nipoppy.readthedocs.io/en/stable/).
+
 In brief, the (mostly automated!) Nipoppy steps to generate a processing status file can be as simple as:
 1. Initializing an empty, Nipoppy-compliant dataset directory tree for your dataset
 2. Updating your Nipoppy configuration with the pipeline versions you are using, and creating a manifest spreadsheet of all available participants and sessions
-3. Populating the directory tree with any existing data and pipeline outputs *
-4. Running the tracker for the relevant pipeline(s) for your dataset to generate a processing status file
-   - This step can be repeated as needed to update the file with newly processed subjects
+3. Populating the directory tree with any existing data and pipeline outputs _*_
+4. Running the tracker for the relevant pipeline(s) to generate a processing status file

-*Nipoppy also provides a protocol for running processing pipelines from raw imaging data.
+_*Nipoppy also provides a protocol for running processing pipelines from raw imaging data._

From b2dde0e1052013723d7a8d0268c86f91d6c7e7d0 Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Thu, 12 Dec 2024 20:37:26 -0500
Subject: [PATCH 14/27] update README

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f4a22b9..cab6dae 100644
--- a/README.md
+++ b/README.md
@@ -13,23 +13,23 @@
 It provides user-friendly options for querying data availability, along with interactive visual summaries.

 `digest` supports any dataset TSV file that follows a data modality-specific [schema](/schemas/) (called a "digest" file).
-`digest` is also compatible with the processing status files generated by [Nipoppy](https://nipoppy.readthedocs.io/en/stable/), a framework for organization and processing of neuroimaging-clinical datasets.
+`digest` is also compatible with the processing status files generated by [Nipoppy](https://nipoppy.readthedocs.io/en/stable/).

 ## Preview
 ![alt text](img/ui_overview_table.png?raw=true)
 ![alt text](img/ui_overview_plots.png?raw=true)

 ## Quickstart
-`digest` is publicly available at https://digest.neurobagel.org/!
+Try out `digest` at https://digest.neurobagel.org/!

 You can find correctly formatted example input files [here](/example_bagels/) to test out dashboard functionality.

 ## Input schema
-`digest` supports long format TSVs that contain the columns specified in the [digest schema](/schemas/) (see also the schema [README](https://github.com/neurobagel/digest/tree/main/schemas#readme)).
+`digest` supports long format TSVs that contain the columns specified in the [digest schemas](/schemas/) (see also the schema [README](https://github.com/neurobagel/digest/tree/main/schemas#readme)).
 At the moment, each digest file is expected to correspond to one dataset. 
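[Editor's note: for context on what counts as a "record" in a digest, here is a small sketch following `count_unique_records` in `digest/utility.py`; the input path is a placeholder.]

```python
# Mirrors the record-counting convention in digest/utility.py: one "record"
# is a unique (participant_id, session_id) pair. "digest.tsv" is a placeholder.
import pandas as pd

bagel = pd.read_csv("digest.tsv", sep="\t")
n_subs = bagel["participant_id"].nunique()
n_records = bagel[["participant_id", "session_id"]].drop_duplicates().shape[0]
print(f"{n_subs} participants across {n_records} participant-session records")
```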
 ## Creating a dashboard-ready "digest" file
-While `digest` accepts any TSV compliant with one of the [digest schemas](/schemas/), the easiest way to obtain dashboard-ready files for pipeline derivative availability is to use the [Nipoppy](https://neurobagel.org/nipoppy/overview/) specification for your neuroimaging dataset organization.
+While `digest` accepts any TSV compliant with one of the [digest schemas](/schemas/), the easiest way to obtain dashboard-ready files for pipeline derivative availability is to use the [Nipoppy](https://neurobagel.org/nipoppy/overview/) specification for organizing your neuroimaging dataset.
 Nipoppy provides dataset [trackers](https://nipoppy.readthedocs.io/en/stable/user_guide/tracking.html) that can automatically extract subjects' imaging data and pipeline output availability, producing `digest`-compatible processing status files.

From 99cfdf5a22f5223be38efb9bb335135900be3be1 Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Thu, 12 Dec 2024 21:30:15 -0500
Subject: [PATCH 15/27] update schema README

---
 schemas/README.md | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/schemas/README.md b/schemas/README.md
index 74d2aa9..05ff1a9 100644
--- a/schemas/README.md
+++ b/schemas/README.md
@@ -1,28 +1,24 @@
-# Schema for inputs to the dashboard
+# Schema for `digest` input files

 ## Overview
-The set of recognized and expected columns within a tabular input file for the dashboard (referred to generically as a `bagel.csv`) is specified in `.json` files called bagel schemas.
-Different schemas correspond to `bagel.csv` files containing different types of data (e.g. imaging vs. phenotypic).
+`digest` supports TSV files as inputs.
+Expected columns in a digest file are defined in JSON files called digest schemas.
+Different schemas correspond to digest files containing different modalities of data (e.g. imaging vs. phenotypic).

-| Schema                    | (Meta)data type of corresponding CSV                                |
-| ------------------------- | ------------------------------------------------------------------ |
-| `bagel_schema.json`       | Metadata for raw neuroimaging data and processing pipeline outputs |
-| `bagel_schema_pheno.json` | Demographic and phenotypic assessment data                         |
+| Schema | Modality of data in corresponding digest file |
+| ----- | ----- |
+| `digest_schema_derivatives.json` | Processing pipeline output and derivative availability |
+| `digest_schema_phenotypic.json` | Demographic and phenotypic assessment data |

-**NOTE:**
-Within a `bagel.csv`, each row is expected to correspond to a single subject session (i.e., a unique pairing of `participant_id` and `session_id`),
-and is referred to as a "record."

 ## How to read the schema
 ### Column categories
-Within a schema, columns are organized into two categories to simplify the process of automated `bagel.csv` generation
-(these categories are not present in an actual input file):
+Within a schema, columns are grouped into two semantic categories. These categories are purely for organizational purposes and do not appear in a digest file.
+
+**Global columns:** Columns describing basic metadata that should have the same meaning regardless of the specific event described by a given record
+(e.g., a certain processing pipeline or phenotypic assessment), and does not depend on event outputs. 
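[Editor's note: to show how a digest's identifier columns and pipeline-describing columns are consumed downstream, here is a sketch, with invented rows, of the long-to-wide pivot the dashboard performs when building its overview table (compare `get_pipelines_overview` in `digest/utility.py`).]

```python
# Sketch of the long-to-wide pivot done on an imaging digest: the ID columns
# index the rows, while pipeline_name/pipeline_version/pipeline_step become
# the new column labels and "status" supplies the cell values.
import pandas as pd

long_df = pd.DataFrame(
    {
        "participant_id": ["sub-01", "sub-01"],
        "session_id": ["1", "1"],
        "pipeline_name": ["freesurfer", "fmriprep"],
        "pipeline_version": ["7.3.2", "20.2.7"],
        "pipeline_step": ["default", "default"],
        "status": ["SUCCESS", "FAIL"],
    }
)

wide_df = long_df.pivot(
    index=["participant_id", "session_id"],
    columns=["pipeline_name", "pipeline_version", "pipeline_step"],
    values="status",
).reset_index()
print(wide_df)
```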
-**Global columns:** Includes columns describing basic metadata that should have the same meaning regardless of the specific task described by a given record -(e.g., a certain processing pipeline or phenotypic assessment), -and does not depend on task outputs. -**Task-specific columns:** Includes columns whose values may have task-specific meanings. -e.g., in the schema for an imaging bagel, the `"PIPELINE_STATUS_COLUMNS"` convey info about processing pipeline completion that depends on pipeline-specific outputs and may vary depending on the pipeline tracker used. +**Event-specific columns:** Includes columns whose values may have event-specific meanings. +e.g., in the schema for an imaging digest, the `"PIPELINE_STATUS_COLUMNS"` convey info about processing pipeline completion that depends on pipeline-specific outputs and may have varying values depending on the pipeline tracker used. ### Column attributes Recognized columns are individually described in the schema using the following attributes: @@ -31,15 +27,8 @@ Recognized columns are individually described in the schema using the following - `dtype` - The expected type for values in the column. - `IsRequired` - - Whether or not the column is required to be present in `bagel.csv`. + - Whether or not the column is required to be present in the digest file. If `true`, the dashboard will throw an error if the column is missing. - `Range` - Acceptable values for data in the column. Only present for categorical columns. -- `MissingValue` - - The value expected to denote that data is not available for the column. - Only present for non-categorical columns. -- `IsPrefixedColumn` - - Whether or not this column will be recognized by a prefix in the column name. - If `true`, columns with this prefix are expected to have unique user-defined informative suffixes in line with the `Description`. - More than one column can have the same prefix. From b25d14edd1302ed6c4f3c5d123ba78390ca9fede Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Thu, 12 Dec 2024 21:34:08 -0500 Subject: [PATCH 16/27] update schema README --- schemas/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/schemas/README.md b/schemas/README.md index 05ff1a9..9915cd7 100644 --- a/schemas/README.md +++ b/schemas/README.md @@ -1,9 +1,9 @@ -# Schema for `digest` input files +# Schemas for `digest` input files ## Overview `digest` supports TSV files as inputs. Expected columns in a digest file are defined in JSON files called digest schemas. -Different schemas correspond to digest files containing different modalities of data (e.g. imaging vs. phenotypic). +There are different schemas for digest files containing different modalities of data (e.g. imaging vs. phenotypic). | Schema | Modality of data in corresponding digest file | | ----- | ----- | @@ -15,13 +15,13 @@ Different schemas correspond to digest files containing different modalities of Within a schema, columns are grouped into two semantic categories. These categories are purely for organizational purposes and do not appear in an digest file. **Global columns:** Columns describing basic metadata that should have the same meaning regardless of the specific event described by a given record -(e.g., a certain processing pipeline or phenotypic assessment), and does not depend on event outputs. +(e.g., a certain processing pipeline or phenotypic assessment), and do not depend on event outputs. -**Event-specific columns:** Includes columns whose values may have event-specific meanings. 
+**Event-specific columns:** Columns whose values may have event-specific meanings. e.g., in the schema for an imaging digest, the `"PIPELINE_STATUS_COLUMNS"` convey info about processing pipeline completion that depends on pipeline-specific outputs and may have varying values depending on the pipeline tracker used. ### Column attributes -Recognized columns are individually described in the schema using the following attributes: +Recognized columns are described in the schema using the following attributes: - `Description` - Describes the column contents, along with the meaning of different acceptable values for categorical columns. - `dtype` From 902947a30dcbd08d173d241ae510fab8c9a9320f Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Thu, 12 Dec 2024 22:10:13 -0500 Subject: [PATCH 17/27] rename schemas --- digest/layout.py | 2 +- digest/utility.py | 4 ++-- schemas/README.md | 4 ++-- schemas/{bagel_schema.json => imaging_digest_schema.json} | 0 ...{bagel_schema_pheno.json => phenotypic_digest_schema.json} | 0 5 files changed, 5 insertions(+), 5 deletions(-) rename schemas/{bagel_schema.json => imaging_digest_schema.json} (100%) rename schemas/{bagel_schema_pheno.json => phenotypic_digest_schema.json} (100%) diff --git a/digest/layout.py b/digest/layout.py index a950d7c..3b3a145 100644 --- a/digest/layout.py +++ b/digest/layout.py @@ -266,7 +266,7 @@ def status_legend_card(): "These are the recommended status definitions for processing progress. For more details, see the ", html.A( "schema for an imaging digest file", - href="https://github.com/neurobagel/digest/blob/main/schemas/bagel_schema.json", + href="https://github.com/neurobagel/digest/blob/main/schemas/imaging_digest_schema.json", target="_blank", ), ], diff --git a/digest/utility.py b/digest/utility.py index bc74beb..65ee5f8 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -10,11 +10,11 @@ SCHEMAS_PATH = Path(__file__).absolute().parents[1] / "schemas" BAGEL_CONFIG = { "imaging": { - "schema_file": "bagel_schema.json", + "schema_file": "imaging_digest_schema.json", "overview_col": "status", }, "phenotypic": { - "schema_file": "bagel_schema_pheno.json", + "schema_file": "phenotypic_digest_schema.json", "overview_col": "assessment_score", }, } diff --git a/schemas/README.md b/schemas/README.md index 9915cd7..9775291 100644 --- a/schemas/README.md +++ b/schemas/README.md @@ -7,8 +7,8 @@ There are different schemas for digest files containing different modalities of | Schema | Modality of data in corresponding digest file | | ----- | ----- | -| `digest_schema_derivatives.json` | Processing pipeline output and derivative availability | -| `digest_schema_phenotypic.json` | Demographic and phenotypic assessment data | +| `imaging_digest_schema.json` | Processing pipeline output and derivative availability | +| `phenotypic_digest_schema.json` | Demographic and phenotypic assessment data | ## How to read the schema ### Column categories diff --git a/schemas/bagel_schema.json b/schemas/imaging_digest_schema.json similarity index 100% rename from schemas/bagel_schema.json rename to schemas/imaging_digest_schema.json diff --git a/schemas/bagel_schema_pheno.json b/schemas/phenotypic_digest_schema.json similarity index 100% rename from schemas/bagel_schema_pheno.json rename to schemas/phenotypic_digest_schema.json From 1f72a13613a10ef2dad7c78348ca93d4d9dd4b1a Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 01:32:36 -0500 Subject: [PATCH 18/27] fix sneaky outdated session column reference --- digest/utility.py | 8 
+++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/digest/utility.py b/digest/utility.py index 65ee5f8..ec32964 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -25,6 +25,7 @@ "INCOMPLETE": "Pipeline has not yet been run (output directory not available).", "UNAVAILABLE": "Relevant MRI modality for pipeline not available.", } +# Column to use as the primary session identifier in the data PRIMARY_SESSION = "session_id" # TODO: @@ -351,17 +352,18 @@ def filter_records( if all( not sub.query( " and ".join( - [f"session == '{session}'"] + pipeline_queries + [f"{PRIMARY_SESSION} == '{session}'"] + + pipeline_queries ) ).empty for session in session_values ): matching_subs.append(sub_id) - query = f"participant_id in {matching_subs} and session in {session_values}" + query = f"participant_id in {matching_subs} and {PRIMARY_SESSION} in {session_values}" else: if operator_value == "OR": query = " and ".join( - [f"session in {session_values}"] + pipeline_queries + [f"{PRIMARY_SESSION} in {session_values}"] + pipeline_queries ) data = data.query(query) From b2dde0e1052013723d7a8d0268c86f91d6c7e7d0 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 01:37:45 -0500 Subject: [PATCH 19/27] update columns in test data digests and convert to TSVs --- tests/data/example_diff_sessions_bagel.csv | 5 ----- tests/data/example_diff_sessions_bagel.tsv | 5 +++++ tests/data/example_mismatch-subs_bagel.csv | 17 ----------------- tests/data/example_mismatch-subs_bagel.tsv | 17 +++++++++++++++++ tests/data/example_missing-col_bagel.csv | 19 ------------------- tests/data/example_missing-col_bagel.tsv | 19 +++++++++++++++++++ 6 files changed, 41 insertions(+), 41 deletions(-) delete mode 100644 tests/data/example_diff_sessions_bagel.csv create mode 100644 tests/data/example_diff_sessions_bagel.tsv delete mode 100644 tests/data/example_mismatch-subs_bagel.csv create mode 100644 tests/data/example_mismatch-subs_bagel.tsv delete mode 100644 tests/data/example_missing-col_bagel.csv create mode 100644 tests/data/example_missing-col_bagel.tsv diff --git a/tests/data/example_diff_sessions_bagel.csv b/tests/data/example_diff_sessions_bagel.csv deleted file mode 100644 index 381e60b..0000000 --- a/tests/data/example_diff_sessions_bagel.csv +++ /dev/null @@ -1,5 +0,0 @@ -participant_id,session,has_mri_data,pipeline_name,pipeline_version,pipeline_starttime,pipeline_complete -sub-01,1,True,freesurfer,6.0.1,2022-05-24 13:43,SUCCESS -sub-01,2,True,freesurfer,6.0.1,2022-05-24 13:46,SUCCESS -sub-02,1,True,freesurfer,6.0.1,2022-05-24 14:01,SUCCESS -sub-02,3,True,freesurfer,6.0.1,2022-05-24 16:27,SUCCESS \ No newline at end of file diff --git a/tests/data/example_diff_sessions_bagel.tsv b/tests/data/example_diff_sessions_bagel.tsv new file mode 100644 index 0000000..00daa7b --- /dev/null +++ b/tests/data/example_diff_sessions_bagel.tsv @@ -0,0 +1,5 @@ +participant_id session_id pipeline_name pipeline_version pipeline_starttime pipeline_step status +sub-01 1 freesurfer 6.0.1 2022-05-24 13:43 default SUCCESS +sub-01 2 freesurfer 6.0.1 2022-05-24 13:46 default SUCCESS +sub-02 1 freesurfer 6.0.1 2022-05-24 14:01 default SUCCESS +sub-02 3 freesurfer 6.0.1 2022-05-24 16:27 default SUCCESS diff --git a/tests/data/example_mismatch-subs_bagel.csv b/tests/data/example_mismatch-subs_bagel.csv deleted file mode 100644 index 9f79f33..0000000 --- a/tests/data/example_mismatch-subs_bagel.csv +++ /dev/null @@ -1,17 +0,0 @@ 
-participant_id,session,has_mri_data,pipeline_name,pipeline_version,pipeline_starttime,pipeline_complete -sub-01,1,TRUE,freesurfer,6.0.1,2022-05-24 13:43,SUCCESS -sub-01,2,TRUE,freesurfer,6.0.1,2022-05-24 13:46,SUCCESS -sub-02,1,TRUE,freesurfer,6.0.1,2022-05-24 14:01,SUCCESS -sub-02,2,TRUE,freesurfer,6.0.1,2022-05-24 16:27,SUCCESS -sub-01,1,TRUE,freesurfer,7.3.2,2022-09-24 13:43,SUCCESS -sub-01,2,TRUE,freesurfer,7.3.2,UNAVAILABLE,UNAVAILABLE -sub-02,1,TRUE,freesurfer,7.3.2,2022-09-24 14:01,SUCCESS -sub-02,2,TRUE,freesurfer,7.3.2,UNAVAILABLE,UNAVAILABLE -sub-03,1,TRUE,freesurfer,7.3.2,2022-09-24 17:07,SUCCESS -sub-03,2,TRUE,freesurfer,7.3.2,UNAVAILABLE,UNAVAILABLE -sub-01,1,FALSE,fmriprep,20.2.7,UNAVAILABLE,UNAVAILABLE -sub-01,2,FALSE,fmriprep,20.2.7,UNAVAILABLE,UNAVAILABLE -sub-02,1,TRUE,fmriprep,20.2.7,2022-05-24 16:26,SUCCESS -sub-02,2,TRUE,fmriprep,20.2.7,2022-05-24 16:26,SUCCESS -sub-03,1,TRUE,fmriprep,20.2.7,2022-05-24 16:26,SUCCESS -sub-03,2,TRUE,fmriprep,20.2.7,2022-05-24 16:33,SUCCESS diff --git a/tests/data/example_mismatch-subs_bagel.tsv b/tests/data/example_mismatch-subs_bagel.tsv new file mode 100644 index 0000000..d86e321 --- /dev/null +++ b/tests/data/example_mismatch-subs_bagel.tsv @@ -0,0 +1,17 @@ +participant_id session_id pipeline_name pipeline_version pipeline_starttime pipeline_step status +sub-01 1 freesurfer 6.0.1 2022-05-24 13:43 default SUCCESS +sub-01 2 freesurfer 6.0.1 2022-05-24 13:46 default SUCCESS +sub-02 1 freesurfer 6.0.1 2022-05-24 14:01 default SUCCESS +sub-02 2 freesurfer 6.0.1 2022-05-24 16:27 default SUCCESS +sub-01 1 freesurfer 7.3.2 2022-09-24 13:43 default SUCCESS +sub-01 2 freesurfer 7.3.2 default UNAVAILABLE +sub-02 1 freesurfer 7.3.2 2022-09-24 14:01 default SUCCESS +sub-02 2 freesurfer 7.3.2 default UNAVAILABLE +sub-03 1 freesurfer 7.3.2 2022-09-24 17:07 default SUCCESS +sub-03 2 freesurfer 7.3.2 default UNAVAILABLE +sub-01 1 fmriprep 20.2.7 default UNAVAILABLE +sub-01 2 fmriprep 20.2.7 default UNAVAILABLE +sub-02 1 fmriprep 20.2.7 2022-05-24 16:26 default SUCCESS +sub-02 2 fmriprep 20.2.7 2022-05-24 16:26 default SUCCESS +sub-03 1 fmriprep 20.2.7 2022-05-24 16:26 default SUCCESS +sub-03 2 fmriprep 20.2.7 2022-05-24 16:33 default SUCCESS diff --git a/tests/data/example_missing-col_bagel.csv b/tests/data/example_missing-col_bagel.csv deleted file mode 100644 index 77a8f11..0000000 --- a/tests/data/example_missing-col_bagel.csv +++ /dev/null @@ -1,19 +0,0 @@ -participant_id,session,has_mri_data,pipeline_name,pipeline_version,pipeline_complete -sub-01,1,TRUE,freesurfer,6.0.1,SUCCESS -sub-01,2,TRUE,freesurfer,6.0.1,SUCCESS -sub-02,1,TRUE,freesurfer,6.0.1,SUCCESS -sub-02,2,TRUE,freesurfer,6.0.1,SUCCESS -sub-03,1,TRUE,freesurfer,6.0.1,FAIL -sub-03,2,TRUE,freesurfer,6.0.1,FAIL -sub-01,1,TRUE,freesurfer,7.3.2,SUCCESS -sub-01,2,TRUE,freesurfer,7.3.2,UNAVAILABLE -sub-02,1,TRUE,freesurfer,7.3.2,SUCCESS -sub-02,2,TRUE,freesurfer,7.3.2,UNAVAILABLE -sub-03,1,TRUE,freesurfer,7.3.2,SUCCESS -sub-03,2,TRUE,freesurfer,7.3.2,UNAVAILABLE -sub-01,1,FALSE,fmriprep,20.2.7,UNAVAILABLE -sub-01,2,FALSE,fmriprep,20.2.7,UNAVAILABLE -sub-02,1,TRUE,fmriprep,20.2.7,SUCCESS -sub-02,2,TRUE,fmriprep,20.2.7,SUCCESS -sub-03,1,TRUE,fmriprep,20.2.7,SUCCESS -sub-03,2,TRUE,fmriprep,20.2.7,SUCCESS diff --git a/tests/data/example_missing-col_bagel.tsv b/tests/data/example_missing-col_bagel.tsv new file mode 100644 index 0000000..178f747 --- /dev/null +++ b/tests/data/example_missing-col_bagel.tsv @@ -0,0 +1,19 @@ +participant_id session_id pipeline_name pipeline_version pipeline_step 
status +sub-01 1 freesurfer 6.0.1 default SUCCESS +sub-01 2 freesurfer 6.0.1 default SUCCESS +sub-02 1 freesurfer 6.0.1 default SUCCESS +sub-02 2 freesurfer 6.0.1 default SUCCESS +sub-03 1 freesurfer 6.0.1 default FAIL +sub-03 2 freesurfer 6.0.1 default FAIL +sub-01 1 freesurfer 7.3.2 default SUCCESS +sub-01 2 freesurfer 7.3.2 default UNAVAILABLE +sub-02 1 freesurfer 7.3.2 default SUCCESS +sub-02 2 freesurfer 7.3.2 default UNAVAILABLE +sub-03 1 freesurfer 7.3.2 default SUCCESS +sub-03 2 freesurfer 7.3.2 default UNAVAILABLE +sub-01 1 fmriprep 20.2.7 default UNAVAILABLE +sub-01 2 fmriprep 20.2.7 default UNAVAILABLE +sub-02 1 fmriprep 20.2.7 default SUCCESS +sub-02 2 fmriprep 20.2.7 default SUCCESS +sub-03 1 fmriprep 20.2.7 default SUCCESS +sub-03 2 fmriprep 20.2.7 default SUCCESS From 037e5dc86c0fdcf9ad4c892451264505dd03356f Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 01:40:23 -0500 Subject: [PATCH 20/27] update missing column example based on revised schema --- tests/data/example_missing-col_bagel.tsv | 38 ++++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/data/example_missing-col_bagel.tsv b/tests/data/example_missing-col_bagel.tsv index 178f747..21cd856 100644 --- a/tests/data/example_missing-col_bagel.tsv +++ b/tests/data/example_missing-col_bagel.tsv @@ -1,19 +1,19 @@ -participant_id session_id pipeline_name pipeline_version pipeline_step status -sub-01 1 freesurfer 6.0.1 default SUCCESS -sub-01 2 freesurfer 6.0.1 default SUCCESS -sub-02 1 freesurfer 6.0.1 default SUCCESS -sub-02 2 freesurfer 6.0.1 default SUCCESS -sub-03 1 freesurfer 6.0.1 default FAIL -sub-03 2 freesurfer 6.0.1 default FAIL -sub-01 1 freesurfer 7.3.2 default SUCCESS -sub-01 2 freesurfer 7.3.2 default UNAVAILABLE -sub-02 1 freesurfer 7.3.2 default SUCCESS -sub-02 2 freesurfer 7.3.2 default UNAVAILABLE -sub-03 1 freesurfer 7.3.2 default SUCCESS -sub-03 2 freesurfer 7.3.2 default UNAVAILABLE -sub-01 1 fmriprep 20.2.7 default UNAVAILABLE -sub-01 2 fmriprep 20.2.7 default UNAVAILABLE -sub-02 1 fmriprep 20.2.7 default SUCCESS -sub-02 2 fmriprep 20.2.7 default SUCCESS -sub-03 1 fmriprep 20.2.7 default SUCCESS -sub-03 2 fmriprep 20.2.7 default SUCCESS +participant_id session_id pipeline_name pipeline_version status +sub-01 1 freesurfer 6.0.1 SUCCESS +sub-01 2 freesurfer 6.0.1 SUCCESS +sub-02 1 freesurfer 6.0.1 SUCCESS +sub-02 2 freesurfer 6.0.1 SUCCESS +sub-03 1 freesurfer 6.0.1 FAIL +sub-03 2 freesurfer 6.0.1 FAIL +sub-01 1 freesurfer 7.3.2 SUCCESS +sub-01 2 freesurfer 7.3.2 UNAVAILABLE +sub-02 1 freesurfer 7.3.2 SUCCESS +sub-02 2 freesurfer 7.3.2 UNAVAILABLE +sub-03 1 freesurfer 7.3.2 SUCCESS +sub-03 2 freesurfer 7.3.2 UNAVAILABLE +sub-01 1 fmriprep 20.2.7 UNAVAILABLE +sub-01 2 fmriprep 20.2.7 UNAVAILABLE +sub-02 1 fmriprep 20.2.7 SUCCESS +sub-02 2 fmriprep 20.2.7 SUCCESS +sub-03 1 fmriprep 20.2.7 SUCCESS +sub-03 2 fmriprep 20.2.7 SUCCESS From 745483a875f924198831994328b269dabc3c0072 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 02:18:21 -0500 Subject: [PATCH 21/27] rename test data files --- ...-subs_bagel.tsv => example_imaging_diff-pipeline-subjects.tsv} | 0 ..._diff_sessions_bagel.tsv => example_imaging_diff-sessions.tsv} | 0 ...mple_missing-col_bagel.tsv => example_imaging_missing-col.tsv} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename tests/data/{example_mismatch-subs_bagel.tsv => example_imaging_diff-pipeline-subjects.tsv} (100%) rename tests/data/{example_diff_sessions_bagel.tsv => example_imaging_diff-sessions.tsv} 
(100%) rename tests/data/{example_missing-col_bagel.tsv => example_imaging_missing-col.tsv} (100%) diff --git a/tests/data/example_mismatch-subs_bagel.tsv b/tests/data/example_imaging_diff-pipeline-subjects.tsv similarity index 100% rename from tests/data/example_mismatch-subs_bagel.tsv rename to tests/data/example_imaging_diff-pipeline-subjects.tsv diff --git a/tests/data/example_diff_sessions_bagel.tsv b/tests/data/example_imaging_diff-sessions.tsv similarity index 100% rename from tests/data/example_diff_sessions_bagel.tsv rename to tests/data/example_imaging_diff-sessions.tsv diff --git a/tests/data/example_missing-col_bagel.tsv b/tests/data/example_imaging_missing-col.tsv similarity index 100% rename from tests/data/example_missing-col_bagel.tsv rename to tests/data/example_imaging_missing-col.tsv From a18a2d6d927e6d3f038fde17e5ad12c44c6199d7 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 02:22:30 -0500 Subject: [PATCH 22/27] update docstring --- digest/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digest/app.py b/digest/app.py index bc8d473..1b66022 100644 --- a/digest/app.py +++ b/digest/app.py @@ -1,6 +1,6 @@ """ -Serves Dash app for viewing and filtering participant (meta)data for imaging and phenotypic tasks from a given dataset. -App accepts and parses a user-uploaded bagel.csv file (assumed to be generated by mr_proc) as input. +Serves Dash app for viewing and filtering participant (meta)data for imaging and phenotypic data events from a provided dataset. +App accepts and parses a user-uploaded digest TSV file as input. """ import dash_bootstrap_components as dbc From cab23fb949c1490c522437c60fda5a7a96fac1be Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 12:36:54 -0500 Subject: [PATCH 23/27] rename reference example inputs and update symlinks --- example_bagels/example_imaging_bagel.csv | 22 ---------------------- example_bagels/example_pheno_bagel.csv | 22 ---------------------- example_inputs/example_imaging.tsv | 12 ++++++++++++ example_inputs/example_phenotypic.tsv | 22 ++++++++++++++++++++++ tests/data/example_imaging.tsv | 1 + tests/data/example_imaging_bagel.csv | 1 - tests/data/example_pheno_bagel.csv | 1 - tests/data/example_phenotypic.tsv | 1 + 8 files changed, 36 insertions(+), 46 deletions(-) delete mode 100644 example_bagels/example_imaging_bagel.csv delete mode 100644 example_bagels/example_pheno_bagel.csv create mode 100644 example_inputs/example_imaging.tsv create mode 100644 example_inputs/example_phenotypic.tsv create mode 120000 tests/data/example_imaging.tsv delete mode 120000 tests/data/example_imaging_bagel.csv delete mode 120000 tests/data/example_pheno_bagel.csv create mode 120000 tests/data/example_phenotypic.tsv diff --git a/example_bagels/example_imaging_bagel.csv b/example_bagels/example_imaging_bagel.csv deleted file mode 100644 index 3f77b0e..0000000 --- a/example_bagels/example_imaging_bagel.csv +++ /dev/null @@ -1,22 +0,0 @@ -bids_id,participant_id,session,has_mri_data,pipeline_name,pipeline_version,pipeline_starttime,pipeline_complete -sub-MNI001,MNI001,1,TRUE,freesurfer,6.0.1,2022-05-24 13:43,SUCCESS -sub-MNI001,MNI001,2,TRUE,freesurfer,6.0.1,2022-05-24 13:46,SUCCESS -sub-MNI001,MNI001,3,TRUE,freesurfer,6.0.1,UNAVAILABLE,INCOMPLETE -sub-MNI002,MNI002,1,TRUE,freesurfer,6.0.1,2022-05-24 14:01,SUCCESS -sub-MNI002,MNI002,2,TRUE,freesurfer,6.0.1,2022-05-24 16:27,SUCCESS -sub-MNI003,MNI003,1,TRUE,freesurfer,6.0.1,2022-05-24 17:07,FAIL 
-sub-MNI003,MNI003,2,TRUE,freesurfer,6.0.1,2022-05-24 17:06,FAIL -sub-MNI001,MNI001,1,TRUE,freesurfer,7.3.2,2022-09-24 13:43,SUCCESS -sub-MNI001,MNI001,2,TRUE,freesurfer,7.3.2,UNAVAILABLE,INCOMPLETE -sub-MNI001,MNI001,3,TRUE,freesurfer,7.3.2,UNAVAILABLE,INCOMPLETE -sub-MNI002,MNI002,1,TRUE,freesurfer,7.3.2,2022-09-24 14:01,SUCCESS -sub-MNI002,MNI002,2,TRUE,freesurfer,7.3.2,UNAVAILABLE,INCOMPLETE -sub-MNI003,MNI003,1,TRUE,freesurfer,7.3.2,2022-09-24 17:07,SUCCESS -sub-MNI003,MNI003,2,TRUE,freesurfer,7.3.2,UNAVAILABLE,INCOMPLETE -sub-MNI001,MNI001,1,TRUE,fmriprep,20.2.7,UNAVAILABLE,UNAVAILABLE -sub-MNI001,MNI001,2,TRUE,fmriprep,20.2.7,UNAVAILABLE,UNAVAILABLE -sub-MNI001,MNI001,3,TRUE,fmriprep,20.2.7,UNAVAILABLE,UNAVAILABLE -sub-MNI002,MNI002,1,TRUE,fmriprep,20.2.7,2022-05-24 16:26,SUCCESS -sub-MNI002,MNI002,2,TRUE,fmriprep,20.2.7,2022-05-24 16:26,SUCCESS -sub-MNI003,MNI003,1,TRUE,fmriprep,20.2.7,2022-05-24 16:26,SUCCESS -sub-MNI003,MNI003,2,TRUE,fmriprep,20.2.7,2022-05-24 16:33,SUCCESS diff --git a/example_bagels/example_pheno_bagel.csv b/example_bagels/example_pheno_bagel.csv deleted file mode 100644 index 18b8659..0000000 --- a/example_bagels/example_pheno_bagel.csv +++ /dev/null @@ -1,22 +0,0 @@ -bids_id,participant_id,session,assessment_name,assessment_score -sub-MNI001,MNI001,1,group,Patient -sub-MNI001,MNI001,2,group,Patient -sub-MNI001,MNI001,3,group,Patient -sub-MNI002,MNI002,1,group,Patient -sub-MNI002,MNI002,2,group,Patient -sub-MNI003,MNI003,1,group,Control -sub-MNI003,MNI003,2,group,Control -sub-MNI001,MNI001,1,moca_total,21 -sub-MNI001,MNI001,2,moca_total,21 -sub-MNI001,MNI001,3,moca_total,19 -sub-MNI002,MNI002,1,moca_total,25 -sub-MNI002,MNI002,2,moca_total,24 -sub-MNI003,MNI003,1,moca_total,30 -sub-MNI003,MNI003,2,moca_total,30 -sub-MNI001,MNI001,1,updrs_3_total,30 -sub-MNI001,MNI001,2,updrs_3_total,33 -sub-MNI001,MNI001,3,updrs_3_total,32 -sub-MNI002,MNI002,1,updrs_3_total,25 -sub-MNI002,MNI002,2,updrs_3_total,24 -sub-MNI003,MNI003,1,updrs_3_total,0 -sub-MNI003,MNI003,2,updrs_3_total,1 diff --git a/example_inputs/example_imaging.tsv b/example_inputs/example_imaging.tsv new file mode 100644 index 0000000..ef85532 --- /dev/null +++ b/example_inputs/example_imaging.tsv @@ -0,0 +1,12 @@ +participant_id bids_participant_id session_id bids_session_id pipeline_name pipeline_version pipeline_step status +01 sub-01 01 ses-01 fmriprep 20.2.7 step1 FAIL +01 sub-01 01 ses-01 fmriprep 20.2.7 step2 INCOMPLETE +01 sub-01 01 ses-01 fmriprep 23.1.3 default SUCCESS +01 sub-01 01 ses-01 freesurfer 7.3.2 default SUCCESS +01 sub-01 02 ses-02 fmriprep 20.2.7 step1 SUCCESS +01 sub-01 02 ses-02 fmriprep 20.2.7 step2 SUCCESS +01 sub-01 02 ses-02 fmriprep 23.1.3 default SUCCESS +01 sub-01 02 ses-02 freesurfer 7.3.2 default UNAVAILABLE +02 sub-02 01 ses-01 fmriprep 23.1.3 default SUCCESS +02 sub-02 01 ses-01 freesurfer 7.3.2 default SUCCESS +02 sub-02 02 ses-02 freesurfer 7.3.2 default SUCCESS diff --git a/example_inputs/example_phenotypic.tsv b/example_inputs/example_phenotypic.tsv new file mode 100644 index 0000000..9319ce4 --- /dev/null +++ b/example_inputs/example_phenotypic.tsv @@ -0,0 +1,22 @@ +participant_id bids_participant_id session_id assessment_name assessment_score +MNI003 sub-MNI003 1 group Control +MNI003 sub-MNI003 2 group Control +MNI003 sub-MNI003 1 moca_total 30 +MNI003 sub-MNI003 2 moca_total 30 +MNI003 sub-MNI003 1 updrs_3_total 0 +MNI003 sub-MNI003 2 updrs_3_total 1 +MNI002 sub-MNI002 1 group Patient +MNI002 sub-MNI002 2 group Patient +MNI002 sub-MNI002 1 moca_total 25 +MNI002 
sub-MNI002 2 moca_total 24 +MNI002 sub-MNI002 1 updrs_3_total 25 +MNI002 sub-MNI002 2 updrs_3_total 24 +MNI001 sub-MNI001 1 group Patient +MNI001 sub-MNI001 2 group Patient +MNI001 sub-MNI001 3 group Patient +MNI001 sub-MNI001 1 moca_total 21 +MNI001 sub-MNI001 2 moca_total 21 +MNI001 sub-MNI001 3 moca_total 19 +MNI001 sub-MNI001 1 updrs_3_total 30 +MNI001 sub-MNI001 2 updrs_3_total 33 +MNI001 sub-MNI001 3 updrs_3_total 32 diff --git a/tests/data/example_imaging.tsv b/tests/data/example_imaging.tsv new file mode 120000 index 0000000..e78250b --- /dev/null +++ b/tests/data/example_imaging.tsv @@ -0,0 +1 @@ +../../example_inputs/example_imaging.tsv \ No newline at end of file diff --git a/tests/data/example_imaging_bagel.csv b/tests/data/example_imaging_bagel.csv deleted file mode 120000 index 197a2c0..0000000 --- a/tests/data/example_imaging_bagel.csv +++ /dev/null @@ -1 +0,0 @@ -../../example_bagels/example_imaging_bagel.csv \ No newline at end of file diff --git a/tests/data/example_pheno_bagel.csv b/tests/data/example_pheno_bagel.csv deleted file mode 120000 index 9d0e409..0000000 --- a/tests/data/example_pheno_bagel.csv +++ /dev/null @@ -1 +0,0 @@ -../../example_bagels/example_pheno_bagel.csv \ No newline at end of file diff --git a/tests/data/example_phenotypic.tsv b/tests/data/example_phenotypic.tsv new file mode 120000 index 0000000..3cfa347 --- /dev/null +++ b/tests/data/example_phenotypic.tsv @@ -0,0 +1 @@ +../../example_inputs/example_phenotypic.tsv \ No newline at end of file From 3d492a608e4bd01e6a30f46d3341fb787723ec4b Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 12:40:19 -0500 Subject: [PATCH 24/27] update tests --- tests/test_app.py | 14 ++++---- tests/test_utility.py | 77 +++++++++++++++++++++++-------------------- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/tests/test_app.py b/tests/test_app.py index fe0185f..1835d39 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -18,13 +18,13 @@ def test_server(dash_duo): "valid_bagel,bagel_type,expected_elements,unexpected_elements", [ ( - "example_imaging_bagel.csv", + "example_imaging.tsv", "imaging", ["#fig-pipeline-status-all-ses"], ["#phenotypic-plotting-form"], ), ( - "example_pheno_bagel.csv", + "example_phenotypic.tsv", "phenotypic", # TODO: Check specifically for a session filter form instead of #advanced-filter-form, # since latter is a larger container that also contains pipeline-specific dropdowns for imaging data @@ -80,8 +80,8 @@ def test_002_upload_invalid_imaging_bagel(test_server, bagels_path): to reuse the same (function scoped) server instance. 
""" invalid_input_output = { - "example_missing-col_bagel.csv": "missing the following required imaging metadata columns: {'pipeline_starttime'}", - "example_pheno_bagel.csv": "missing the following required imaging metadata columns", + "example_imaging_missing-col.tsv": "missing the following required imaging metadata columns: {'pipeline_step'}", + "example_phenotypic.tsv": "missing the following required imaging metadata columns", } upload = test_server.driver.find_element( @@ -115,9 +115,7 @@ def test_003_upload_invalid_phenotypic_bagel(test_server, bagels_path): ) upload.send_keys( - os.path.realpath( - os.path.join(bagels_path, "example_imaging_bagel.csv") - ) + os.path.realpath(os.path.join(bagels_path, "example_imaging.tsv")) ) test_server.wait_for_contains_text("#output-data-upload", err, timeout=4) assert err in test_server.find_element("#output-data-upload").text @@ -139,7 +137,7 @@ def test_004_phenotypic_col_selection_generates_visualization( """//*[contains(@id,'"index":"phenotypic","type":"upload-data"')]/div/input""", ) upload.send_keys( - os.path.realpath(os.path.join(bagels_path, "example_pheno_bagel.csv")) + os.path.realpath(os.path.join(bagels_path, "example_phenotypic.tsv")) ) # Wait for dropdown container diff --git a/tests/test_utility.py b/tests/test_utility.py index 8ee76ed..31a1edc 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -4,11 +4,12 @@ import digest.plotting as plot import digest.utility as util +from digest.utility import PRIMARY_SESSION @pytest.mark.parametrize( "filename", - ["imagingbagel.tsv", "imagingbagel.txt", "imagingbagel.csv.tsv"], + ["imagingbagel.csv", "imagingbagel.txt", "imagingbagel.tsv.csv"], ) def test_invalid_filetype_returns_informative_error(filename): toy_upload_contents = "stand-in for a base64 encoded file contents string" @@ -27,7 +28,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-2", "sub-2"], - "session": [1, 2, 1, 2], + "session_id": [1, 2, 1, 2], "assessment_name": ["moca", "moca", "moca", "moca"], "assessment_score": [21.0, 24.0, np.nan, 24.0], } @@ -36,7 +37,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": pd.Series([], dtype="object"), - "session": pd.Series([], dtype="int64"), + "session_id": pd.Series([], dtype="int64"), "assessment_name": pd.Series([], dtype="object"), "assessment_score": pd.Series([], dtype="float64"), } @@ -46,7 +47,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-2", "sub-2"], - "session": [1, 1, 1, 2], + "session_id": [1, 1, 1, 2], "assessment_name": ["moca", "moca", "moca", "moca"], "assessment_score": [21.0, 24.0, np.nan, 24.0], } @@ -54,7 +55,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": ["sub-1", "sub-1"], - "session": [1, 1], + "session_id": [1, 1], "assessment_name": ["moca", "moca"], "assessment_score": [21.0, 24.0], } @@ -64,7 +65,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-2", "sub-2"], - "session": [np.nan, np.nan, 1, 2], + "session_id": [np.nan, np.nan, 1, 2], "assessment_name": ["moca", "moca", "moca", "moca"], "assessment_score": [21.0, 24.0, np.nan, 24.0], } @@ -72,7 +73,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": ["sub-1", "sub-1"], - "session": [np.nan, np.nan], + 
"session_id": [np.nan, np.nan], "assessment_name": ["moca", "moca"], "assessment_score": [21.0, 24.0], } @@ -83,7 +84,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-2", "sub-2"], - "session": [1, np.nan, 1, 2], + "session_id": [1, np.nan, 1, 2], "assessment_name": ["moca", "moca", "moca", "moca"], "assessment_score": [21.0, 24.0, np.nan, 24.0], } @@ -91,7 +92,7 @@ def test_invalid_filetype_returns_informative_error(filename): pd.DataFrame( { "participant_id": pd.Series([], dtype="object"), - "session": pd.Series([], dtype="float64"), + "session_id": pd.Series([], dtype="float64"), "assessment_name": pd.Series([], dtype="object"), "assessment_score": pd.Series([], dtype="float64"), } @@ -102,7 +103,11 @@ def test_invalid_filetype_returns_informative_error(filename): def test_get_duplicate_entries(original_df, duplicates_df): """Test that get_duplicate_entries() returns a dataframe containing the duplicate entries in a given dataframe.""" - unique_value_id_columns = ["participant_id", "session", "assessment_name"] + unique_value_id_columns = [ + "participant_id", + "session_id", + "assessment_name", + ] assert util.get_duplicate_entries( data=original_df, subset=unique_value_id_columns ).equals(duplicates_df) @@ -112,37 +117,39 @@ def test_get_duplicate_entries(original_df, duplicates_df): "bagel_path,schema,expected_columns,expected_n_records", [ ( - "example_mismatch-subs_bagel.csv", + "example_imaging_diff-pipeline-subjects.tsv", "imaging", [ "participant_id", - "session", - "fmriprep-20.2.7", - "freesurfer-6.0.1", - "freesurfer-7.3.2", + "session_id", + "fmriprep-20.2.7-default", + "freesurfer-6.0.1-default", + "freesurfer-7.3.2-default", ], 6, ), ( - "example_imaging_bagel.csv", + "example_imaging.tsv", "imaging", [ "participant_id", - "bids_id", - "session", - "fmriprep-20.2.7", - "freesurfer-6.0.1", - "freesurfer-7.3.2", + "bids_participant_id", + "session_id", + "bids_session_id", + "fmriprep-20.2.7-step1", + "fmriprep-20.2.7-step2", + "fmriprep-23.1.3-default", + "freesurfer-7.3.2-default", ], - 7, + 4, ), ( - "example_pheno_bagel.csv", + "example_phenotypic.tsv", "phenotypic", [ "participant_id", - "bids_id", - "session", + "bids_participant_id", + "session_id", "group", "moca_total", "updrs_3_total", @@ -158,8 +165,8 @@ def test_get_pipelines_overview( Smoke test that get_pipelines_overview() returns a dataframe with the expected columns and number of participant-session rows after reshaping data into a wide format. 
""" - bagel = pd.read_csv(bagels_path / bagel_path) - bagel["session"] = bagel["session"].astype(str) + bagel = pd.read_csv(bagels_path / bagel_path, sep="\t") + bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) overview_df = util.get_pipelines_overview(bagel=bagel, schema=schema) assert overview_df.columns.tolist() == expected_columns @@ -174,7 +181,7 @@ def test_get_pipelines_overview( pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-2", "sub-2"], - "session": [1, np.nan, 1, 2], + "session_id": [1, np.nan, 1, 2], "assessment_name": ["moca", "moca", "moca", "moca"], "assessment_score": [21.0, 24.0, np.nan, 24.0], } @@ -182,7 +189,7 @@ def test_get_pipelines_overview( pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-2", "sub-2"], - "session": ["1.0", "nan", "1.0", "2.0"], + "session_id": ["1.0", "nan", "1.0", "2.0"], "moca": [21.0, 24.0, np.nan, 24.0], } ), @@ -198,7 +205,7 @@ def test_get_pipelines_overview( "sub-1", "sub-1", ], - "session": [ + "session_id": [ "intake", "baseline", "follow-up", @@ -218,7 +225,7 @@ def test_get_pipelines_overview( pd.DataFrame( { "participant_id": ["sub-1", "sub-1", "sub-1"], - "session": ["intake", "baseline", "follow-up"], + "session_id": ["intake", "baseline", "follow-up"], "moca": [np.nan, 24.0, np.nan], "updrs": [12.0, 12.0, np.nan], } @@ -230,7 +237,7 @@ def test_get_pipelines_overview_handles_nan_correctly( bagel, expected_overview_df ): """Test that get_pipelines_overview() handles NaN values in the original long-format data as expected.""" - bagel["session"] = bagel["session"].astype(str) + bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) overview_df = util.get_pipelines_overview(bagel=bagel, schema="phenotypic") assert overview_df.equals(expected_overview_df), overview_df @@ -244,7 +251,7 @@ def test_reset_column_dtypes(): pheno_overview_df = pd.DataFrame( { "participant_id": ["sub-1", "sub-2", "sub-3"], - "session": [1, 1, 1], + "session_id": [1, 1, 1], "group": ["PD", "PD", "PD"], "moca_total": ["21", "24", np.nan], "moca_total_status": ["true", "true", "false"], @@ -254,7 +261,7 @@ def test_reset_column_dtypes(): pheno_overview_df_retyped = util.reset_column_dtypes(pheno_overview_df) assert pheno_overview_df_retyped["participant_id"].dtype == "object" - assert pheno_overview_df_retyped["session"].dtype == "object" + assert pheno_overview_df_retyped["session_id"].dtype == "object" assert pheno_overview_df_retyped["group"].dtype == "object" assert pheno_overview_df_retyped["moca_total"].dtype == "float64" assert pheno_overview_df_retyped["moca_total_status"].dtype == "bool" @@ -294,7 +301,7 @@ def test_generate_column_summary_str(column, nonmissing, missing, stats): pheno_overview_df = pd.DataFrame( { "participant_id": ["sub-1", "sub-2", "sub-3"], - "session": [ + "session_id": [ "1", "1", "1", From e6225859d5aa30304b295ae600b118ac5d6b927a Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 13:04:51 -0500 Subject: [PATCH 25/27] replace csv with tsv in docstrings --- digest/app.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/digest/app.py b/digest/app.py index 1b66022..c3a5b77 100644 --- a/digest/app.py +++ b/digest/app.py @@ -117,7 +117,7 @@ def set_was_upload_used_flag(upload_contents, available_digest_nclicks): ) def process_bagel(upload_contents, available_digest_nclicks, filenames): """ - From the contents of a correctly-formatted uploaded .csv file, parse and store (1) the pipeline overview data as a dataframe, + From the contents of a correctly-formatted 
uploaded TSV file, parse and store (1) the pipeline overview data as a dataframe, and (2) pipeline-specific metadata as individual dataframes within a dict. Returns any errors encountered during input file processing as a user-friendly message. """ @@ -204,7 +204,7 @@ def reset_upload_buttons(memory_filename): Upload components need to be manually replaced to clear contents, otherwise previously uploaded imaging/pheno bagels cannot be re-uploaded - (e.g. if a user uploads pheno_bagel.csv, then imaging_bagel.csv, then pheno_bagel.csv again) + (e.g. if a user uploads pheno_bagel.tsv, then imaging_bagel.tsv, then pheno_bagel.tsv again) see https://github.com/plotly/dash-core-components/issues/816 """ return upload_buttons() @@ -294,7 +294,7 @@ def update_session_filter(parsed_data, session_list): ) def create_pipeline_status_dropdowns(pipelines_dict, parsed_data): """ - Generates a dropdown filter with status options for each unique pipeline in the input csv, + Generates a dropdown filter with status options for each unique pipeline in the input TSV, and disables the native datatable filter UI for the corresponding columns in the datatable. """ pipeline_dropdowns = [] @@ -419,7 +419,7 @@ def update_matching_rows(columns, virtual_data): ) def reset_selections(filename): """ - If file contents change (i.e., selected new CSV for upload), reset displayed file name and selection values related to data filtering or plotting. + If file contents change (i.e., selected new TSV for upload), reset displayed file name and selection values related to data filtering or plotting. Reset will occur regardless of whether there is an issue processing the selected file. """ return f"Input file: {filename}", "", "", None, False From 08a5812256878b2958a2838aa6afcd23190d02bd Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Fri, 13 Dec 2024 13:15:13 -0500 Subject: [PATCH 26/27] rename PRIMARY_SESSION var --- digest/app.py | 8 ++++---- digest/plotting.py | 10 +++++----- digest/utility.py | 24 ++++++++++++++---------- tests/test_utility.py | 6 +++--- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/digest/app.py b/digest/app.py index c3a5b77..a48a194 100644 --- a/digest/app.py +++ b/digest/app.py @@ -11,7 +11,7 @@ from . import plotting as plot from . import utility as util from .layout import DEFAULT_DATASET_NAME, construct_layout, upload_buttons -from .utility import PRIMARY_SESSION +from .utility import PRIMARY_SESSION_COL EMPTY_FIGURE_PROPS = {"data": [], "layout": {}, "frames": []} @@ -158,8 +158,8 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames): # Another side effect of allowing NaN sessions is that if this column has integer values, they will be read in as floats # (before being converted to str) if there are NaNs in the column. 
# This should not be a problem after we disallow NaNs value in "participant_id" and "session_id" columns, https://github.com/neurobagel/digest/issues/20 - bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) - session_list = bagel[PRIMARY_SESSION].unique().tolist() + bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str) + session_list = bagel[PRIMARY_SESSION_COL].unique().tolist() overview_df = util.get_pipelines_overview( bagel=bagel, schema=schema @@ -555,7 +555,7 @@ def plot_phenotypic_column( data_to_plot = virtual_data if session_switch_value: - color = PRIMARY_SESSION + color = PRIMARY_SESSION_COL else: color = None diff --git a/digest/plotting.py b/digest/plotting.py index 5b306f7..a122727 100644 --- a/digest/plotting.py +++ b/digest/plotting.py @@ -7,7 +7,7 @@ import plotly.graph_objects as go from . import utility as util -from .utility import PRIMARY_SESSION +from .utility import PRIMARY_SESSION_COL CMAP = px.colors.qualitative.Bold STATUS_COLORS = { @@ -61,28 +61,28 @@ def plot_pipeline_status_by_participants( ) -> go.Figure: status_counts = ( transform_active_data_to_long(data) - .groupby(["pipeline_name", "status", PRIMARY_SESSION]) + .groupby(["pipeline_name", "status", PRIMARY_SESSION_COL]) .size() .reset_index(name="participants") ) fig = px.bar( status_counts, - x=PRIMARY_SESSION, + x=PRIMARY_SESSION_COL, y="participants", color="status", text_auto=True, facet_col="pipeline_name", category_orders={ "status": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), - PRIMARY_SESSION: session_list, + PRIMARY_SESSION_COL: session_list, }, color_discrete_map=STATUS_COLORS, labels={ "pipeline_name": "Pipeline", "participants": "Participants (n)", "status": "Processing status", - PRIMARY_SESSION: "Session", + PRIMARY_SESSION_COL: "Session", }, title="All participant pipeline statuses by session", ) diff --git a/digest/utility.py b/digest/utility.py index ec32964..d548999 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -26,7 +26,7 @@ "UNAVAILABLE": "Relevant MRI modality for pipeline not available.", } # Column to use as the primary session identifier in the data -PRIMARY_SESSION = "session_id" +PRIMARY_SESSION_COL = "session_id" # TODO: # Could also use URLs for "imaging" or "phenotypic" locations if fetching from a remote repo doesn't slow things down too much. 
@@ -62,7 +62,9 @@ def reset_column_dtypes(data: pd.DataFrame) -> pd.DataFrame: stream.close() # Just in case, convert session labels back to strings (will avoid sessions being undesirably treated as continuous data in e.g., plots) - data_retyped[PRIMARY_SESSION] = data_retyped[PRIMARY_SESSION].astype(str) + data_retyped[PRIMARY_SESSION_COL] = data_retyped[ + PRIMARY_SESSION_COL + ].astype(str) return data_retyped @@ -94,7 +96,7 @@ def construct_summary_str(data: pd.DataFrame) -> str: """Creates summary of key counts for dataset.""" return f"""Total number of participants: {count_unique_subjects(data)} Total number of unique records (participant-session pairs): {count_unique_records(data)} -Total number of unique sessions: {data[PRIMARY_SESSION].nunique()}""" +Total number of unique sessions: {data[PRIMARY_SESSION_COL].nunique()}""" def get_required_bagel_columns(schema_file: str) -> list: @@ -204,9 +206,9 @@ def count_unique_subjects(data: pd.DataFrame) -> int: def count_unique_records(data: pd.DataFrame) -> int: """Returns number of unique participant-session pairs.""" - if set(["participant_id", PRIMARY_SESSION]).issubset(data.columns): + if set(["participant_id", PRIMARY_SESSION_COL]).issubset(data.columns): return ( - data[["participant_id", PRIMARY_SESSION]] + data[["participant_id", PRIMARY_SESSION_COL]] .drop_duplicates() .shape[0] ) @@ -248,7 +250,8 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame: # NOTE: .reindex only works correctly when there are no NaN values in the index level # (Here, the entire "session_id" column should have already been cast to a string) pipeline_complete_df.reindex( - index=bagel[PRIMARY_SESSION].unique(), level=PRIMARY_SESSION + index=bagel[PRIMARY_SESSION_COL].unique(), + level=PRIMARY_SESSION_COL, ) .reindex(col_order, axis=1) # reorder assessments/pipelines if needed .reset_index() @@ -346,24 +349,25 @@ def filter_records( matching_subs = [] for sub_id, sub in data.groupby("participant_id"): if all( - session in sub[PRIMARY_SESSION].unique() + session in sub[PRIMARY_SESSION_COL].unique() for session in session_values ): if all( not sub.query( " and ".join( - [f"{PRIMARY_SESSION} == '{session}'"] + [f"{PRIMARY_SESSION_COL} == '{session}'"] + pipeline_queries ) ).empty for session in session_values ): matching_subs.append(sub_id) - query = f"participant_id in {matching_subs} and {PRIMARY_SESSION} in {session_values}" + query = f"participant_id in {matching_subs} and {PRIMARY_SESSION_COL} in {session_values}" else: if operator_value == "OR": query = " and ".join( - [f"{PRIMARY_SESSION} in {session_values}"] + pipeline_queries + [f"{PRIMARY_SESSION_COL} in {session_values}"] + + pipeline_queries ) data = data.query(query) diff --git a/tests/test_utility.py b/tests/test_utility.py index 31a1edc..5e5e673 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -4,7 +4,7 @@ import digest.plotting as plot import digest.utility as util -from digest.utility import PRIMARY_SESSION +from digest.utility import PRIMARY_SESSION_COL @pytest.mark.parametrize( @@ -166,7 +166,7 @@ def test_get_pipelines_overview( after reshaping data into a wide format. 
""" bagel = pd.read_csv(bagels_path / bagel_path, sep="\t") - bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) + bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str) overview_df = util.get_pipelines_overview(bagel=bagel, schema=schema) assert overview_df.columns.tolist() == expected_columns @@ -237,7 +237,7 @@ def test_get_pipelines_overview_handles_nan_correctly( bagel, expected_overview_df ): """Test that get_pipelines_overview() handles NaN values in the original long-format data as expected.""" - bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) + bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str) overview_df = util.get_pipelines_overview(bagel=bagel, schema="phenotypic") assert overview_df.equals(expected_overview_df), overview_df From 8615e2032a13d277b8cae54d18a2f27389d3e4b4 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Thu, 9 Jan 2025 16:35:51 -0500 Subject: [PATCH 27/27] Update schema README Co-authored-by: Michelle Wang --- schemas/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schemas/README.md b/schemas/README.md index 9775291..413b59d 100644 --- a/schemas/README.md +++ b/schemas/README.md @@ -12,7 +12,7 @@ There are different schemas for digest files containing different modalities of ## How to read the schema ### Column categories -Within a schema, columns are grouped into two semantic categories. These categories are purely for organizational purposes and do not appear in an digest file. +Within a schema, columns are grouped into two semantic categories. These categories are purely for organizational purposes and do not appear in a digest file. **Global columns:** Columns describing basic metadata that should have the same meaning regardless of the specific event described by a given record (e.g., a certain processing pipeline or phenotypic assessment), and do not depend on event outputs.