Skip to content

Commit

Permalink
Merge pull request dandi#54 from candleindark/enh
Browse files Browse the repository at this point in the history
Redo the summary of Pydantic validation error differences for assets
  • Loading branch information
candleindark authored Jan 15, 2025
2 parents 760aa36 + 9514fe8 commit b55cea3
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 137 deletions.
212 changes: 75 additions & 137 deletions src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import logging
from itertools import chain
from pathlib import Path
from typing import TYPE_CHECKING, Annotated, cast

if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Annotated, Any, cast

from jsondiff import diff
from pydantic import Field
Expand Down Expand Up @@ -35,14 +32,10 @@
gen_diff_cell,
gen_pydantic_validation_errs_cell,
gen_row,
pydantic_validation_err_diff_detailed_table,
validation_err_count_table,
validation_err_diff_detailed_tables,
validation_err_diff_table,
pydantic_validation_err_diff_summary,
)
from dandisets_linkml_status_tools.tools.validation_err_counter import (
ValidationErrCounter,
validation_err_diff,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -322,91 +315,30 @@ def _output_dandiset_validation_diff_reports(
logger.info("Creating dandiset validation diff report directory %s", output_dir)
output_dir.mkdir(parents=True)

err1_rep_lsts: list[list[tuple[str, str, tuple[str | int], Path]]] = []
err2_rep_lsts: list[list[tuple[str, str, tuple[str | int], Path]]] = []
err1_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
err2_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
for r in reports:
p = Path(r.dandiset_identifier, r.dandiset_version)

# Tuple representation of the Pydantic validation errors
err1_rep_lsts.append(
[
(e["type"], e["msg"], tuple(e["loc"]), p)
for e in r.pydantic_validation_errs1
]
[pydantic_err_rep(e, p) for e in r.pydantic_validation_errs1]
)
err2_rep_lsts.append(
[
(e["type"], e["msg"], tuple(e["loc"]), p)
for e in r.pydantic_validation_errs2
]
)

err1_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
chain.from_iterable(err1_rep_lsts)
)
err2_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
chain.from_iterable(err2_rep_lsts)
)

def err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]:
"""
Categorize a Pydantic validation error represented as a tuple using the same
tuple without the path component to the dandiset at a particular version and
with a generalized "loc" with all array indices replaced by "[*]"
:param err: The tuple representing the Pydantic validation error
:return: The tuple representing the category that the error belongs to
"""
err = cast(tuple[str, str, tuple[str | int, ...], Path], err)
type_, msg = err[0], err[1]

# Generalize the "loc" by replacing all array indices with "[*]"
loc = cast(
tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2])
[pydantic_err_rep(e, p) for e in r.pydantic_validation_errs2]
)

return type_, msg, loc

pydantic_validation_errs1_ctr = ValidationErrCounter(err_categorizer)
pydantic_validation_errs2_ctr = ValidationErrCounter(err_categorizer)
pydantic_validation_errs1_ctr = ValidationErrCounter(pydantic_err_categorizer)
pydantic_validation_errs2_ctr = ValidationErrCounter(pydantic_err_categorizer)

pydantic_validation_errs1_ctr.count(err1_reps)
pydantic_validation_errs2_ctr.count(err2_reps)

pydantic_validation_err_diff = validation_err_diff(
pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr
)
pydantic_validation_errs1_ctr.count(chain.from_iterable(err1_rep_lsts))
pydantic_validation_errs2_ctr.count(chain.from_iterable(err2_rep_lsts))

with (output_dir / summary_file_name).open("w") as summary_f:
# === Output counts of different categories of Pydantic validation errors for
# validations done with separate schemas ===
summary_f.write("### Pydantic errs 1 counts\n\n")
summary_f.write(
validation_err_count_table(pydantic_validation_errs1_ctr.counts_by_cat)
)

summary_f.write("\n")
summary_f.write("### Pydantic errs 2 counts\n\n")
summary_f.write(
validation_err_count_table(pydantic_validation_errs2_ctr.counts_by_cat)
)

# Output a table of the differences in the different categories of
# Pydantic validation errors between the two sets of validation results where
# each set is represented, and counted, by a `ValidationErrCounter` object
summary_f.write("\n")
summary_f.write("### Pydantic errs diff\n\n")
summary_f.write(validation_err_diff_table(pydantic_validation_err_diff))

# Write a sequence of tables detailing the differences in Pydantic validation
# errors between the two sets of validation results
summary_f.write("\n")
summary_f.write("## Pydantic errs diff detailed tables\n\n")
# noinspection PyTypeChecker
# Write the summary of the Pydantic validation error differences
summary_f.write(
validation_err_diff_detailed_tables(
pydantic_validation_err_diff,
pydantic_validation_err_diff_detailed_table,
pydantic_validation_err_diff_summary(
pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr
)
)

Expand Down Expand Up @@ -492,26 +424,38 @@ def _output_asset_validation_diff_reports(
"""
summary_file_name = "summary.md"

summary_headers = [
"dandiset",
"version",
"asset id",
"asset path",
"asset index",
"pydantic errs 1",
"pydantic errs 2",
"pydantic errs diff",
]

output_dir.mkdir(parents=True)
logger.info("Created asset validation diff report directory %s", output_dir)

err1_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
err2_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
for r in reports:
p = Path(r.dandiset_identifier, r.dandiset_version, str(r.asset_idx))

# Tuple representation of the Pydantic validation errors
err1_rep_lsts.append(
[pydantic_err_rep(e, p) for e in r.pydantic_validation_errs1]
)
err2_rep_lsts.append(
[pydantic_err_rep(e, p) for e in r.pydantic_validation_errs2]
)

pydantic_validation_errs1_ctr = ValidationErrCounter(pydantic_err_categorizer)
pydantic_validation_errs2_ctr = ValidationErrCounter(pydantic_err_categorizer)

pydantic_validation_errs1_ctr.count(chain.from_iterable(err1_rep_lsts))
pydantic_validation_errs2_ctr.count(chain.from_iterable(err2_rep_lsts))

with (output_dir / summary_file_name).open("w") as summary_f:
# Write the header and alignment rows of the summary table
summary_f.write(gen_header_and_alignment_rows(summary_headers))
# Write the summary of the Pydantic validation error differences
summary_f.write(
pydantic_validation_err_diff_summary(
pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr
)
)

# Output individual asset validation diff reports by writing the supporting
# files and the summary table row
# Output individual asset validation diff reports by writing the constituting
# files
for r in reports:
report_dir = (
output_dir
Expand All @@ -535,53 +479,47 @@ def _output_asset_validation_diff_reports(

logger.info(
"Dandiset %s:%s - asset %sat index %d: "
"Wrote asset validation diff report supporting files to %s",
"Wrote asset validation diff report constituting files to %s",
r.dandiset_identifier,
r.dandiset_version,
f"{r.asset_id} " if r.asset_id else "",
r.asset_idx,
report_dir,
)

# === Write the summary table row for the validation diff report ===
# Relative directory for storing all validation diff reports of the dandiset
dandiset_dir = Path(r.dandiset_identifier)
# Relative directory for storing all validation diff reports of the dandiset
# at a particular version
version_dir = dandiset_dir / r.dandiset_version
# Relative directory for storing all validation diff reports of the asset
asset_dir = version_dir / str(r.asset_idx)
logger.info("Output of asset validation diff reports is complete")

row_cells = (
f" {c} " # Add spaces around the cell content for better readability
for c in [
# For the dandiset column
f"[{r.dandiset_identifier}]({dandiset_dir})",
# For the version column
f"[{r.dandiset_version}]({version_dir})",
# For the asset id column
f"{r.asset_id}",
# For the asset path column
f"{r.asset_path}",
# For the asset index column
f"[{r.asset_idx}]({asset_dir})",
# For the pydantic errs 1 column
gen_pydantic_validation_errs_cell(
r.pydantic_validation_errs1,
asset_dir / f"{pydantic_errs1_base_fname}.json",
),
# For the pydantic errs 2 column
gen_pydantic_validation_errs_cell(
r.pydantic_validation_errs2,
asset_dir / f"{pydantic_errs2_base_fname}.json",
),
# For the pydantic errs diff column
gen_diff_cell(
r.pydantic_validation_errs_diff,
asset_dir / f"{pydantic_errs_diff_base_fname}.json",
),
]
)
summary_f.write(gen_row(row_cells))

logger.info("Output of asset validation diff reports is complete")
def pydantic_err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]:
"""
Categorize a Pydantic validation error represented as a tuple using the same
tuple without the path component to the dandiset at a particular version and
with a generalized "loc" with all array indices replaced by "[*]"
:param err: The tuple representing the Pydantic validation error
:return: The tuple representing the category that the error belongs to
"""
err = cast(tuple[str, str, tuple[str | int, ...], Path], err)
type_, msg = err[0], err[1]

# Generalize the "loc" by replacing all array indices with "[*]"
loc = cast(
tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2])
)

return type_, msg, loc


def pydantic_err_rep(
err: dict[str, Any], path: Path
) -> tuple[str, str, tuple[str | int, ...], Path]:
"""
Get a representation of a Pydantic validation error as a tuple
:param err: The Pydantic validation error as a `dict`
:param path: The path the data instance that the error pertained to
:return: The representation of the Pydantic validation error as tuple consisting of
the values for the `'type'`, `'msg'`, `'loc'` keys of the error and `path`.
Note: The value of the `'loc'` key is converted to a tuple from a list
"""
return err["type"], err["msg"], tuple(err["loc"]), path
49 changes: 49 additions & 0 deletions src/dandisets_linkml_status_tools/tools/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

from dandisets_linkml_status_tools.models import PydanticValidationErrsType
from dandisets_linkml_status_tools.tools.typing import Stringable
from dandisets_linkml_status_tools.tools.validation_err_counter import (
ValidationErrCounter,
validation_err_diff,
)


def gen_row(cell_values: Iterable[Stringable]) -> str:
Expand Down Expand Up @@ -247,3 +251,48 @@ def pydantic_validation_err_diff_detailed_table(
)

return f"{heading}{header_and_alignment_rows}{rows}"


def pydantic_validation_err_diff_summary(
    c1: ValidationErrCounter, c2: ValidationErrCounter
) -> str:
    """
    Render a Markdown summary comparing two sets of Pydantic validation errors

    The summary consists of four sections, in this order: the per-category counts
    of the first set, the per-category counts of the second set, a table of the
    differences between the two sets, and a sequence of detailed difference tables
    :param c1: A `ValidationErrCounter` that has counted the first set of Pydantic
        validation errors
    :param c2: A `ValidationErrCounter` that has counted the second set of Pydantic
        validation errors
    :return: The string presenting the summary in Markdown format
    """
    # Compute the difference between the two counters once; both the diff table
    # and the detailed tables are rendered from it
    err_diff = validation_err_diff(c1, c2)

    errs1_counts = validation_err_count_table(c1.counts_by_cat)
    errs2_counts = validation_err_count_table(c2.counts_by_cat)
    diff_overview = validation_err_diff_table(err_diff)

    # noinspection PyTypeChecker
    diff_details = validation_err_diff_detailed_tables(
        err_diff,
        pydantic_validation_err_diff_detailed_table,
    )

    # Each section is a heading immediately followed by its rendered body
    sections = (
        ("### Pydantic errs 1 counts\n\n", errs1_counts),
        ("\n### Pydantic errs 2 counts\n\n", errs2_counts),
        ("\n### Pydantic errs diff\n\n", diff_overview),
        ("\n## Pydantic errs diff detailed tables\n\n", diff_details),
    )
    return "".join(f"{heading}{body}" for heading, body in sections)

0 comments on commit b55cea3

Please sign in to comment.