@validate.command(
    name="tabular",
    epilog="""
This command validates HED in a tabular file (TSV) against a specified HED schema
version. It can optionally include a sidecar file and check for warnings.

\b
Examples:
    # Basic validation of a TSV file
    hedpy validate tabular events.tsv -sv 8.3.0

    # Validate with a sidecar
    hedpy validate tabular events.tsv -s sidecar.json -sv 8.3.0

    # Validate with multiple schemas (base + library)
    hedpy validate tabular events.tsv -s sidecar.json -sv 8.3.0 -sv score_1.1.0

    # Check for warnings as well as errors
    hedpy validate tabular events.tsv -sv 8.4.0 --check-for-warnings

    # Limit reported errors
    hedpy validate tabular events.tsv -sv 8.4.0 -el 5

    # Save validation results to a file
    hedpy validate tabular events.tsv -sv 8.4.0 -o validation_results.txt
""",
)
@click.argument("tabular_file", type=click.Path(exists=True))
# Validation options
@optgroup.group("Validation options")
@optgroup.option(
    "-sv",
    "--schema-version",
    required=True,
    multiple=True,
    metavar="VERSION",
    help="HED schema version(s) to validate against (e.g., '8.4.0'). "
    "Can be specified multiple times for multiple schemas (e.g., -sv lang_1.1.0 -sv score_2.1.0)",
)
@optgroup.option(
    "-s",
    "--sidecar",
    type=click.Path(exists=True),
    metavar=METAVAR_FILE,
    help="BIDS JSON sidecar file to use during validation",
)
@optgroup.option(
    "-w",
    "--check-for-warnings",
    is_flag=True,
    help="Check for warnings as well as errors",
)
@optgroup.option(
    "-el",
    "--error-limit",
    type=int,
    metavar=METAVAR_N,
    help="Limit number of errors reported per code (default: No limit)",
)
@optgroup.option(
    "-ef",
    "--errors-by-file",
    is_flag=True,
    help="If using --error-limit, apply the limit per-file rather than globally",
)
# Output options
@optgroup.group("Output options")
@optgroup.option(
    "-f",
    "--format",
    type=click.Choice(["text", "json"]),
    default="text",
    show_default="text",
    help="Output format for validation results (text: human-readable; json: structured format for programmatic use)",
)
@optgroup.option(
    "-o",
    "--output-file",
    type=click.Path(),
    default="",
    metavar=METAVAR_FILE,
    help="Path for output file to hold validation results; if not specified, output to stdout",
)
# Logging options
@optgroup.group("Logging options")
@optgroup.option(
    "-l",
    "--log-level",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
    default="WARNING",
    show_default="WARNING",
    help="Log level for diagnostic messages",
)
@optgroup.option(
    "-v",
    "--verbose",
    is_flag=True,
    help="Output informational messages (equivalent to --log-level INFO)",
)
@optgroup.option(
    "-lf",
    "--log-file",
    type=click.Path(),
    metavar=METAVAR_FILE,
    help="File path for saving log output; logs still go to stderr unless --log-quiet is also used",
)
@optgroup.option(
    "-lq",
    "--log-quiet",
    is_flag=True,
    help="Suppress log output to stderr; only applicable when --log-file is used (logs go only to file)",
)
@optgroup.option(
    "--no-log",
    is_flag=True,
    help="Disable all logging output",
)
@click.pass_context
def validate_tabular_cmd(
    ctx,
    tabular_file,
    schema_version,
    sidecar,
    check_for_warnings,
    error_limit,
    errors_by_file,
    format,
    output_file,
    log_level,
    log_file,
    log_quiet,
    no_log,
    verbose,
):
    """Validate HED in a tabular file.

    TABULAR_FILE: The path to the tabular file (e.g., TSV) to validate.
    """
    # NOTE: the docstring above is rendered by click as the command help, so
    # implementation notes are kept in comments instead.
    #
    # This command is a thin adapter: it rebuilds an argv-style list from the
    # click parameters and delegates to the argparse-based script entry point.
    # The import is local so the script module is only loaded when this
    # subcommand actually runs.
    from hed.scripts.validate_hed_tabular import main as validate_tabular_main

    args = [tabular_file]
    if schema_version:
        # Forward every version after a single "-sv". The script's parser
        # declares -sv with nargs="+"; emitting one "-sv X" pair per version
        # made each occurrence overwrite the previous one, so only the last
        # schema was ever loaded.
        args.extend(["-sv", *schema_version])
    if sidecar:
        args.extend(["-s", sidecar])
    if check_for_warnings:
        args.append("-w")
    if error_limit is not None:
        # Compare against None so that an explicit limit of 0 is still forwarded.
        args.extend(["-el", str(error_limit)])
    if errors_by_file:
        args.append("-ef")
    if format:
        args.extend(["-f", format])
    if output_file:
        args.extend(["-o", output_file])
    if log_level:
        args.extend(["-l", log_level])
    if log_file:
        args.extend(["-lf", log_file])
    if log_quiet:
        args.append("-lq")
    if no_log:
        args.append("--no-log")
    if verbose:
        args.append("-v")

    # The script returns its exit status; treat a missing return value as success.
    result = validate_tabular_main(args)
    ctx.exit(result if result is not None else 0)
def get_parser():
    """Create the argument parser for validate_hed_tabular.

    The parser takes one positional argument (the tabular file) plus options
    grouped as validation, error limiting, output, and logging options.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="Validate HED in a tabular file against a HED schema", formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # Required arguments
    parser.add_argument("tabular_file", help="Tabular file (TSV) to validate")
    parser.add_argument(
        "-sv",
        "--schema-version",
        required=True,
        nargs="+",
        # "extend" accumulates across repeated occurrences, so both
        # "-sv 8.3.0 score_1.1.0" and "-sv 8.3.0 -sv score_1.1.0" yield the
        # full list. With the default "store" action a repeated -sv silently
        # discarded every version except the last one.
        action="extend",
        dest="schema_version",
        help="HED schema version(s) to validate against (e.g., '8.4.0' or '8.3.0 score_1.1.0' for multiple schemas)",
    )

    # Optional arguments
    parser.add_argument(
        "-s",
        "--sidecar",
        dest="sidecar_file",
        help="Optional BIDS JSON sidecar file to use during validation",
    )
    parser.add_argument(
        "-w",
        "--check-for-warnings",
        action="store_true",
        dest="check_for_warnings",
        help="Check for warnings in addition to errors",
    )

    # Error limiting
    error_group = parser.add_argument_group("Error limiting options")
    error_group.add_argument(
        "-el",
        "--error-limit",
        type=int,
        dest="error_limit",
        default=None,
        help="Limit number of errors reported per code (default: No limit)",
    )
    error_group.add_argument(
        "-ef",
        "--errors-by-file",
        action="store_true",
        dest="errors_by_file",
        help="If using --error-limit, apply the limit per-file rather than globally",
    )

    # Output options
    output_group = parser.add_argument_group("Output options")
    output_group.add_argument(
        "-f",
        "--format",
        choices=["text", "json"],
        default="text",
        help="Output format for validation results (default: %(default)s)",
    )
    output_group.add_argument(
        "-o",
        "--output-file",
        default="",
        dest="output_file",
        help="Output file for validation results; if not specified, output to stdout",
    )

    # Logging options
    logging_group = parser.add_argument_group("Logging options")
    logging_group.add_argument(
        "-l",
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default="WARNING",
        dest="log_level",
        help="Logging level (default: %(default)s)",
    )
    logging_group.add_argument("-lf", "--log-file", default="", dest="log_file", help="File path for saving log output")
    logging_group.add_argument(
        "-lq", "--log-quiet", action="store_true", dest="log_quiet", help="Suppress log output to stderr when using --log-file"
    )
    logging_group.add_argument("--no-log", action="store_true", dest="no_log", help="Disable all logging output")
    logging_group.add_argument("-v", "--verbose", action="store_true", help="Output informational messages")

    return parser
def _emit_results(text, output_file, file_terminator=""):
    """Write validation output to a file, or print it when no file is given.

    Parameters:
        text (str): The text to emit.
        output_file (str): Destination path; an empty value selects stdout.
        file_terminator (str): Extra text appended only when writing to a file.

    Returns:
        bool: True if the text was written to ``output_file``, False if it was printed.
    """
    if not output_file:
        print(text)
        return False
    # Explicit UTF-8 keeps the report independent of the platform's locale encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text + file_terminator)
    return True


def main(arg_list=None):
    """Main function for validating HED in a tabular file.

    Loads the requested schema version(s), validates the optional sidecar and
    then the tabular file, and reports the combined issues either to stdout or
    to the requested output file.

    Parameters:
        arg_list (list or None): Command line arguments. When None, argparse
            reads them from ``sys.argv``.

    Returns:
        int: 0 if no issues were found; 1 if issues were found or an exception
            was raised while loading or validating.

    Notes:
        Argument parsing happens before the ``try`` block, so argparse usage
        errors are not converted into a return value of 1.
    """
    import logging

    args = get_parser().parse_args(arg_list)

    # Set up logging
    setup_logging(args.log_level, args.log_file, args.log_quiet, args.verbose, args.no_log)

    # All messages go through this named logger. The module-level
    # logging.info()/warning()/error() helpers log via the root logger and call
    # basicConfig() implicitly when the root logger has no handlers.
    logger = logging.getLogger("validate_hed_tabular")
    logger.info(
        "Starting HED validation of tabular file with effective log level: %s (requested: %s, verbose=%s)",
        logging.getLevelName(logger.getEffectiveLevel()),
        args.log_level,
        "on" if args.verbose else "off",
    )

    try:
        # Load schema: a single version is passed as a string, several as a list.
        schema_versions = args.schema_version[0] if len(args.schema_version) == 1 else args.schema_version
        logger.info("Loading HED schema version(s) %s", schema_versions)
        schema = load_schema_version(schema_versions)

        # The same handler is used for the sidecar and the tabular file.
        error_handler = ErrorHandler(check_for_warnings=args.check_for_warnings)
        issues = []

        # Parse and validate the sidecar if provided
        sidecar = None
        if args.sidecar_file:
            logger.info("Loading Sidecar file")
            sidecar = Sidecar(args.sidecar_file, name=os.path.basename(args.sidecar_file))
            sidecar_issues = sidecar.validate(schema, name=sidecar.name, error_handler=error_handler)
            issues += sidecar_issues
            if sidecar_issues:
                logger.warning("Found %d issues in sidecar validation", len(sidecar_issues))

        # Parse and validate the tabular input
        logger.info("Loading Tabular file")
        tabular_input = TabularInput(args.tabular_file, sidecar=sidecar, name=os.path.basename(args.tabular_file))

        logger.info("Validating Tabular file")
        issues += tabular_input.validate(schema, name=tabular_input.name, error_handler=error_handler)

        if issues:
            output = format_validation_results(
                issues,
                output_format=args.format,
                title_message="HED validation issues:",
                error_limit=args.error_limit,
                errors_by_file=args.errors_by_file,
            )
            if _emit_results(output, args.output_file):
                logger.info("Validation errors written to %s", args.output_file)
            return 1  # Exit with error code if validation failed

        # Success: the file copy of the message gets a trailing newline.
        if _emit_results("Tabular file has valid HED!", args.output_file, file_terminator="\n"):
            logger.info("Validation results written to %s", args.output_file)
        return 0

    except Exception as e:
        # Top-level boundary of the script: report the failure as exit status 1
        # instead of letting the exception escape to the caller.
        logger.error("Validation failed: %s", e)
        # If verbose, print stack trace
        if args.verbose:
            import traceback

            traceback.print_exc()
        return 1
class TestValidateHedTabular(unittest.TestCase):
    """Test validate_hed_tabular script functionality."""

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _remove_if_exists(path):
        """Delete ``path`` if it is still present."""
        if os.path.exists(path):
            os.remove(path)

    def _make_temp_path(self, suffix):
        """Create an empty temporary file and return its path.

        Removal is registered with addCleanup, so the file is deleted even when
        setUp itself fails partway through (tearDown would not run in that case).
        """
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=suffix) as handle:
            path = handle.name
        self.addCleanup(self._remove_if_exists, path)
        return path

    def _write_tsv(self, data):
        """Write ``data`` (column name -> list of values) to a temporary TSV file and return its path."""
        path = self._make_temp_path(".tsv")
        pd.DataFrame(data).to_csv(path, sep="\t", index=False)
        return path

    def _run_main(self, arg_list):
        """Run the script entry point with stdout captured; return (exit_code, stdout_text)."""
        with patch("sys.stdout", new=io.StringIO()) as mock_stdout:
            result = main(arg_list)
            output = mock_stdout.getvalue()
        return result, output

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------
    def setUp(self):
        """Set up test fixtures."""
        # A tabular file whose HED column holds valid tags
        self.valid_tabular_path = self._write_tsv(
            {
                "onset": [1.0, 2.0],
                "duration": [0.5, 0.5],
                "trial_type": ["show_face", "press_button"],
                "HED": ["Sensory-event", "Agent-action"],
            }
        )

        # A tabular file whose HED column holds an unknown tag
        self.invalid_tabular_path = self._write_tsv({"onset": [1.0], "duration": [0.5], "HED": ["InvalidTag"]})

        # Sidecar mapping trial_type values to HED tags
        self.valid_sidecar_path = self._make_temp_path(".json")
        with open(self.valid_sidecar_path, "w") as sidecar_handle:
            json.dump(
                {"trial_type": {"HED": {"show_face": "Sensory-event", "press_button": "Agent-action"}}},
                sidecar_handle,
            )

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------
    def test_valid_tabular(self):
        """Test validation of a tabular file with valid HED."""
        result, output = self._run_main([self.valid_tabular_path, "-sv", "8.3.0", "--no-log"])

        self.assertEqual(result, 0, "Valid tabular should return 0")
        self.assertIn("valid", output.lower())

    def test_invalid_tabular(self):
        """Test validation of a tabular file with invalid HED."""
        result, output = self._run_main([self.invalid_tabular_path, "-sv", "8.3.0", "--no-log"])

        self.assertEqual(result, 1, "Invalid tabular should return 1")
        self.assertIn("error", output.lower())

    def test_validation_with_sidecar(self):
        """Test validation with a sidecar."""
        # Data that needs the sidecar to be valid (empty HED column but valid trial_type)
        tabular_path = self._write_tsv({"onset": [1.0], "duration": [0.5], "trial_type": ["show_face"], "HED": [""]})

        result, output = self._run_main([tabular_path, "-s", self.valid_sidecar_path, "-sv", "8.3.0", "--no-log"])

        self.assertEqual(result, 0)
        self.assertIn("valid", output.lower())

    def test_error_limiting(self):
        """Test error limiting options."""
        # Data with the same error repeated on every row
        tabular_path = self._write_tsv({"HED": ["InvalidTag"] * 5})

        result, output = self._run_main([tabular_path, "-sv", "8.3.0", "-el", "2", "--no-log"])

        self.assertEqual(result, 1)
        # Should mention filtering
        self.assertIn("after filtering", output)

    def test_json_output(self):
        """Test JSON output format."""
        result, output = self._run_main([self.invalid_tabular_path, "-sv", "8.3.0", "-f", "json", "--no-log"])

        self.assertEqual(result, 1)
        # Should be valid JSON
        try:
            json.loads(output)
        except json.JSONDecodeError:
            self.fail("Output should be valid JSON")

    def test_output_file(self):
        """Test that --output-file receives the validation result."""
        output_path = self._make_temp_path(".txt")

        result, _ = self._run_main([self.valid_tabular_path, "-sv", "8.3.0", "-o", output_path, "--no-log"])

        self.assertEqual(result, 0)
        with open(output_path, encoding="utf-8") as result_handle:
            self.assertIn("valid", result_handle.read().lower())

    def test_missing_file(self):
        """Test handling of missing file."""
        # The script catches exceptions, logs the error, and returns 1
        result, _ = self._run_main(["non_existent_file.tsv", "-sv", "8.3.0", "--no-log"])

        self.assertEqual(result, 1)
def test_validate_tabular_parameters(self):
    """Check that the ``validate tabular`` CLI command exposes the script's parameters.

    The expected destinations and flags are listed explicitly below and are
    looked up in the option summary built from the click command.
    """
    # The returned summary is not used; the call only runs the helper against
    # the script's argparse parser.
    script_parser = get_validate_tabular_parser()
    self._get_parser_options(script_parser)

    # Locate the click command: cli -> "validate" group -> "tabular"
    validate_group = cli.commands.get("validate")
    self.assertIsNotNone(validate_group, "validate command group not found")
    tabular_command = validate_group.commands.get("tabular")
    self.assertIsNotNone(tabular_command, "validate tabular command not found in CLI")

    click_summary = self._get_click_options(tabular_command)

    # Exactly one positional argument is expected: the tabular file
    positionals = click_summary["positional"]
    self.assertEqual(len(positionals), 1, f"Should have 1 positional argument, got {len(positionals)}")
    self.assertEqual(positionals[0], "tabular_file", "Positional should be tabular_file")

    # Value-taking options, named by the script parser's dest; the script and
    # the CLI use different names for the sidecar option.
    script_to_cli_name = {"sidecar_file": "sidecar"}
    available_dests = set(click_summary["optional"].keys())
    expected_dests = (
        "schema_version",
        "sidecar_file",
        "error_limit",
        "format",
        "output_file",
        "log_level",
        "log_file",
    )
    for param in expected_dests:
        search_param = script_to_cli_name.get(param, param)
        self.assertIn(search_param, available_dests, f"Parameter '{param}' not found in CLI as '{search_param}'")

    # Boolean flags; each entry of the summary's "flags" carries the name first
    available_flags = {entry[0] for entry in click_summary["flags"]}
    for flag in {"check_for_warnings", "errors_by_file", "log_quiet", "no_log", "verbose"}:
        self.assertIn(flag, available_flags, f"Flag '{flag}' not found in CLI")