From 41aba172ebe7c2b9fa2535bd5078d02278ed9a7e Mon Sep 17 00:00:00 2001
From: Kay Robbins <1189050+VisLab@users.noreply.github.com>
Date: Wed, 21 Jan 2026 12:44:15 -0600
Subject: [PATCH] Added a validate tabular script

---
 hed/cli/cli.py                             | 169 +++++++++++++++++
 hed/scripts/validate_hed_tabular.py        | 203 +++++++++++++++++++++
 pyproject.toml                             |   1 +
 tests/scripts/test_validate_hed_tabular.py | 148 +++++++++++++++
 tests/test_cli_parameter_parity.py         |  46 +++++
 5 files changed, 567 insertions(+)
 create mode 100644 hed/scripts/validate_hed_tabular.py
 create mode 100644 tests/scripts/test_validate_hed_tabular.py

diff --git a/hed/cli/cli.py b/hed/cli/cli.py
index 0b7c542a..8b46c913 100644
--- a/hed/cli/cli.py
+++ b/hed/cli/cli.py
@@ -538,6 +538,175 @@ def validate_sidecar_cmd(
     ctx.exit(result if result is not None else 0)
 
 
+@validate.command(
+    name="tabular",
+    epilog="""
+This command validates HED in a tabular file (TSV) against a specified HED schema
+version. It can optionally include a sidecar file and check for warnings.
+
+\b
+Examples:
+    # Basic validation of a TSV file
+    hedpy validate tabular events.tsv -sv 8.3.0
+
+    # Validate with a sidecar
+    hedpy validate tabular events.tsv -s sidecar.json -sv 8.3.0
+
+    # Validate with multiple schemas (base + library)
+    hedpy validate tabular events.tsv -s sidecar.json -sv 8.3.0 -sv score_1.1.0
+
+    # Check for warnings as well as errors
+    hedpy validate tabular events.tsv -sv 8.4.0 --check-for-warnings
+
+    # Limit reported errors
+    hedpy validate tabular events.tsv -sv 8.4.0 -el 5
+
+    # Save validation results to a file
+    hedpy validate tabular events.tsv -sv 8.4.0 -o validation_results.txt
+""",
+)
+@click.argument("tabular_file", type=click.Path(exists=True))
+# Validation options
+@optgroup.group("Validation options")
+@optgroup.option(
+    "-sv",
+    "--schema-version",
+    required=True,
+    multiple=True,
+    metavar="VERSION",
+    help="HED schema version(s) to validate against (e.g., '8.4.0'). Can be specified multiple times for multiple schemas (e.g., -sv lang_1.1.0 -sv score_2.1.0)",
+)
+@optgroup.option(
+    "-s",
+    "--sidecar",
+    type=click.Path(exists=True),
+    metavar=METAVAR_FILE,
+    help="BIDS JSON sidecar file to use during validation",
+)
+@optgroup.option(
+    "-w",
+    "--check-for-warnings",
+    is_flag=True,
+    help="Check for warnings as well as errors",
+)
+@optgroup.option(
+    "-el",
+    "--error-limit",
+    type=int,
+    metavar=METAVAR_N,
+    help="Limit number of errors reported per code (default: No limit)",
+)
+@optgroup.option(
+    "-ef",
+    "--errors-by-file",
+    is_flag=True,
+    help="If using --error-limit, apply the limit per-file rather than globally",
+)
+# Output options
+@optgroup.group("Output options")
+@optgroup.option(
+    "-f",
+    "--format",
+    type=click.Choice(["text", "json"]),
+    default="text",
+    show_default="text",
+    help="Output format for validation results (text: human-readable; json: structured format for programmatic use)",
+)
+@optgroup.option(
+    "-o",
+    "--output-file",
+    type=click.Path(),
+    default="",
+    metavar=METAVAR_FILE,
+    help="Path for output file to hold validation results; if not specified, output to stdout",
+)
+# Logging options
+@optgroup.group("Logging options")
+@optgroup.option(
+    "-l",
+    "--log-level",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+    default="WARNING",
+    show_default="WARNING",
+    help="Log level for diagnostic messages",
+)
+@optgroup.option(
+    "-v",
+    "--verbose",
+    is_flag=True,
+    help="Output informational messages (equivalent to --log-level INFO)",
+)
+@optgroup.option(
+    "-lf",
+    "--log-file",
+    type=click.Path(),
+    metavar=METAVAR_FILE,
+    help="File path for saving log output; logs still go to stderr unless --log-quiet is also used",
+)
+@optgroup.option(
+    "-lq",
+    "--log-quiet",
+    is_flag=True,
+    help="Suppress log output to stderr; only applicable when --log-file is used (logs go only to file)",
+)
+@optgroup.option(
+    "--no-log",
+    is_flag=True,
+    help="Disable all logging output",
+)
+@click.pass_context
+def validate_tabular_cmd(
+    ctx,
+    tabular_file,
+    schema_version,
+    sidecar,
+    check_for_warnings,
+    error_limit,
+    errors_by_file,
+    format,
+    output_file,
+    log_level,
+    log_file,
+    log_quiet,
+    no_log,
+    verbose,
+):
+    """Validate HED in a tabular file.
+
+    TABULAR_FILE: The path to the tabular file (e.g., TSV) to validate.
+    """
+    from hed.scripts.validate_hed_tabular import main as validate_tabular_main
+
+    # The script's -sv option takes nargs="+" with argparse's default "store" action, so repeating the flag
+    # would keep only the last version; forward all versions after a single -sv instead.
+    args = [tabular_file, "-sv", *schema_version]
+    if sidecar:
+        args.extend(["-s", sidecar])
+    if check_for_warnings:
+        args.append("-w")
+    if error_limit is not None:
+        args.extend(["-el", str(error_limit)])
+    if errors_by_file:
+        args.append("-ef")
+    if format:
+        args.extend(["-f", format])
+    if output_file:
+        args.extend(["-o", output_file])
+    if log_level:
+        args.extend(["-l", log_level])
+    if log_file:
+        args.extend(["-lf", log_file])
+    if log_quiet:
+        args.append("-lq")
+    if no_log:
+        args.append("--no-log")
+    if verbose:
+        args.append("-v")
+
+    result = validate_tabular_main(args)
+    ctx.exit(result if result is not None else 0)
+
+
 @schema.command(name="validate")
 @click.argument("schema_path", type=click.Path(exists=True), nargs=-1, required=True)
 @click.option("--add-all-extensions", is_flag=True, help="Always verify all versions of the same schema are equal")
diff --git a/hed/scripts/validate_hed_tabular.py b/hed/scripts/validate_hed_tabular.py
new file mode 100644
index 00000000..7f505609
--- /dev/null
+++ b/hed/scripts/validate_hed_tabular.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python
+"""
+Validates HED in a tabular file (TSV) against a specified schema version.
+
+This script validates HED in a tabular file, optionally with a JSON sidecar,
+against a specified HED schema version.
+"""
+
+import argparse
+import sys
+import os
+from hed.models import TabularInput, Sidecar
+from hed.errors import ErrorHandler
+from hed.schema import load_schema_version
+from hed.scripts.script_utils import setup_logging, format_validation_results
+
+
+def get_parser():
+    """Build the command-line parser for validate_hed_tabular.
+
+    Returns:
+        argparse.ArgumentParser: Parser with one positional tabular file plus schema, sidecar, output, and logging options.
+    """
+    parser = argparse.ArgumentParser(
+        description="Validate HED in a tabular file against a HED schema", formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    # Required arguments
+    parser.add_argument("tabular_file", help="Tabular file (TSV) to validate")
+    parser.add_argument(
+        "-sv",
+        "--schema-version",
+        required=True,
+        nargs="+",
+        action="extend",  # accumulate values when -sv is repeated; the default "store" would keep only the last use
+        dest="schema_version",
+        help="HED schema version(s) to validate against (e.g., '8.4.0' or '8.3.0 score_1.1.0' for multiple schemas); may be repeated",
+    )
+    # Optional arguments
+    parser.add_argument(
+        "-s",
+        "--sidecar",
+        dest="sidecar_file",
+        help="Optional BIDS JSON sidecar file to use during validation",
+    )
+    parser.add_argument(
+        "-w",
+        "--check-for-warnings",
+        action="store_true",
+        dest="check_for_warnings",
+        help="Check for warnings in addition to errors",
+    )
+
+    # Error limiting
+    error_group = parser.add_argument_group("Error limiting options")
+    error_group.add_argument(
+        "-el",
+        "--error-limit",
+        type=int,
+        dest="error_limit",
+        default=None,
+        help="Limit number of errors reported per code (default: No limit)",
+    )
+    error_group.add_argument(
+        "-ef",
+        "--errors-by-file",
+        action="store_true",
+        dest="errors_by_file",
+        help="If using --error-limit, apply the limit per-file rather than globally",
+    )
+
+    # Output options
+    output_group = parser.add_argument_group("Output options")
+    output_group.add_argument(
+        "-f",
+        "--format",
+        choices=["text", "json"],
+        default="text",
+        help="Output format for validation results (default: %(default)s)",
+    )
+    output_group.add_argument(
+        "-o",
+        "--output-file",
+        default="",
+        dest="output_file",
+        help="Output file for validation results; if not specified, output to stdout",
+    )
+
+    # Logging options
+    logging_group = parser.add_argument_group("Logging options")
+    logging_group.add_argument(
+        "-l",
+        "--log-level",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        default="WARNING",
+        dest="log_level",
+        help="Logging level (default: %(default)s)",
+    )
+    logging_group.add_argument("-lf", "--log-file", default="", dest="log_file", help="File path for saving log output")
+    logging_group.add_argument(
+        "-lq", "--log-quiet", action="store_true", dest="log_quiet", help="Suppress log output to stderr when using --log-file"
+    )
+    logging_group.add_argument("--no-log", action="store_true", dest="no_log", help="Disable all logging output")
+    logging_group.add_argument("-v", "--verbose", action="store_true", help="Output informational messages")
+
+    return parser
+
+
+def main(arg_list=None):
+    """Validate HED in a tabular file, optionally using a JSON sidecar, and report any issues found.
+
+    Parameters:
+        arg_list (list or None): Command line arguments; None lets argparse fall back to sys.argv[1:].
+
+    Returns:
+        int: 0 if no issues were found; 1 if issues were reported or an exception was caught.
+    """
+    parser = get_parser()
+    args = parser.parse_args(arg_list)
+
+    # Set up logging
+    setup_logging(args.log_level, args.log_file, args.log_quiet, args.verbose, args.no_log)
+
+    import logging
+
+    logger = logging.getLogger("validate_hed_tabular")
+    effective_level_name = logging.getLevelName(logger.getEffectiveLevel())
+    logger.info(
+        "Starting HED validation of tabular file with effective log level: %s (requested: %s, verbose=%s)",
+        effective_level_name,
+        args.log_level,
+        "on" if args.verbose else "off",
+    )
+
+    try:
+        # Load schema (handle single version or list of versions)
+        schema_versions = args.schema_version[0] if len(args.schema_version) == 1 else args.schema_version
+        logger.info(f"Loading HED schema version(s) {schema_versions}")
+        schema = load_schema_version(schema_versions)
+
+        # Parse Sidecar if provided
+        sidecar = None
+        issues = []
+        error_handler = ErrorHandler(check_for_warnings=args.check_for_warnings)
+
+        if args.sidecar_file:
+            logger.info("Loading Sidecar file")
+            sidecar = Sidecar(args.sidecar_file, name=os.path.basename(args.sidecar_file))
+            sidecar_issues = sidecar.validate(schema, name=sidecar.name, error_handler=error_handler)
+            issues += sidecar_issues
+            if sidecar_issues:
+                logger.warning(f"Found {len(sidecar_issues)} issues in sidecar validation")
+
+        # Parse and Validate Tabular Input
+        logger.info("Loading Tabular file")
+        tabular_input = TabularInput(args.tabular_file, sidecar=sidecar, name=os.path.basename(args.tabular_file))
+
+        logger.info("Validating Tabular file")
+        tabular_issues = tabular_input.validate(schema, name=tabular_input.name, error_handler=error_handler)
+        issues += tabular_issues
+
+        # Handle output
+        if issues:
+            output = format_validation_results(
+                issues,
+                output_format=args.format,
+                title_message="HED validation issues:",
+                error_limit=args.error_limit,
+                errors_by_file=args.errors_by_file,
+            )
+
+            # Write output as UTF-8 so non-ASCII report text does not depend on the platform's default encoding
+            if args.output_file:
+                with open(args.output_file, "w", encoding="utf-8") as f:
+                    f.write(output)
+                logger.info(f"Validation errors written to {args.output_file}")
+            else:
+                print(output)
+
+            return 1  # Exit with error code if validation failed
+        else:
+            success_msg = "Tabular file has valid HED!"
+            if args.output_file:
+                with open(args.output_file, "w", encoding="utf-8") as f:
+                    f.write(success_msg + "\n")
+                logger.info(f"Validation results written to {args.output_file}")
+            else:
+                print(success_msg)
+
+            return 0
+
+    except Exception as e:
+        logger.error(f"Validation failed: {str(e)}")
+        # If verbose, print stack trace
+        if args.verbose:
+            import traceback
+
+            traceback.print_exc()
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # use main()'s return value as the process exit status
diff --git a/pyproject.toml b/pyproject.toml
index 5e5f5fcf..58ae3718 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,6 +101,7 @@ hedpy = "hed.cli.cli:main"
 validate_bids = "hed.scripts.validate_bids:main"
 validate_hed_string = "hed.scripts.validate_hed_string:main"
 validate_hed_sidecar = "hed.scripts.validate_hed_sidecar:main"
+validate_hed_tabular = "hed.scripts.validate_hed_tabular:main"
 hed_extract_bids_sidecar = "hed.scripts.hed_extract_bids_sidecar:main"
 hed_validate_schemas = "hed.scripts.validate_schemas:main"
 hed_update_schemas = "hed.scripts.hed_convert_schema:main"
diff --git a/tests/scripts/test_validate_hed_tabular.py b/tests/scripts/test_validate_hed_tabular.py
new file mode 100644
index 00000000..3d2a96b5
--- /dev/null
+++ b/tests/scripts/test_validate_hed_tabular.py
@@ -0,0 +1,148 @@
+"""Tests for validate_hed_tabular script."""
+
+import os
+import io
+import json
+import unittest
+import tempfile
+import pandas as pd
+from unittest.mock import patch
+from hed.scripts.validate_hed_tabular import main
+
+
+class TestValidateHedTabular(unittest.TestCase):
+    """Exercise the validate_hed_tabular script's main() against temporary TSV and JSON fixtures."""
+
+    def setUp(self):
+        """Create temporary fixtures: a valid TSV, a TSV with an invalid HED tag, and a JSON sidecar."""
+        # Valid tabular file; delete=False keeps it on disk after close() so main() can open it by name
+        self.valid_data = {
+            "onset": [1.0, 2.0],
+            "duration": [0.5, 0.5],
+            "trial_type": ["show_face", "press_button"],
+            "HED": ["Sensory-event", "Agent-action"],
+        }
+        self.valid_df = pd.DataFrame(self.valid_data)
+        self.valid_tabular_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv")
+        self.valid_df.to_csv(self.valid_tabular_file.name, sep="\t", index=False)
+        self.valid_tabular_file.close()
+
+        # Invalid tabular file: a single row whose HED column holds "InvalidTag"
+        self.invalid_data = {"onset": [1.0], "duration": [0.5], "HED": ["InvalidTag"]}
+        self.invalid_df = pd.DataFrame(self.invalid_data)
+        self.invalid_tabular_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv")
+        self.invalid_df.to_csv(self.invalid_tabular_file.name, sep="\t", index=False)
+        self.invalid_tabular_file.close()
+
+        # Sidecar mapping each trial_type value used above to a HED string
+        self.valid_sidecar_content = {"trial_type": {"HED": {"show_face": "Sensory-event", "press_button": "Agent-action"}}}
+        self.valid_sidecar_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
+        json.dump(self.valid_sidecar_content, self.valid_sidecar_file)
+        self.valid_sidecar_file.close()
+
+    def tearDown(self):
+        """Remove the temporary files created in setUp (they were opened with delete=False)."""
+        if os.path.exists(self.valid_tabular_file.name):
+            os.remove(self.valid_tabular_file.name)
+        if os.path.exists(self.invalid_tabular_file.name):
+            os.remove(self.invalid_tabular_file.name)
+        if os.path.exists(self.valid_sidecar_file.name):
+            os.remove(self.valid_sidecar_file.name)
+
+    def test_valid_tabular(self):
+        """A tabular file with valid HED returns 0 and prints a message containing 'valid'."""
+        arg_list = [self.valid_tabular_file.name, "-sv", "8.3.0", "--no-log"]
+
+        with patch("sys.stdout", new=io.StringIO()) as mock_stdout:
+            result = main(arg_list)
+            output = mock_stdout.getvalue()
+
+        self.assertEqual(result, 0, "Valid tabular should return 0")
+        self.assertIn("valid", output.lower())
+
+    def test_invalid_tabular(self):
+        """A tabular file with an invalid HED tag returns 1 and the printed report mentions 'error'."""
+        arg_list = [self.invalid_tabular_file.name, "-sv", "8.3.0", "--no-log"]
+
+        with patch("sys.stdout", new=io.StringIO()) as mock_stdout:
+            result = main(arg_list)
+            output = mock_stdout.getvalue()
+
+        self.assertEqual(result, 1, "Invalid tabular should return 1")
+        self.assertIn("error", output.lower())
+
+    def test_validation_with_sidecar(self):
+        """A row with an empty HED column returns 0 when validated together with the fixture sidecar."""
+        # Empty HED column; trial_type is in the sidecar. NOTE(review): may also pass without the sidecar - confirm
+        data = {"onset": [1.0], "duration": [0.5], "trial_type": ["show_face"], "HED": [""]}
+        df = pd.DataFrame(data)
+
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+            df.to_csv(f.name, sep="\t", index=False)
+            tabular_filename = f.name
+
+        try:
+            arg_list = [tabular_filename, "-s", self.valid_sidecar_file.name, "-sv", "8.3.0", "--no-log"]
+
+            with patch("sys.stdout", new=io.StringIO()) as mock_stdout:
+                result = main(arg_list)
+                output = mock_stdout.getvalue()
+
+            self.assertEqual(result, 0)
+            self.assertIn("valid", output.lower())
+        finally:
+            if os.path.exists(tabular_filename):
+                os.remove(tabular_filename)
+
+    def test_error_limiting(self):
+        """With -el 2 and five identical invalid tags, main() returns 1 and the output says 'after filtering'."""
+        # Five rows that each hold the same invalid tag
+        data = {"HED": ["InvalidTag"] * 5}
+        df = pd.DataFrame(data)
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+            df.to_csv(f.name, sep="\t", index=False)
+            tabular_filename = f.name
+
+        try:
+            # Run with an error limit of 2
+            arg_list = [tabular_filename, "-sv", "8.3.0", "-el", "2", "--no-log"]
+            with patch("sys.stdout", new=io.StringIO()) as mock_stdout:
+                result = main(arg_list)
+                output = mock_stdout.getvalue()
+
+            self.assertEqual(result, 1)
+            # Should mention filtering
+            self.assertIn("after filtering", output)
+
+        finally:
+            if os.path.exists(tabular_filename):
+                os.remove(tabular_filename)
+
+    def test_json_output(self):
+        """With -f json, the report printed for the invalid file parses as JSON."""
+        arg_list = [self.invalid_tabular_file.name, "-sv", "8.3.0", "-f", "json", "--no-log"]
+
+        with patch("sys.stdout", new=io.StringIO()) as mock_stdout:
+            result = main(arg_list)
+            output = mock_stdout.getvalue()
+
+        self.assertEqual(result, 1)
+        # Should be valid JSON
+        try:
+            json.loads(output)
+        except json.JSONDecodeError:
+            self.fail("Output should be valid JSON")
+
+    def test_missing_file(self):
+        """A nonexistent input path makes main() return 1 rather than raise."""
+        arg_list = ["non_existent_file.tsv", "-sv", "8.3.0", "--no-log"]
+
+        with patch("sys.stdout", new=io.StringIO()):
+            # The script catches exceptions and logs error, returns 1
+            result = main(arg_list)
+
+        self.assertEqual(result, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_cli_parameter_parity.py b/tests/test_cli_parameter_parity.py
index 8040e31b..9b62d684 100644
--- a/tests/test_cli_parameter_parity.py
+++ b/tests/test_cli_parameter_parity.py
@@ -11,6 +11,7 @@
 from hed.scripts.extract_tabular_summary import get_parser as get_extract_summary_parser
 from hed.scripts.validate_schemas import get_parser as get_validate_schemas_parser
 from hed.scripts.validate_hed_sidecar import get_parser as get_validate_sidecar_parser
+from hed.scripts.validate_hed_tabular import get_parser as get_validate_tabular_parser
 
 
 class TestCLIParameterParity(unittest.TestCase):
@@ -298,6 +299,51 @@ def test_validate_sidecar_parameters(self):
         for flag in required_flags:
             self.assertIn(flag, cli_flags, f"Flag '{flag}' not found in CLI")
 
+    def test_validate_tabular_parameters(self):
+        """Check that the 'validate tabular' CLI command exposes the positional, options, and flags listed below."""
+        # Build the script's argparse parser
+        original_parser = get_validate_tabular_parser()
+        self._get_parser_options(original_parser)  # NOTE(review): result is discarded; nothing is compared to it
+
+        # Look up the click command under the "validate" group
+        validate_group = cli.commands.get("validate")
+        self.assertIsNotNone(validate_group, "validate command group not found")
+        cli_command = validate_group.commands.get("tabular")
+
+        self.assertIsNotNone(cli_command, "validate tabular command not found in CLI")
+        cli_opts = self._get_click_options(cli_command)
+
+        # Check positional arguments (should have tabular_file)
+        self.assertEqual(
+            len(cli_opts["positional"]), 1, f"Should have 1 positional argument, got {len(cli_opts['positional'])}"
+        )
+        self.assertEqual(cli_opts["positional"][0], "tabular_file", "Positional should be tabular_file")
+
+        # Check that key optional parameters exist (names follow the argparse dest values)
+        required_params = [
+            "schema_version",
+            "sidecar_file",
+            "error_limit",
+            "format",
+            "output_file",
+            "log_level",
+            "log_file",
+        ]
+        # Mapping for naming differences: argparse dest -> click parameter name
+        dest_map = {"sidecar_file": "sidecar"}
+        cli_dests = set(cli_opts["optional"].keys())
+
+        for param in required_params:
+            search_param = dest_map.get(param, param)
+            self.assertIn(search_param, cli_dests, f"Parameter '{param}' not found in CLI as '{search_param}'")
+
+        # Check flags
+        required_flags = {"check_for_warnings", "errors_by_file", "log_quiet", "no_log", "verbose"}
+        cli_flags = {flag[0] for flag in cli_opts["flags"]}
+
+        for flag in required_flags:
+            self.assertIn(flag, cli_flags, f"Flag '{flag}' not found in CLI")
+
     def test_schema_add_ids_parameters(self):
         """Test schema add-ids uses positional arguments."""
         schema_group = cli.commands.get("schema")