diff --git a/.vscode/launch.json b/.vscode/launch.json index 4c8e7641..59e53006 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -54,9 +54,12 @@ "console": "integratedTerminal", // A small log_file that has a reasonable amount of data, and known_hash to verify download //"args": ["-v", "1", "--log_file", "ahi/missionlogs/2025/20250908_20250912/20250911T201546/202509112015_202509112115.nc4", "--known_hash", "d1235ead55023bea05e9841465d54a45dfab007a283320322e28b84438fb8a85"] - // Has bad latitude and longitude values and lots of bad Universal latitude_time values + // brizo 20250914T080941 has bad latitude and longitude values and lots of bad Universal latitude_time and longitude_time values //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] - "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/latitude_time"] + //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/longitude_time"] + //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/latitude_time"] + // brizo 20250916T230652 has several ESP Samples from stoqs_lrauv_sep2025 + "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--plot_time", "/longitude_time"] }, { "name": "2.0 - calibrate.py", @@ -112,7 +115,8 @@ "program": "${workspaceFolder}/src/data/combine.py", "console": "integratedTerminal", "justMyCode": false, - "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] + //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] + "args": ["-v", "1", 
"--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4"] }, { "name": "3.0 - align.py", @@ -321,7 +325,8 @@ "console": "integratedTerminal", //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--clobber"] - "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--clobber", "--no_cleanup"] + //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--clobber", "--no_cleanup"] + "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--no_cleanup"] //"args": ["-v", "1", "--auv_name", "tethys", "--start", "20120901", "--end", "20121101", "--noinput"] }, diff --git a/LRAUV_WORKFLOW.md b/LRAUV_WORKFLOW.md index 98307671..4fcfc26f 100644 --- a/LRAUV_WORKFLOW.md +++ b/LRAUV_WORKFLOW.md @@ -13,24 +13,26 @@ on the local file system's work directory is as follows: │ │ ├── <- e.g.: ahi, brizo, pontus, tethys, ... 
│ │ │ ├── missionlogs/year/dlist_dir │ │ │ │ ├── <- e.g.: ahi/missionlogs/2025/20250908_20250912/20250911T201546/202509112015_202509112115.nc4 - │ │ │ │ │ ├── <- .nc4 file containing original data + │ │ │ │ │ ├── <- .nc4 file containing original data - created by unserialize │ │ │ │ │ ├── <- .nc files, one for each group from the .nc4 file - | | | | | | data identical to original in NETCDF4 format - │ │ │ │ │ ├── <_cal> <- A single NETCDF3 .nc file containing all the - | | | | | | varibles from the .nc files along with nudged - | | | | | | latitudes and longitudes - created by combine.py + | | | | | | data identical to original in NetCDF4 format, + | | | | | | but in more interoperable NetCDF3 format + | | | | | | - created by nc42netcdfs.py + │ │ │ │ │ ├── <_combined> <- A single NetCDF3 .nc file containing all the + | | | | | | variables from the .nc files along with nudged + | | | | | | latitudes and longitudes - created by combine.py │ │ │ │ │ ├── <_align> <- .nc file with all measurement variables | | | | | | having associated coordinate variables - | | | | | | at original instrument sampling rate - - | | | | | | created by align.py + | | | | | | at original instrument sampling rate + | | | | | | - created by align.py │ │ │ │ │ ├── <_nS> <- .nc file with all measurement variables resampled to a common time grid at n Second intervals - created by resample.py nc42netcdfs.py Extract the groups and the variables we want from the groups into - individual .nc files. These data are saved using NETCDF4 format as - there are many unlimited dimensions that are not allowed in NETCDF3. + individual .nc files. These data are saved using NetCDF4 format as + there are many unlimited dimensions that are not allowed in NetCDF3. The data in the .nc files are identical to what is in the .nc4 groups. 
combine.py diff --git a/README.md b/README.md index 81417927..4861c407 100644 --- a/README.md +++ b/README.md @@ -103,11 +103,11 @@ First time use with Docker on a server using a service account: * git clone git@github.com:mbari-org/auv-python.git * cd auv-python * Create a .env file in `/opt/auv-python` with the following contents: - `M3_VOL=` - `AUVCTD_VOL=` - `CALIBRATION_VOL=` - `WORK_VOL=/data` - + `M3_VOL=` + `AUVCTD_VOL=` + `CALIBRATION_VOL=` + `WORK_VOL=/data` + `HOST_NAME=` After installation and when logging into the server again mission data can be processed thusly: * Setting up environment and printing help message: `sudo -u docker_user -i` diff --git a/src/data/AUV.py b/src/data/AUV.py index c8bef718..8dd913e1 100755 --- a/src/data/AUV.py +++ b/src/data/AUV.py @@ -11,6 +11,7 @@ import logging from datetime import datetime +import cf_xarray # Needed for the .cf accessor # noqa: F401 import numpy as np import xarray as xr @@ -88,22 +89,23 @@ def nudge_positions( # noqa: C901, PLR0912, PLR0913, PLR0915 if lon[:][segi].any(): lon_nudged_array = lon[segi] lat_nudged_array = lat[segi] - dt_nudged = lon.get_index("navigation_time")[segi] + dt_nudged = lon.cf["T"][segi] logger.debug( "Filled _nudged arrays with %d values starting at %s " "which were before the first GPS fix at %s", len(segi), - lat.get_index("navigation_time")[0], - lat_fix.get_index("gps_time")[0], + lat.cf["T"].data[0], + lat_fix.cf["T"].data[0], ) else: lon_nudged_array = np.array([]) lat_nudged_array = np.array([]) dt_nudged = np.array([], dtype="datetime64[ns]") if segi.any(): - seg_min = ( - lat.get_index("navigation_time")[segi][-1] - lat.get_index("navigation_time")[segi][0] - ).total_seconds() / 60 + # Return difference of numpy timestamps in units of minutes + seg_min = (lat.cf["T"].data[segi][-1] - lat.cf["T"].data[segi][0]).astype( + "timedelta64[s]" + ).astype(float) / 60.0 else: seg_min = 0 logger.info( diff --git a/src/data/align.py b/src/data/align.py index 9e07d43e..7e69109a 
100755 --- a/src/data/align.py +++ b/src/data/align.py @@ -14,6 +14,7 @@ import argparse import logging +import os import re import sys import time @@ -34,6 +35,7 @@ TIME60HZ, AUV_NetCDF, ) +from nc42netcdfs import BASE_LRAUV_PATH from scipy.interpolate import interp1d @@ -41,6 +43,10 @@ class InvalidCalFile(Exception): pass +class InvalidCombinedFile(Exception): + pass + + class Align_NetCDF: logger = logging.getLogger(__name__) _handler = logging.StreamHandler() @@ -52,6 +58,8 @@ def global_metadata(self): """Use instance variables to return a dictionary of metadata specific for the data that are written """ + # Try to get actual host name, fall back to container name + actual_hostname = os.getenv("HOST_NAME", gethostname()) repo = git.Repo(search_parent_directories=True) try: gitcommit = repo.head.object.hexsha @@ -93,17 +101,30 @@ def global_metadata(self): metadata["useconst"] = "Not intended for legal use. Data may contain inaccuracies." metadata["history"] = f"Created by {self.commandline} on {iso_now}" - metadata["title"] = ( - f"Calibrated and aligned AUV sensor data from" - f" {self.args.auv_name} mission {self.args.mission}" - ) - from_data = "calibrated data" - metadata["source"] = ( - f"MBARI Dorado-class AUV data produced from {from_data}" - f" with execution of '{self.commandline}' at {iso_now} on" - f" host {gethostname()} using git commit {gitcommit} from" - f" software at 'https://github.com/mbari-org/auv-python'" - ) + if self.args.auv_name and self.args.mission: + metadata["title"] = ( + f"Calibrated and aligned AUV sensor data from" + f" {self.args.auv_name} mission {self.args.mission}" + ) + from_data = "calibrated data" + metadata["source"] = ( + f"MBARI Dorado-class AUV data produced from {from_data}" + f" with execution of '{self.commandline}' at {iso_now} on" + f" host {actual_hostname} using git commit {gitcommit} from" + f" software at 'https://github.com/mbari-org/auv-python'" + ) + elif self.args.log_file: + metadata["title"] = ( + 
f"Combined and aligned LRAUV instrument data from" + f" log file {Path(self.args.log_file).name}" + ) + from_data = "combined data" + metadata["source"] = ( + f"MBARI Long Range AUV data produced from {from_data}" + f" with execution of '{self.commandline}' at {iso_now} on" + f" host {actual_hostname} using git commit {gitcommit} from" + f" software at 'https://github.com/mbari-org/auv-python'" + ) metadata["summary"] = ( "Observational oceanographic data obtained from an Autonomous" " Underwater Vehicle mission with measurements at" @@ -112,10 +133,16 @@ def global_metadata(self): " software." ) # Append location of original data files to summary - matches = re.search( - "(" + SUMMARY_SOURCE.replace("{}", r".+$") + ")", - self.calibrated_nc.attrs["summary"], - ) + if self.args.auv_name and self.args.mission: + matches = re.search( + "(" + SUMMARY_SOURCE.replace("{}", r".+$") + ")", + self.calibrated_nc.attrs["summary"], + ) + elif self.args.log_file: + matches = re.search( + "(" + SUMMARY_SOURCE.replace("{}", r".+$") + ")", + self.combined_nc.attrs["summary"], + ) if matches: metadata["summary"] += " " + matches.group(1) metadata["comment"] = ( @@ -127,16 +154,20 @@ def global_metadata(self): return metadata - def process_cal(self, vehicle: str = "", name: str = "") -> None: # noqa: C901, PLR0912, PLR0915 + def process_cal(self, vehicle: str = "", name: str = "", log_file: str = "") -> None: # noqa: C901, PLR0912, PLR0915 name = name or self.args.mission vehicle = vehicle or self.args.auv_name - netcdfs_dir = Path(self.args.base_path, vehicle, MISSIONNETCDFS, name) - in_fn = f"{vehicle}_{name}_cal.nc" - try: - self.calibrated_nc = xr.open_dataset(Path(netcdfs_dir, in_fn)) - except ValueError as e: - raise InvalidCalFile(e) from e - self.logger.info("Processing %s from %s", in_fn, netcdfs_dir) + if name and vehicle: + netcdfs_dir = Path(self.args.base_path, vehicle, MISSIONNETCDFS, name) + src_file = Path(netcdfs_dir, f"{vehicle}_{name}_cal.nc") + elif log_file: + 
netcdfs_dir = Path(BASE_LRAUV_PATH, f"{Path(log_file).parent}") + src_file = Path(netcdfs_dir, f"{Path(log_file).stem}_cal.nc") + else: + msg = "Must provide either mission and vehicle or log_file" + raise ValueError(msg) + self.calibrated_nc = xr.open_dataset(src_file) + self.logger.info("Processing %s", src_file) self.aligned_nc = xr.Dataset() self.min_time = datetime.now(UTC) self.max_time = datetime(1970, 1, 1, tzinfo=UTC) @@ -178,7 +209,7 @@ def process_cal(self, vehicle: str = "", name: str = "") -> None: # noqa: C901, bounds_error=False, ) except KeyError: - error_message = f"No nudged_latitude data in {in_fn}" + error_message = f"No nudged_latitude data in {src_file}" raise InvalidCalFile(error_message) from None lon_interp = interp1d( self.calibrated_nc["nudged_longitude"].get_index("time").view(np.int64).tolist(), @@ -278,7 +309,7 @@ def process_cal(self, vehicle: str = "", name: str = "") -> None: # noqa: C901, ) self.aligned_nc[f"{instr}_latitude"].attrs = self.calibrated_nc["nudged_latitude"].attrs self.aligned_nc[f"{instr}_latitude"].attrs["comment"] += ( - f". Variable nudged_latitude from {in_fn} file linearly" + f". Variable nudged_latitude from {src_file} file linearly" f" interpolated onto {variable.split('_')[0]} time values." ) self.aligned_nc[f"{instr}_latitude"].attrs["long_name"] = "Latitude" @@ -294,7 +325,7 @@ def process_cal(self, vehicle: str = "", name: str = "") -> None: # noqa: C901, "nudged_longitude" ].attrs self.aligned_nc[f"{instr}_longitude"].attrs["comment"] += ( - f". Variable nudged_longitude from {in_fn} file linearly" + f". Variable nudged_longitude from {src_file} file linearly" f" interpolated onto {variable.split('_')[0]} time values." 
) self.aligned_nc[f"{instr}_longitude"].attrs["long_name"] = "Longitude" @@ -329,6 +360,229 @@ def process_cal(self, vehicle: str = "", name: str = "") -> None: # noqa: C901, return netcdfs_dir + def process_combined(self, log_file: str) -> None: # noqa: C901, PLR0912, PLR0915 + """Process combined LRAUV data from *_combined.nc files created by combine.py""" + netcdfs_dir = Path(BASE_LRAUV_PATH, f"{Path(log_file).parent}") + src_file = Path(netcdfs_dir, f"{Path(log_file).stem}_combined.nc") + + self.combined_nc = xr.open_dataset(src_file) + self.logger.info("Processing %s", src_file) + self.aligned_nc = xr.Dataset() + self.min_time = datetime.now(UTC) + self.max_time = datetime(1970, 1, 1, tzinfo=UTC) + self.min_depth = np.inf + self.max_depth = -np.inf + self.min_lat = np.inf + self.max_lat = -np.inf + self.min_lon = np.inf + self.max_lon = -np.inf + + # Find navigation coordinates from combined data - must be from universals group + nav_coords = {} + for coord_type in ["longitude", "latitude", "depth", "time"]: + coord_var = f"universals_{coord_type}" + if coord_var not in self.combined_nc: + error_message = ( + f"Required universals coordinate {coord_var} not found in {src_file}" + ) + raise InvalidCombinedFile(error_message) + nav_coords[coord_type] = coord_var + self.logger.info("Found navigation coordinate: %s", coord_var) + + # Create interpolators for navigation coordinates + try: + lat_interp = interp1d( + self.combined_nc[nav_coords["latitude"]] + .get_index("universals_time") + .view(np.int64) + .tolist(), + self.combined_nc[nav_coords["latitude"]].values, + fill_value=( + self.combined_nc[nav_coords["latitude"]][0], + self.combined_nc[nav_coords["latitude"]][-1], + ), + bounds_error=False, + ) + + lon_interp = interp1d( + self.combined_nc[nav_coords["longitude"]] + .get_index("universals_time") + .view(np.int64) + .tolist(), + self.combined_nc[nav_coords["longitude"]].values, + fill_value=( + self.combined_nc[nav_coords["longitude"]][0], + 
self.combined_nc[nav_coords["longitude"]][-1], + ), + bounds_error=False, + ) + + depth_interp = interp1d( + self.combined_nc[nav_coords["depth"]] + .get_index("universals_time") + .view(np.int64) + .tolist(), + self.combined_nc[nav_coords["depth"]].values, + fill_value=( + self.combined_nc[nav_coords["depth"]][0], + self.combined_nc[nav_coords["depth"]][-1], + ), + bounds_error=False, + ) + + except KeyError as e: + error_message = f"Missing navigation data in {src_file}: {e}" + raise InvalidCombinedFile(error_message) from e + except ValueError as e: + error_message = f"Cannot interpolate navigation coordinates: {e}" + raise InvalidCombinedFile(error_message) from e + + # Process group-based variables (skip coordinate variables) + for variable in self.combined_nc: + # Skip time coordinate variables + if variable.endswith("_time"): + continue + + # Skip the navigation coordinate variables themselves + if variable in nav_coords.values(): + continue + + # Extract group name from variable (e.g., "ctd_seabird_salinity" -> "ctd_seabird") + var_parts = variable.split("_") + if len(var_parts) < 2: # noqa: PLR2004 + self.logger.debug("Skipping variable with unexpected name format: %s", variable) + continue + + # Try to find the corresponding time coordinate + # Look for pattern: group_name + "_time" + possible_time_coords = [] + for i in range(len(var_parts)): + group_candidate = "_".join(var_parts[: i + 1]) + time_coord_candidate = f"{group_candidate}_time" + if time_coord_candidate in self.combined_nc: + possible_time_coords.append((group_candidate, time_coord_candidate)) + + if not possible_time_coords: + self.logger.warning("No time coordinate found for variable: %s", variable) + continue + + # Use the longest matching group name (most specific) + group_name, timevar = max(possible_time_coords, key=lambda x: len(x[0])) + self.logger.debug( + "Processing %s with group %s and time %s", variable, group_name, timevar + ) + + # Copy the original variable + 
self.aligned_nc[variable] = self.combined_nc[variable] + + # Get the time index for this variable + var_time = self.aligned_nc[variable].get_index(timevar).view(np.int64).tolist() + + # Calculate sampling rate + sample_rate = np.round( + 1.0 / (np.mean(np.diff(self.combined_nc[timevar])) / np.timedelta64(1, "s")), + decimals=2, + ) + + # Create aligned variable with proper attributes + self.aligned_nc[variable] = xr.DataArray( + self.combined_nc[variable].values, + dims={timevar}, + coords=[self.combined_nc[variable].get_index(timevar)], + name=variable, + ) + self.aligned_nc[variable].attrs = self.combined_nc[variable].attrs + self.aligned_nc[variable].attrs["coordinates"] = ( + f"{group_name}_time {group_name}_depth {group_name}_latitude {group_name}_longitude" + ) + self.logger.info("%s: instrument_sample_rate_hz = %.2f", variable, sample_rate) + self.aligned_nc[variable].attrs["instrument_sample_rate_hz"] = sample_rate + + # Create interpolated coordinate variables for this group + coord_names = ["depth", "latitude", "longitude"] + coord_interps = [depth_interp, lat_interp, lon_interp] + coord_sources = [nav_coords["depth"], nav_coords["latitude"], nav_coords["longitude"]] + + for coord_name, coord_interp, coord_source in zip( + coord_names, coord_interps, coord_sources, strict=True + ): + coord_var_name = f"{group_name}_{coord_name}" + + self.aligned_nc[coord_var_name] = xr.DataArray( + coord_interp(var_time).astype(np.float64).tolist(), + dims={timevar}, + coords=[self.combined_nc[variable].get_index(timevar)], + name=coord_var_name, + ) + + # Copy attributes from source coordinate + if coord_source in self.combined_nc: + self.aligned_nc[coord_var_name].attrs = self.combined_nc[coord_source].attrs + + # Update attributes + self.aligned_nc[coord_var_name].attrs["long_name"] = coord_name.title() + self.aligned_nc[coord_var_name].attrs["instrument_sample_rate_hz"] = sample_rate + + if coord_name in ["latitude", "longitude"]: + 
self.aligned_nc[coord_var_name].attrs["comment"] = ( + self.aligned_nc[coord_var_name].attrs.get("comment", "") + + f". Variable {coord_source} from {src_file} file linearly" + f" interpolated onto {group_name} time values." + ) + + # Update spatial temporal bounds for global metadata + if pd.to_datetime(self.aligned_nc[timevar][0].values).tz_localize(UTC) < pd.to_datetime( + self.min_time + ): + self.min_time = pd.to_datetime(self.aligned_nc[timevar][0].values).tz_localize(UTC) + if pd.to_datetime(self.aligned_nc[timevar][-1].values).tz_localize( + UTC + ) > pd.to_datetime(self.max_time): + self.max_time = pd.to_datetime(self.aligned_nc[timevar][-1].values).tz_localize(UTC) + + # Update bounds using the interpolated coordinates + depth_coord = f"{group_name}_depth" + lat_coord = f"{group_name}_latitude" + lon_coord = f"{group_name}_longitude" + + if self.aligned_nc[depth_coord].min() < self.min_depth: + self.min_depth = self.aligned_nc[depth_coord].min().to_numpy() + if self.aligned_nc[depth_coord].max() > self.max_depth: + self.max_depth = self.aligned_nc[depth_coord].max().to_numpy() + if self.aligned_nc[lat_coord].min() < self.min_lat: + self.min_lat = self.aligned_nc[lat_coord].min().to_numpy() + if self.aligned_nc[lat_coord].max() > self.max_lat: + self.max_lat = self.aligned_nc[lat_coord].max().to_numpy() + if self.aligned_nc[lon_coord].min() < self.min_lon: + self.min_lon = self.aligned_nc[lon_coord].min().to_numpy() + if self.aligned_nc[lon_coord].max() > self.max_lon: + self.max_lon = self.aligned_nc[lon_coord].max().to_numpy() + + return netcdfs_dir + + def write_combined_netcdf( + self, netcdfs_dir, vehicle: str = "", name: str = "", log_file: str = "" + ) -> None: + """Write aligned combined data to NetCDF file""" + if log_file: + # For LRAUV log files, use the log file stem for output name + out_fn = Path(netcdfs_dir, f"{Path(log_file).stem}_align.nc") + else: + name = name or self.args.mission + vehicle = vehicle or self.args.auv_name + out_fn = 
Path(netcdfs_dir, f"{vehicle}_{name}_align.nc") + + self.aligned_nc.attrs = self.global_metadata() + self.logger.info("Writing aligned combined data to %s", out_fn) + if out_fn.exists(): + self.logger.debug("Removing existing file %s", out_fn) + out_fn.unlink() + self.aligned_nc.to_netcdf(out_fn) + self.logger.info( + "Data variables written: %s", + ", ".join(sorted(self.aligned_nc.variables)), + ) + def write_netcdf(self, netcdfs_dir, vehicle: str = "", name: str = "") -> None: name = name or self.args.mission vehicle = vehicle or self.args.auv_name @@ -349,6 +603,13 @@ def process_command_line(self): examples += " Align calibrated data for some missions:\n" examples += " " + sys.argv[0] + " --mission 2020.064.10\n" examples += " " + sys.argv[0] + " --auv_name i2map --mission 2020.055.01\n" + examples += " Align combined LRAUV data:\n" + examples += ( + " " + + sys.argv[0] + + " --log_file brizo/missionlogs/2025/20250909_20250915/20250914T080941/" + + "202509140809_202509150109.nc4\n" + ) parser = argparse.ArgumentParser( formatter_class=RawTextHelpFormatter, @@ -373,6 +634,15 @@ def process_command_line(self): action="store", help="Mission directory, e.g.: 2020.064.10", ) + parser.add_argument( + "--log_file", + action="store", + help=( + "Path to the log file of original LRAUV data, e.g.: " + "brizo/missionlogs/2025/20250903_20250909/" + "20250905T072042/202509050720_202509051653.nc4" + ), + ) parser.add_argument( "--plot", action="store_true", @@ -401,6 +671,17 @@ def process_command_line(self): align_netcdf = Align_NetCDF() align_netcdf.process_command_line() p_start = time.time() - netcdf_dir = align_netcdf.process_cal() - align_netcdf.write_netcdf(netcdf_dir) + + if align_netcdf.args.log_file: + # Process combined LRAUV data using log_file + netcdf_dir = align_netcdf.process_combined(log_file=align_netcdf.args.log_file) + align_netcdf.write_combined_netcdf(netcdf_dir, log_file=align_netcdf.args.log_file) + elif align_netcdf.args.auv_name and 
align_netcdf.args.mission: + # Process calibrated data using auv_name and mission + netcdf_dir = align_netcdf.process_cal() + align_netcdf.write_netcdf(netcdf_dir) + else: + align_netcdf.logger.error("Must provide either --log_file or both --auv_name and --mission") + sys.exit(1) + align_netcdf.logger.info("Time to process: %.2f seconds", (time.time() - p_start)) diff --git a/src/data/calibrate.py b/src/data/calibrate.py index 2cdc8941..c9e735f0 100755 --- a/src/data/calibrate.py +++ b/src/data/calibrate.py @@ -1676,7 +1676,7 @@ def _nudge_pos(self, max_sec_diff_at_end=10): auv_name=self.args.auv_name, mission=self.args.mission, max_sec_diff_at_end=max_sec_diff_at_end, - create_plots=True, + create_plots=False, ) # Store results in instance variables for compatibility diff --git a/src/data/combine.py b/src/data/combine.py index e29963cb..0f04e03d 100755 --- a/src/data/combine.py +++ b/src/data/combine.py @@ -1,18 +1,21 @@ #!/usr/bin/env python """ -Combine original LRAUV data from separate .nc files and produce a single NetCDF -file that also contains corrected (nudged) latitudes and longitudes. +Combine original LRAUV data from separate *_Group_*.nc files and produce a +single NetCDF file that also contains corrected (nudged) latitudes and +longitudes. Read original data from netCDF files created by nc42netcdfs.py and write out a single netCDF file with the important variables at original sampling intervals. -Geometric alignment and any plumbing lag corrections are also done during this -step. This script is similar to calibrate.py that is used for Dorado and i2map -data, but does not apply any sensor calibrations as those are done on the LRAUV -vehicles before the data is logged and unserialized to NetCDF-4 files. The QC -methods implemented in calibrate.py will be reused here. +Any geometric alignment and any plumbing lag corrections can also be done during +this step. 
This script is similar to calibrate.py that is used for Dorado and +i2map data, but does not apply any sensor calibrations as those are done on the +LRAUV vehicles before the data is logged and unserialized to NetCDF4 files. The +QC methods implemented in calibrate.py may also be reused here. The calibrate.py +code is wrapped around the concept of "sensor" which has an analog in this code +of "group", but is too different to easily reuse. The file will contain combined variables (the combined_nc member variable) and -be analogous to the original NetCDF-4. Rather than using groups in NetCDF-4 the +be analogous to the original NetCDF4. Rather than using groups in NetCDF4 the data will be written in classic NetCDF-CF with a naming convention that is similar to Dorado data, with group names (any underscores removed) preceeding the variable name with an underscore - all lower case characters: @@ -25,8 +28,10 @@ _latitude _longitude ``` -The file will be named with a "_cal.nc" suffix to be consistent with the Dorado -and i2map files, indicating the stage of processing. +The file will be named with a "_combined.nc" suffix. It is analogous to the +"_cal.nc" suffix used for Dorado and i2map files and will provide a clear +indication of the stage of processing. The data are suitable for input to the +align.py script. """ @@ -43,10 +48,8 @@ from socket import gethostname from typing import NamedTuple import cf_xarray # Needed for the .cf accessor # noqa: F401 -import matplotlib.pyplot as plt import numpy as np import xarray as xr -from scipy.interpolate import interp1d import pandas as pd from AUV import monotonic_increasing_time_indices, nudge_positions @@ -61,11 +64,8 @@ class Range(NamedTuple): min: float max: float -# Using lower case vehicle names, modify in _define_sensor_info() for changes -# over time Used to reduce ERROR & WARNING log messages for expected missing -# sensor data. There are core data common to most all vehicles, whose groups -# are listed in BASE_GROUPS. 
EXPECTED_GROUPS contains additional groups for -# specific vehicles. +# There are core data common to most all vehicles, whose groups are listed in +# BASE_GROUPS. EXPECTED_GROUPS contains additional groups for specific vehicles. BASE_GROUPS = { "lrauv": [ "CTDSeabird", @@ -74,75 +74,13 @@ class Range(NamedTuple): } EXPECTED_GROUPS = { - "dorado": [ - "navigation", - "gps", - "depth", - "ecopuck", - "hs2", - "ctd1", - "ctd2", - "isus", - "biolume", - "lopc", - "tailcone", - ], - "i2map": [ - "navigation", - "gps", - "depth", - "seabird25p", - "transmissometer", - "tailcone", + "pontus": [ + "WetLabsUBAT", ], } -# Used in test fixture in conftetst.py -EXPECTED_GROUPS["Dorado389"] = EXPECTED_GROUPS["dorado"] - - -def align_geom(sensor_offset, pitches): - """Use x & y sensor_offset values in meters from sensor_info and - pitch in degrees to compute and return actual depths of the sensor - based on the geometry relative to the vehicle's depth sensor. - """ - # See https://en.wikipedia.org/wiki/Rotation_matrix - # - # * instrument location with pitch applied - # / | - # / | - # / | - # / | - # / | - # / | - # / | - # / | - # / | - # / - # / - # / y - # / _ - # / o - # / f - # / f - # / * instrument location - # / | - # / \ | | - # / \ | y - # / pitch (theta) | | - # / \ | | - # --------------------x------------------+ --> nose - # - # [ cos(pitch) -sin(pitch) ] [x] [x'] - # X = - # [ sin(pitch) cos(pitch) ] [y] [y'] - offsets = [] - for pitch in pitches: - theta = pitch * np.pi / 180.0 - R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - x_off, y_off = np.matmul(R, sensor_offset) - offsets.append(y_off) - - return offsets +# Combine the BASE_GROUPS into each EXPECTED_GROUPS entry +for vehicle, groups in EXPECTED_GROUPS.items(): + EXPECTED_GROUPS[vehicle] = groups + BASE_GROUPS["lrauv"] class Combine_NetCDF: @@ -169,13 +107,13 @@ def global_metadata(self): metadata["featureType"] = "trajectory" try: metadata["time_coverage_start"] = str( - 
self.combined_nc["depth_time"].to_pandas().iloc[0].isoformat(), + pd.to_datetime(self.combined_nc["universals_time"].values, unit="s")[0].isoformat(), ) except KeyError: - error_message = "No depth_time variable in combined_nc" + error_message = "No universals_time variable in combined_nc" raise EOFError(error_message) from None metadata["time_coverage_end"] = str( - self.combined_nc["depth_time"].to_pandas().iloc[-1].isoformat(), + pd.to_datetime(self.combined_nc["universals_time"].values, unit="s")[-1].isoformat(), ) metadata["distribution_statement"] = "Any use requires prior approval from MBARI" metadata["license"] = metadata["distribution_statement"] @@ -285,63 +223,6 @@ def _range_qc_combined_nc( # noqa: C901, PLR0912 self.combined_nc = self.combined_nc.drop_vars(qced_vars) self.logger.info("Done range checking %s", instrument) - def _nudge_pos(self, max_sec_diff_at_end=10): - """Apply linear nudges to underwater latitudes and longitudes so that - they match the surface gps positions. 
- """ - try: - lon = self.combined_nc["navigation_longitude"] - except KeyError: - error_message = "No navigation_longitude data in combined_nc" - raise EOFError(error_message) from None - lat = self.combined_nc["navigation_latitude"] - lon_fix = self.combined_nc["gps_longitude"] - lat_fix = self.combined_nc["gps_latitude"] - - # Use the shared function from AUV module - lon_nudged, lat_nudged, segment_count, segment_minsum = nudge_positions( - nav_longitude=lon, - nav_latitude=lat, - gps_longitude=lon_fix, - gps_latitude=lat_fix, - logger=self.logger, - auv_name=self.args.auv_name, - mission=self.args.mission, - max_sec_diff_at_end=max_sec_diff_at_end, - create_plots=True, - ) - - # Store results in instance variables for compatibility - self.segment_count = segment_count - self.segment_minsum = segment_minsum - - return lon_nudged, lat_nudged - - def _apply_plumbing_lag( - self, - sensor: str, - time_index: pd.DatetimeIndex, - time_name: str, - ) -> tuple[xr.DataArray, str]: - """ - Apply plumbing lag to a time index in the combined netCDF file. - """ - # Convert lag_secs to milliseconds as np.timedelta64 neeeds an integer - lagged_time = time_index - np.timedelta64( - int(self.sinfo[sensor]["lag_secs"] * 1000), - "ms", - ) - # Need to update the sensor's time coordinate in the combined netCDF file - # so that DataArrays created with lagged_time fit onto the coordinate - self.combined_nc.coords[f"{sensor}_{time_name}"] = xr.DataArray( - lagged_time, - coords=[lagged_time], - dims={f"{sensor}_{time_name}"}, - name=f"{sensor}_{time_name}", - ) - lag_info = f"with plumbing lag correction of {self.sinfo[sensor]['lag_secs']} seconds" - return lagged_time, lag_info - def _biolume_process(self, sensor): try: orig_nc = getattr(self, sensor).orig_data @@ -450,130 +331,337 @@ def _biolume_process(self, sensor): set_to_nan=True, ) - def _geometric_depth_correction(self, sensor, orig_nc): - """Performs the align_geom() function from the legacy Matlab. 
- Works for any sensor, but requires navigation being processed first - as its variables in combined_nc are required. Returns corrected depth - array. + def _consolidate_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dict: + """Analyze and consolidate time coordinates for a group. + + Returns: + dict: Contains consolidated time info with keys: + - consolidated_time_name: name of consolidated coordinate (or None) + - consolidated_time_data: the time coordinate data (or None) + - time_coord_mapping: dict mapping original dims to consolidated dims """ - # Fix pitch values to first and last points for interpolation to time - # values outside the range of the pitch values. - # See https://stackoverflow.com/a/45446546 - # and https://github.com/scipy/scipy/issues/12707#issuecomment-672794335 - try: - p_interp = interp1d( - self.combined_nc["navigation_time"].to_numpy().tolist(), - self.combined_nc["navigation_pitch"].to_numpy(), - fill_value=( - self.combined_nc["navigation_pitch"].to_numpy()[0], - self.combined_nc["navigation_pitch"].to_numpy()[-1], - ), - bounds_error=False, + # Find all time variables in this group + time_vars = {var: ds[var] for var in ds.variables if var.lower().endswith("time")} + + if not time_vars: + return { + "consolidated_time_name": None, + "consolidated_time_data": None, + "time_coord_mapping": {}, + } + + if len(time_vars) == 1: + # Single time coordinate - use it as consolidated + time_name = list(time_vars.keys())[0] + consolidated_name = f"{group_name}_time" + self.logger.info( + "Group %s: Single time coordinate '%s' - using as '%s'", + group_name, + time_name, + consolidated_name, ) - except KeyError: - error_message = "No navigation_time or navigation_pitch in combined_nc." 
- raise EOFError(error_message) from None - pitch = p_interp(orig_nc["time"].to_numpy().tolist()) - - d_interp = interp1d( - self.combined_nc["depth_time"].to_numpy().tolist(), - self.combined_nc["depth_filtdepth"].to_numpy(), - fill_value=( - self.combined_nc["depth_filtdepth"].to_numpy()[0], - self.combined_nc["depth_filtdepth"].to_numpy()[-1], - ), - bounds_error=False, - ) - orig_depth = d_interp(orig_nc["time"].to_numpy().tolist()) - offs_depth = align_geom(self.sinfo[sensor]["sensor_offset"], pitch) + return { + "consolidated_time_name": consolidated_name, + "consolidated_time_data": ds[time_name], + "time_coord_mapping": {time_name: consolidated_name}, + } + + # Multiple time coordinates - check if they're identical + time_arrays = list(time_vars.values()) + first_time = time_arrays[0] + first_time_name = list(time_vars.keys())[0] + + all_identical = True + for i, (_name, time_array) in enumerate(time_vars.items()): + if i == 0: + continue # Skip first one (reference) + + # Compare sizes first + if len(time_array) != len(first_time): + all_identical = False + self.logger.debug( + "Group %s: Time coordinate '%s' length %d differs from '%s' length %d", + group_name, + _name, + len(time_array), + first_time_name, + len(first_time), + ) + break - corrected_depth = xr.DataArray( - (orig_depth - offs_depth).astype(np.float64).tolist(), - coords=[orig_nc.get_index("time")], - dims={f"{sensor}_time"}, - name=f"{sensor}_depth", - ) - # 2008.289.03 has self.combined_nc["depth_time"][-1] (2008-10-16T15:42:32) - # at lot less than orig_nc["time"][-1] (2008-10-16T16:24:43) - # which, with "extrapolate" causes wildly incorrect depths to -359 m - # There may be other cases where this happens, in which case we'd like - # a general solution. For now, we'll just correct this mission. 
- d_beg_time_diff = ( - orig_nc["time"].to_numpy()[0] - self.combined_nc["depth_time"].to_numpy()[0] - ) - d_end_time_diff = ( - orig_nc["time"].to_numpy()[-1] - self.combined_nc["depth_time"].to_numpy()[-1] + # Compare values with tolerance + try: + if not np.allclose(time_array.values, first_time.values, atol=1e-6): + all_identical = False + self.logger.debug( + "Group %s: Time coordinate '%s' values differ from '%s'", + group_name, + _name, + first_time_name, + ) + break + except TypeError: + # Handle datetime arrays + if not np.array_equal(time_array.values, first_time.values): + all_identical = False + self.logger.debug( + "Group %s: Time coordinate '%s' values differ from '%s'", + group_name, + _name, + first_time_name, + ) + break + + if all_identical: + # All time coordinates are identical - consolidate them + consolidated_name = f"{group_name}_time" + time_coord_mapping = dict.fromkeys(time_vars, consolidated_name) + + self.logger.info( + "%-65s %s", + f"Consolidating {len(time_vars)} coordinates to", + consolidated_name, + ) + + return { + "consolidated_time_name": consolidated_name, + "consolidated_time_data": ds[first_time_name], + "time_coord_mapping": time_coord_mapping, + } + + # Time coordinates differ - keep them separate + time_coord_mapping = {name: f"{group_name}_{name.lower()}" for name in time_vars} + + self.logger.warning( + "Group %s: Time coordinates differ - keeping separate: %s", + group_name, + list(time_vars.keys()), ) + + return { + "consolidated_time_name": None, + "consolidated_time_data": None, + "time_coord_mapping": time_coord_mapping, + } + + def _add_time_coordinates_to_combined(self, time_info: dict, ds: xr.Dataset) -> None: + """Add time coordinates to the combined dataset.""" + if time_info["consolidated_time_name"]: + self._add_consolidated_time_coordinate(time_info) + else: + self._add_separate_time_coordinates(time_info, ds) + + def _add_consolidated_time_coordinate(self, time_info: dict) -> None: + """Add a consolidated
time coordinate to the combined dataset.""" + time_name = time_info["consolidated_time_name"] self.logger.info( - "%s: d_beg_time_diff: %s, d_end_time_diff: %s", - sensor, - d_beg_time_diff.astype("timedelta64[s]"), - d_end_time_diff.astype("timedelta64[s]"), + "Adding consolidated time coordinate %-45s %s", + f"{time_name} as", + time_name, + ) + self.combined_nc[time_name] = xr.DataArray( + time_info["consolidated_time_data"].to_numpy(), + dims=[time_name], + coords={time_name: time_info["consolidated_time_data"].to_numpy()}, ) - if self.args.mission in ( - "2008.289.03", - "2010.259.01", - "2010.259.02", - ): - # This could be a more general check for all missions, but let's restrict it - # to known problematic missions for now. The above info message can help - # determine if this is needed for other missions. + self.combined_nc[time_name].attrs = time_info["consolidated_time_data"].attrs.copy() + + def _add_separate_time_coordinates(self, time_info: dict, ds: xr.Dataset) -> None: + """Add separate time coordinates to the combined dataset.""" + for orig_time_var, new_time_var in time_info["time_coord_mapping"].items(): self.logger.info( - "%s: Special QC for mission %s: Setting corrected_depth to NaN for times after %s", - sensor, - self.args.mission, - self.combined_nc["depth_time"][-1].to_numpy(), + "Adding time coordinate %-58s %s", + f"{orig_time_var} as", + new_time_var, + ) + self.combined_nc[new_time_var] = xr.DataArray( + ds[orig_time_var].to_numpy(), + dims=[new_time_var], + coords={new_time_var: ds[orig_time_var].to_numpy()}, + ) + self.combined_nc[new_time_var].attrs = ds[orig_time_var].attrs.copy() + + def _get_time_coordinate_data(self, time_info: dict, ds: xr.Dataset, orig_time_dim: str): + """Get the appropriate time coordinate data for a variable.""" + if time_info["consolidated_time_name"]: + return time_info["consolidated_time_data"].to_numpy() + return ds[orig_time_dim].to_numpy() + + def _create_data_array_for_variable( + self, ds: 
xr.Dataset, orig_var: str, dim_name: str, time_coord_data + ) -> xr.DataArray: + """Create a DataArray for a variable, handling unit conversions.""" + if orig_var in ("latitude", "longitude") and ds[orig_var].attrs.get("units") == "radians": + data_array = xr.DataArray( + ds[orig_var].to_numpy() * 180.0 / np.pi, + dims=[dim_name], + coords={dim_name: time_coord_data}, + ) + data_array.attrs = ds[orig_var].attrs.copy() + data_array.attrs["units"] = "degrees" + else: + data_array = xr.DataArray( + ds[orig_var].to_numpy(), + dims=[dim_name], + coords={dim_name: time_coord_data}, ) - corrected_depth[ - np.where( - orig_nc.get_index("time") > self.combined_nc["depth_time"].to_numpy()[-1], + data_array.attrs = ds[orig_var].attrs.copy() + return data_array + + def _add_time_metadata_to_variable(self, var_name: str, dim_name: str) -> None: + """Add required time metadata for cf_xarray decoding.""" + self.combined_nc[var_name].coords[dim_name].attrs["units"] = ( + "seconds since 1970-01-01T00:00:00Z" + ) + self.combined_nc[var_name].coords[dim_name].attrs["standard_name"] = "time" + + def _process_group_variables(self, ds: xr.Dataset, group_name: str, time_info: dict) -> None: + """Process all data variables in a group.""" + for orig_var in ds.variables: + if orig_var.lower().endswith("time"): + continue + + # Skip scalar variables (no dimensions) + if len(ds[orig_var].dims) == 0: + self.logger.debug("Skipping scalar variable: %s", orig_var) + continue + + new_var = group_name + "_" + orig_var.lower() + + # Get the original time dimension for this variable + orig_time_dim = ds[orig_var].dims[0] # Assuming first dim is time + + # Check if this dimension has a mapping + if orig_time_dim not in time_info["time_coord_mapping"]: + self.logger.warning( + "No time mapping found for %s dimension %s", orig_var, orig_time_dim ) - ] = np.nan - if self.args.plot: - plt.figure(figsize=(18, 6)) - plt.plot( - orig_nc["time"].to_numpy(), - orig_depth, - "-", - orig_nc["time"].to_numpy(), - 
corrected_depth, - "--", - orig_nc["time"].to_numpy(), - pitch, - ".", + continue + + dim_name = time_info["time_coord_mapping"][orig_time_dim] + time_coord_data = self._get_time_coordinate_data(time_info, ds, orig_time_dim) + + self.logger.info("Adding variable %-65s %s", f"{orig_var} as", new_var) + + # Create the data array + self.combined_nc[new_var] = self._create_data_array_for_variable( + ds, orig_var, dim_name, time_coord_data + ) + + # Add time metadata + self._add_time_metadata_to_variable(new_var, dim_name) + + def _add_consolidation_comment(self, time_info: dict) -> None: + """Add a comment documenting time coordinate consolidation.""" + if time_info["consolidated_time_name"] in self.combined_nc.variables: + mapping_info = ", ".join( + [f"{orig} -> {new}" for orig, new in time_info["time_coord_mapping"].items()] + ) + self.combined_nc[time_info["consolidated_time_name"]].attrs["comment"] = ( + f"Consolidated time coordinate from: {mapping_info}" ) - plt.ylabel("Depth (m) & Pitch (deg)") - plt.legend(("Original depth", "Pitch corrected depth", "Pitch")) - plt.title( - f"Original and pitch corrected depth for {self.args.auv_name} {self.args.mission}", + + def _add_nudged_coordinates(self, max_sec_diff_at_end: int = 10) -> None: + """Add nudged longitude and latitude variables to the combined dataset.""" + try: + nudged_longitude, nudged_latitude, segment_count, segment_minsum = nudge_positions( + # For LRAUV data the nav positions are shifted by 1 to align with GPS fixes + nav_longitude=self.combined_nc["universals_longitude"].shift(universals_time=1), + nav_latitude=self.combined_nc["universals_latitude"].shift(universals_time=1), + gps_longitude=self.combined_nc["nal9602_longitude_fix"], + gps_latitude=self.combined_nc["nal9602_latitude_fix"], + logger=self.logger, + auv_name="", + mission="", + max_sec_diff_at_end=max_sec_diff_at_end, + create_plots=self.args.plot, ) - plt.show() + except ValueError as e: + self.logger.error("Nudging positions failed: 
%s", e) # noqa: TRY400 + return - return corrected_depth + self.logger.info( + "nudge_positions created %d segments with segment_minsum = %f", + segment_count, + segment_minsum, + ) + self.combined_nc["nudged_longitude"] = nudged_longitude + self.combined_nc["nudged_longitude"].attrs = { + "long_name": "Nudged Longitude", + "standard_name": "longitude", + "units": "degrees_east", + "comment": "Dead reckoned longitude nudged to GPS positions", + } + self.combined_nc["nudged_latitude"] = nudged_latitude + self.combined_nc["nudged_latitude"].attrs = { + "long_name": "Nudged Latitude", + "standard_name": "latitude", + "units": "degrees_north", + "comment": "Dead reckoned latitude nudged to GPS positions", + } def combine_groups(self): + """Combine group files into a single NetCDF dataset with consolidated time coordinates.""" log_file = self.args.log_file src_dir = Path(BASE_LRAUV_PATH, Path(log_file).parent) group_files = sorted(src_dir.glob(f"{Path(log_file).stem}_{GROUP}_*.nc")) + self.summary_fields = set() self.combined_nc = xr.Dataset() + for group_file in group_files: self.logger.info("Group file: %s", group_file.name) - # Make nudged_longitude, nudged_latitude = self._nudge_pos() call on when appropriate - # Loop through each variable in the group file and add it to the combined_nc member list - with xr.open_dataset(group_file) as ds: - for orig_var in ds.variables: - if orig_var.lower().endswith("time"): - self.logger.debug("Skipping time variable: %s", orig_var) - continue - new_group = group_file.stem.split(f"{GROUP}_")[1].replace("_", "").lower() - new_var = new_group + "_" + orig_var.lower() - self.logger.info("Adding variable %-65s %s", f"{orig_var} as", new_var) - self.combined_nc[new_var] = ds[orig_var] + # Open group file without decoding to have np.allclose work properly + with xr.open_dataset(group_file, decode_cf=False) as ds: + # Group name to prepend variable names is lowercase with underscores removed + group_name = 
group_file.stem.split(f"{GROUP}_")[1].replace("_", "").lower() + time_info = self._consolidate_group_time_coords(ds, group_name) + + # Add time coordinate(s) to combined dataset + self._add_time_coordinates_to_combined(time_info, ds) + + # Process all data variables in the group + self._process_group_variables(ds, group_name, time_info) + + # Add consolidation comment if applicable + self._add_consolidation_comment(time_info) + + # Write intermediate file for cf_xarray decoding + intermediate_file = self._intermediate_write_netcdf() + with xr.open_dataset(intermediate_file, decode_cf=True) as ds: + self.combined_nc = ds.load() + + # Add nudged coordinates + self._add_nudged_coordinates() + + # Clean up intermediate file + ##Path(intermediate_file).unlink() + + def _intermediate_write_netcdf(self) -> None: + """Write out an intermediate combined netCDF file so that data can be + read using decode_cf=True for nudge_positions() to work with cf accessors.""" + log_file = self.args.log_file + netcdfs_dir = Path(BASE_LRAUV_PATH, Path(log_file).parent) + out_fn = Path(netcdfs_dir, f"{Path(log_file).stem}_combined_intermediate.nc") + + self.combined_nc.attrs = self.global_metadata() + self.logger.info("Writing intermediate combined group data to %s", out_fn) + if Path(out_fn).exists(): + Path(out_fn).unlink() + self.combined_nc.to_netcdf(out_fn) + self.logger.info( + "Data variables written: %s", + ", ".join(sorted(self.combined_nc.variables)), + ) + self.logger.info( + "Wrote intermediate (_combined_intermediate.nc) netCDF file: %s", + out_fn, + ) + return out_fn def write_netcdf(self) -> None: log_file = self.args.log_file netcdfs_dir = Path(BASE_LRAUV_PATH, Path(log_file).parent) - out_fn = Path(netcdfs_dir, f"{Path(log_file).stem}_cal.nc") + out_fn = Path(netcdfs_dir, f"{Path(log_file).stem}_combined.nc") self.combined_nc.attrs = self.global_metadata() self.logger.info("Writing combined group data to %s", out_fn) @@ -584,6 +672,7 @@ def write_netcdf(self) -> None: 
"Data variables written: %s", ", ".join(sorted(self.combined_nc.variables)), ) + self.logger.info("Wrote combined (_combined.nc) netCDF file: %s", out_fn) return netcdfs_dir @@ -603,27 +692,19 @@ def process_command_line(self): description=__doc__, epilog=examples, ) - - parser.add_argument( - "--noinput", - action="store_true", - help="Execute without asking for a response, e.g. to not ask to re-download file", - ) parser.add_argument( "--log_file", action="store", help=( - "Path to the log file for the mission, e.g.: " + "Path to the log file of original LRAUV data, e.g.: " "brizo/missionlogs/2025/20250903_20250909/" "20250905T072042/202509050720_202509051653.nc4" ), ) parser.add_argument( "--plot", - action="store", - help="Create intermediate plots" - " to validate data operations. Use first to plot " - " points, e.g. first2000. Program blocks upon show.", + action="store_true", + help="Create intermediate plot(s) to help validate processing", ) parser.add_argument( "-v", diff --git a/src/data/nc42netcdfs.py b/src/data/nc42netcdfs.py index 42d5e672..ed778f82 100755 --- a/src/data/nc42netcdfs.py +++ b/src/data/nc42netcdfs.py @@ -189,11 +189,6 @@ def download_with_pooch(self, url, local_dir, known_hash=None): downloader=downloader, ) - def get_groups_netcdf4(self, file_path): - """Get list of groups using netCDF4 library.""" - with netCDF4.Dataset(file_path, "r") as dataset: - return list(dataset.groups.keys()) - def extract_groups_to_files_netcdf4(self, log_file: str) -> Path: """Extract each group from .nc4 file to a separate .nc file using netCDF4 library. 
@@ -242,21 +237,26 @@ def _extract_root_group( if not root_parms: return - try: - self.logger.info("Extracting root group '/'") - vars_to_extract = self._get_available_variables(src_dataset, root_parms) + self.logger.info("Extracting root group '/'") + vars_to_extract, _ = self._get_available_variables(src_dataset, root_parms) - if vars_to_extract: - output_file = output_dir / f"{Path(log_file).stem}_{GROUP}_Universals.nc" - self._create_netcdf_file( - log_file, group_name, src_dataset, vars_to_extract, output_file - ) - self.logger.info("Extracted root group '/' to %s", output_file) - else: - self.logger.warning("No requested variables found in root group '/'") + # Add debugging output for root group processing + self.logger.info("=== ROOT GROUP DEBUG ===") + self.logger.info("Available variables: %s", sorted(vars_to_extract)) + self.logger.info("Available dimensions: %s", sorted(src_dataset.dimensions.keys())) + self.logger.info( + "Available coordinate variables: %s", + [v for v in sorted(src_dataset.variables.keys()) if v in src_dataset.dimensions], + ) - except Exception as e: # noqa: BLE001 - self.logger.warning("Could not extract root group '/': %s", e) + if vars_to_extract: + output_file = output_dir / f"{Path(log_file).stem}_{GROUP}_Universals.nc" + self._create_netcdf_file( + log_file, group_name, src_dataset, vars_to_extract, output_file + ) + self.logger.info("Extracted root group '/' to %s", output_file) + else: + self.logger.warning("No requested variables found in root group '/'") def _extract_single_group( self, @@ -272,7 +272,7 @@ def _extract_single_group( self.logger.debug(" Group %s", group_name) src_group = src_dataset.groups[group_name] - vars_to_extract = self._get_available_variables(src_group, group_parms) + vars_to_extract, requested_vars = self._get_available_variables(src_group, group_parms) if vars_to_extract: output_file = output_dir / f"{Path(log_file).stem}_{GROUP}_{group_name}.nc" @@ -281,12 +281,12 @@ def _extract_single_group( ) 
self.logger.info("Extracted %s to %s", group_name, output_file) else: - self.logger.warning("No requested variables found in group %s", group_name) + self.logger.warning( + "No requested variables (%s) found in group %s", requested_vars, group_name + ) except KeyError: self.logger.warning("Group %s not found", group_name) - # except Exception as e: # noqa: BLE001 - # self.logger.warning("Could not extract %s: %s", group_name, e) def _get_available_variables( self, src_group: netCDF4.Group, group_parms: list[dict[str, Any]] @@ -297,39 +297,7 @@ def _get_available_variables( vars_to_extract = [var for var in requested_vars if var in available_vars] self.logger.debug(" Variables to extract: %s", vars_to_extract) - return vars_to_extract - - def _find_time_coordinate(self, src_group: netCDF4.Group) -> str: - """Find the time coordinate variable in a group using introspection. - - Returns: - str: Name of the time coordinate variable, or empty string if not found - """ - # Strategy 1: Look for variables with "time" in the name (most common) - time_vars = [var_name for var_name in src_group.variables if "time" in var_name.lower()] - if time_vars: - # Prefer variables that start with 'time' (like time_NAL9602) - time_vars.sort(key=lambda x: (not x.lower().startswith("time"), x)) - self.logger.debug("Found time coordinate %s via name pattern", time_vars[0]) - return time_vars[0] - - # Strategy 2: Look for variables with time-like units - for var_name, var in src_group.variables.items(): - if hasattr(var, "units"): - units = getattr(var, "units", "").lower() - time_patterns = ["seconds since", "days since", "hours since"] - if any(pattern in units for pattern in time_patterns): - self.logger.debug("Found time coordinate %s via units", var_name) - return var_name - - # Strategy 3: Look for unlimited dimension (backup) - for dim_name, dim in src_group.dimensions.items(): - if dim.isunlimited() and dim_name in src_group.variables: - self.logger.debug("Found time coordinate %s 
via unlimited dimension", dim_name) - return dim_name - - self.logger.debug("No time coordinate found in group") - return "" + return vars_to_extract, requested_vars def _get_time_filters_for_variables( self, log_file: str, group_name: str, src_group: netCDF4.Group, vars_to_extract: list[str] @@ -347,6 +315,10 @@ def _get_time_filters_for_variables( # Find all time coordinates used by variables in extraction list time_coords_found = self._find_time_coordinates(group_name, src_group, vars_to_extract) + # Add diagnostic check to compare original time coordinate values + if len(time_coords_found) > 1: + self._analyze_original_time_coordinates(src_group, time_coords_found, group_name) + # Parse plot time settings once plot_group_name, plot_time_coord_name = self._parse_plot_time_argument() @@ -365,6 +337,103 @@ def _get_time_filters_for_variables( return time_filters + def _analyze_original_time_coordinates( + self, src_group: netCDF4.Group, time_coords_found: set[str], group_name: str + ): + """Quick diagnostic for Dead Reckoned timing issues in root group.""" + # Only analyze root group Dead Reckoned coordinates + if group_name != "/": + return + + if ( + "latitude_time" not in time_coords_found + or "longitude_time" not in time_coords_found + or "latitude_time" not in src_group.variables + or "longitude_time" not in src_group.variables + ): + return + + lat_time = src_group.variables["latitude_time"][:] + lon_time = src_group.variables["longitude_time"][:] + + # Quick check for Dead Reckoned timing synchronization + min_len = min(len(lat_time), len(lon_time)) + if min_len == 0: + return + + # Compare overlapping portion + overlap_equal = np.array_equal(lat_time[:min_len], lon_time[:min_len]) + + if overlap_equal and len(lat_time) == len(lon_time): + self.logger.info( + "Dead Reckoned timing: latitude_time and longitude_time are properly synchronized" + ) + return + + # Calculate timing differences for diagnosis + time_diff = lon_time[:min_len] - lat_time[:min_len] + 
non_zero_mask = time_diff != 0 + num_differences = np.sum(non_zero_mask) + percent_different = 100.0 * num_differences / min_len + + if len(lat_time) != len(lon_time): + self.logger.warning( + "Dead Reckoned timing: Different array lengths - " + "latitude_time: %d, longitude_time: %d", + len(lat_time), + len(lon_time), + ) + + if num_differences > 0: + diff_values = time_diff[non_zero_mask] + max_abs_diff = np.max(np.abs(diff_values)) + + # Define thresholds for Dead Reckoned timing issues + MAJOR_PERCENT_THRESHOLD = 50.0 # 50% different points + MAJOR_TIME_THRESHOLD = 3600.0 # 1 hour difference + MINOR_PERCENT_THRESHOLD = 5.0 # 5% different points + MINOR_TIME_THRESHOLD = 60.0 # 1 minute difference + + if percent_different > MAJOR_PERCENT_THRESHOLD or max_abs_diff > MAJOR_TIME_THRESHOLD: + self.logger.warning( + "Dead Reckoned timing: Significant synchronization issues detected - " + "%.1f%% of coordinates have timing differences (max: %.1f seconds)", + percent_different, + max_abs_diff, + ) + self.logger.warning( + "Dead Reckoned timing: Differences begin at index %d", + np.where(non_zero_mask)[0][0], + ) + lon_subset = lon_time[ + max(0, np.where(non_zero_mask)[0][0] - 5) : np.where(non_zero_mask)[0][0] + 5 + ] + lat_subset = lat_time[ + max(0, np.where(non_zero_mask)[0][0] - 5) : np.where(non_zero_mask)[0][0] + 5 + ] + self.logger.warning( + "Dead Reckoned timing: longitude_time around this index: %s", + " ".join(f"{val:14.2f}" for val in lon_subset), + ) + self.logger.warning( + "Dead Reckoned timing: latitude_time around this index: %s", + " ".join(f"{val:14.2f}" for val in lat_subset), + ) + elif percent_different > MINOR_PERCENT_THRESHOLD or max_abs_diff > MINOR_TIME_THRESHOLD: + self.logger.warning( + "Dead Reckoned timing: Minor synchronization issues detected - " + "%.1f%% of coordinates have timing differences (max: %.1f seconds)", + percent_different, + max_abs_diff, + ) + else: + self.logger.info( + "Dead Reckoned timing: Small timing differences 
detected - " + "%.1f%% of coordinates differ (max: %.1f seconds)", + percent_different, + max_abs_diff, + ) + def _find_time_coordinates( self, group_name: str, src_group: netCDF4.Group, vars_to_extract: list[str] ) -> set[str]: @@ -374,12 +443,14 @@ def _find_time_coordinates( "=================================== Group: %s =======================================", group_name, ) - for var_name in vars_to_extract: + # Sort variables to make processing deterministic + for var_name in sorted(vars_to_extract): if var_name in src_group.variables: var = src_group.variables[var_name] # Check each dimension to see if it's a time coordinate - for dim_name in var.dimensions: + # Sort dimensions to make processing deterministic + for dim_name in sorted(var.dimensions): if dim_name in src_group.variables: dim_var = src_group.variables[dim_name] @@ -665,7 +736,7 @@ def _plot_time_filtering(self, plot_data: dict): self.logger.info("Time filtering plot displayed for %s", plot_data["variable_name"]) - def _copy_variable_with_appropriate_time_filter( + def _copy_variable_with_appropriate_time_filter( # noqa: C901, PLR0912 self, src_group: netCDF4.Group, dst_dataset: netCDF4.Dataset, @@ -673,52 +744,79 @@ def _copy_variable_with_appropriate_time_filter( time_filters: dict[str, dict], ): """Copy a variable with appropriate time filtering applied.""" - try: - src_var = src_group.variables[var_name] + src_var = src_group.variables[var_name] + + # Skip variables that use time dimensions with 0 points + for dim_name in src_var.dimensions: + if ( + dim_name in time_filters + and time_filters[dim_name]["filtered"] + and len(time_filters[dim_name]["indices"]) == 0 + ): + self.logger.debug( + "Skipping variable %s (uses dimension %s with 0 points)", var_name, dim_name + ) + return - # Create variable in destination + # Create variable in destination + try: dst_var = dst_dataset.createVariable( var_name, src_var.dtype, src_var.dimensions, + zlib=True, + complevel=4, + ) + except ValueError as 
e: + self.logger.warning( + "Could not create variable %s in destination dataset: %s. ", + var_name, + str(e), ) + return - # Check if this variable itself is a time coordinate that needs filtering - if var_name in time_filters and time_filters[var_name]["filtered"]: - # This is a time coordinate variable that needs filtering - time_indices = time_filters[var_name]["indices"] - dst_var[:] = src_var[:][time_indices] - dst_var.setncattr("comment", time_filters[var_name]["comment"]) - self.logger.debug("Applied time filtering to time coordinate %s", var_name) - - # Check if this variable depends on any filtered time dimensions - elif src_var.dimensions: - # Find which (if any) of this variable's dimensions are filtered time coordinates - filtered_dims = {} - for dim_name in src_var.dimensions: - if dim_name in time_filters and time_filters[dim_name]["filtered"]: - filtered_dims[dim_name] = time_filters[dim_name]["indices"] - - if filtered_dims: - # Apply filtering for the appropriate dimensions - self._apply_multidimensional_time_filter( - src_var, dst_var, var_name, filtered_dims - ) - else: - # No time filtering needed - dst_var[:] = src_var[:] + # Check if this variable itself is a time coordinate that needs filtering + if var_name in time_filters and time_filters[var_name]["filtered"]: + # This is a time coordinate variable that needs filtering + time_indices = time_filters[var_name]["indices"] + dst_var[:] = src_var[:][time_indices] + dst_var.setncattr("comment", time_filters[var_name]["comment"]) + self.logger.debug("Applied time filtering to time coordinate %s", var_name) + + # Check if this variable depends on any filtered time dimensions + elif src_var.dimensions: + # Find which (if any) of this variable's dimensions are filtered time coordinates + filtered_dims = {} + for dim_name in src_var.dimensions: + if dim_name in time_filters and time_filters[dim_name]["filtered"]: + filtered_dims[dim_name] = time_filters[dim_name]["indices"] + + if filtered_dims: + # 
Apply filtering for the appropriate dimensions + self._apply_multidimensional_time_filter(src_var, dst_var, var_name, filtered_dims) else: - # Scalar variable or no dimensions + # No time filtering needed dst_var[:] = src_var[:] + else: + # Scalar variable or no dimensions + dst_var[:] = src_var[:] - # Copy attributes - for attr_name in src_var.ncattrs(): - dst_var.setncattr(attr_name, src_var.getncattr(attr_name)) - - self.logger.debug(" Copied variable: %s", var_name) + # Copy attributes + for attr_name in src_var.ncattrs(): + dst_var.setncattr(attr_name, src_var.getncattr(attr_name)) + if var_name in time_filters and time_filters[var_name]["filtered"]: + # Downstream process uses cf_xarray to recognize coordinates, add required attribute + dst_var.setncattr("standard_name", "time") + else: + # Override any coordinates attribute in src with just the time coordinate + dst_var.setncattr("coordinates", var_name + "_time") + # Downstream process uses cf_xarray to recognize coordinates, add required attribute + if src_group.name == "/" and var_name.startswith(("longitude", "latitude")): + dst_var.setncattr("units", "radians") + elif var_name.startswith("depth"): + dst_var.setncattr("units", "meters") - except Exception as e: # noqa: BLE001 - self.logger.warning("Failed to copy variable %s: %s", var_name, e) + self.logger.debug(" Copied variable: %s", var_name) def _apply_multidimensional_time_filter( self, src_var, dst_var, var_name: str, filtered_dims: dict[str, list[int]] @@ -764,65 +862,35 @@ def _create_dimensions_with_time_filters( time_filters: dict[str, dict], ): """Create dimensions in the destination dataset, adjusting time dimensions if filtered.""" - # NetCDF3 allows only one unlimited dimension - primary_time_dim = self._find_primary_time_dimension(src_group, dims_needed, time_filters) - unlimited_dim_created = False - + # Use fixed dimensions for all - simpler and avoids NetCDF3 unlimited dimension issues for dim_name in dims_needed: if dim_name not in 
src_group.dimensions: continue src_dim = src_group.dimensions[dim_name] - should_be_unlimited = dim_name == primary_time_dim and not unlimited_dim_created size = self._calculate_dimension_size( - dim_name, src_dim, time_filters, should_be_unlimited + dim_name, src_dim, time_filters, should_be_unlimited=False ) - # Track if we created the unlimited dimension - if size is None: - unlimited_dim_created = True + # Skip dimensions with 0 points to avoid NetCDF3 conflicts + if size == 0: + self.logger.debug("Skipping dimension %s with 0 points", dim_name) + continue dst_dataset.createDimension(dim_name, size) - def _find_primary_time_dimension( - self, src_group: netCDF4.Group, dims_needed: set[str], time_filters: dict[str, dict] - ) -> str | None: - """Find the primary time dimension that should be unlimited in NetCDF3.""" - for dim_name in dims_needed: - if dim_name in src_group.dimensions: - src_dim = src_group.dimensions[dim_name] - is_time_like = "time" in dim_name.lower() or dim_name in time_filters - if src_dim.isunlimited() and is_time_like: - return dim_name - - # Fallback: return first unlimited dimension found - for dim_name in dims_needed: - if dim_name in src_group.dimensions and src_group.dimensions[dim_name].isunlimited(): - return dim_name - - return None - def _calculate_dimension_size( self, dim_name: str, src_dim, time_filters: dict[str, dict], should_be_unlimited: bool, # noqa: FBT001 - ) -> int | None: - """Calculate the size for a dimension, handling NetCDF3 unlimited dimension constraint.""" + ) -> int: + """Calculate the size for a dimension - always returns fixed size for simplicity.""" is_filtered_time = dim_name in time_filters and time_filters[dim_name]["filtered"] if is_filtered_time: filtered_size = len(time_filters[dim_name]["indices"]) - if should_be_unlimited: - self.logger.debug( - "Created filtered unlimited time dimension %s: %s -> unlimited (%d points)", - dim_name, - len(src_dim), - filtered_size, - ) - return None # Unlimited - 
self.logger.debug( "Created filtered fixed time dimension %s: %s -> %s", dim_name, @@ -831,18 +899,16 @@ def _calculate_dimension_size( ) return filtered_size - # Non-filtered dimension - if should_be_unlimited: - self.logger.debug("Created unlimited dimension %s", dim_name) - return None - + # Non-filtered dimension - always fixed size size = len(src_dim) if src_dim.isunlimited(): self.logger.debug( - "Converting unlimited dimension %s to fixed size %s (NetCDF3 limitation)", + "Converting unlimited dimension %s to fixed size %s", dim_name, size, ) + else: + self.logger.debug("Created fixed dimension %s: %s", dim_name, size) return size def _create_netcdf_file( # noqa: PLR0913 @@ -872,7 +938,7 @@ def _create_netcdf_file( # noqa: PLR0913 if any(tf["filtered"] for tf in time_filters.values()): dst_dataset.setncattr( "processing_note", - "Non-monotonic time values filtered from original, see comment in variables", + "Non-monotonic time values filtered from original, see variable comments", ) # Create dimensions - may need to adjust time dimension sizes @@ -910,16 +976,6 @@ def _get_required_dimensions( dims_needed.update(var.dimensions) return dims_needed - def _create_dimensions( - self, src_group: netCDF4.Group, dst_dataset: netCDF4.Dataset, dims_needed: set[str] - ): - """Create dimensions in the destination dataset.""" - for dim_name in dims_needed: - if dim_name in src_group.dimensions: - src_dim = src_group.dimensions[dim_name] - size = len(src_dim) if not src_dim.isunlimited() else None - dst_dataset.createDimension(dim_name, size) - def _get_coordinate_variables( self, src_group: netCDF4.Group, dims_needed: set[str], vars_to_extract: list[str] ) -> list[str]: @@ -930,28 +986,6 @@ def _get_coordinate_variables( coord_vars.append(dim_name) # noqa: PERF401 return coord_vars - def _copy_variable(self, src_group: netCDF4.Group, dst_dataset: netCDF4.Dataset, var_name: str): - """Helper method to copy a variable from source to destination.""" - try: - src_var = 
src_group.variables[var_name] - - # Create variable in destination - dst_var = dst_dataset.createVariable( - var_name, - src_var.dtype, - src_var.dimensions, - ) - - # Copy data and attributes - dst_var[:] = src_var[:] - for attr_name in src_var.ncattrs(): - dst_var.setncattr(attr_name, src_var.getncattr(attr_name)) - - self.logger.debug(" Copied variable: %s", var_name) - - except Exception as e: # noqa: BLE001 - self.logger.warning("Failed to copy variable %s: %s", var_name, e) - def global_metadata(self, log_file: str, group_name: str): """Use instance variables to return a dictionary of metadata specific for the data that are written diff --git a/src/data/process.py b/src/data/process.py index 4dcedd38..6856f3db 100755 --- a/src/data/process.py +++ b/src/data/process.py @@ -68,6 +68,7 @@ class data are: download_process and calibrate, while for LRAUV class data from align import Align_NetCDF, InvalidCalFile from archive import LOG_NAME, Archiver from calibrate import EXPECTED_SENSORS, Calibrate_NetCDF +from combine import Combine_NetCDF from create_products import CreateProducts from dorado_info import FAILED, TEST, dorado_info from emailer import NOTIFICATION_EMAIL, Emailer @@ -299,7 +300,7 @@ def calibrate(self, mission: str) -> None: cal_netcdf.logger.error("%s %s", mission, e) # noqa: TRY400 cal_netcdf.logger.removeHandler(self.log_handler) - def align(self, mission: str) -> None: + def align(self, mission: str = "", log_file: str = "") -> None: self.logger.info("Alignment steps for %s", mission) align_netcdf = Align_NetCDF() align_netcdf.args = argparse.Namespace() @@ -312,7 +313,10 @@ def align(self, mission: str) -> None: align_netcdf.logger.addHandler(self.log_handler) align_netcdf.commandline = self.commandline try: - netcdf_dir = align_netcdf.process_cal() + if log_file: + netcdf_dir = align_netcdf.process_cal(log_file=log_file) + else: + netcdf_dir = align_netcdf.process_cal() align_netcdf.write_netcdf(netcdf_dir) except (FileNotFoundError, EOFError) 
as e: align_netcdf.logger.error("%s %s", mission, e) # noqa: TRY400 @@ -739,6 +743,8 @@ def extract(self, log_file: str) -> None: extract = Extract() extract.args = argparse.Namespace() extract.args.verbose = self.args.verbose + extract.args.log_file = self.args.log_file + extract.commandline = self.commandline extract.logger.setLevel(self._log_levels[self.args.verbose]) extract.logger.addHandler(self.log_handler) @@ -748,6 +754,23 @@ def extract(self, log_file: str) -> None: input_file = extract.download_with_pooch(url, output_dir) return extract.extract_groups_to_files_netcdf4(input_file) + def combine(self, log_file: str) -> None: + self.logger.info("Combining netCDF files for log file: %s", log_file) + self.logger.info( + "Equivalent to the calibrate step for Dorado class vehicles. " + "Adds nudge positions and more layers of quality control." + ) + combine = Combine_NetCDF() + combine.args = argparse.Namespace() + combine.args.verbose = self.args.verbose + combine.args.log_file = self.args.log_file + combine.commandline = self.commandline + combine.logger.setLevel(self._log_levels[self.args.verbose]) + combine.logger.addHandler(self.log_handler) + + combine.combine_groups() + combine.write_netcdf() + @log_file_processor def process_log_file(self, log_file: str) -> None: netcdfs_dir = Path(BASE_LRAUV_PATH, Path(log_file).parent) @@ -764,7 +787,8 @@ def process_log_file(self, log_file: str) -> None: self.logger.info("commandline = %s", self.commandline) netcdfs_dir = self.extract(log_file) - # self.align(log_file) + self.combine(log_file=log_file) + self.align(log_file=log_file) # self.resample(log_file) # self.create_products(log_file) self.logger.info("Finished processing log file: %s", log_file) diff --git a/src/data/test_process_dorado.py b/src/data/test_process_dorado.py index 90ec047b..d368b183 100644 --- a/src/data/test_process_dorado.py +++ b/src/data/test_process_dorado.py @@ -31,9 +31,9 @@ def test_process_dorado(complete_dorado_processing): # but it 
will alert us if a code change unexpectedly changes the file size. # If code changes are expected to change the file size then we should # update the expected size here. - EXPECTED_SIZE_GITHUB = 621286 + EXPECTED_SIZE_GITHUB = 621404 EXPECTED_SIZE_ACT = 621298 - EXPECTED_SIZE_LOCAL = 621286 + EXPECTED_SIZE_LOCAL = 621452 if str(proc.args.base_path).startswith("/home/runner"): # The size is different in GitHub Actions, maybe due to different metadata assert nc_file.stat().st_size == EXPECTED_SIZE_GITHUB # noqa: S101 @@ -50,9 +50,9 @@ def test_process_dorado(complete_dorado_processing): check_md5 = True if check_md5: # Check that the MD5 hash has not changed - EXPECTED_MD5_GITHUB = "9f3f9e2e5abed08692ddb233dec0d0ac" + EXPECTED_MD5_GITHUB = "3bab0300e575c1d752a35f49e49e340e" EXPECTED_MD5_ACT = "bdb9473e5dedb694618f518b8cf0ca1e" - EXPECTED_MD5_LOCAL = "6ecb2229b00835055619e982fe9d5023" + EXPECTED_MD5_LOCAL = "9137be5a2ed840cfca94a723285355ec" if str(proc.args.base_path).startswith("/home/runner"): # The MD5 hash is different in GitHub Actions, maybe due to different metadata assert hashlib.md5(open(nc_file, "rb").read()).hexdigest() == EXPECTED_MD5_GITHUB # noqa: PTH123, S101, S324, SIM115 diff --git a/src/data/test_process_i2map.py b/src/data/test_process_i2map.py index e2f6cb05..66508695 100644 --- a/src/data/test_process_i2map.py +++ b/src/data/test_process_i2map.py @@ -30,9 +30,9 @@ def test_process_i2map(complete_i2map_processing): # but it will alert us if a code change unexpectedly changes the file size. # If code changes are expected to change the file size then we should # update the expected size here. - EXPECTED_SIZE_GITHUB = 58832 + EXPECTED_SIZE_GITHUB = 58942 EXPECTED_SIZE_ACT = 58816 - EXPECTED_SIZE_LOCAL = 58884 + EXPECTED_SIZE_LOCAL = 59042 if str(proc.args.base_path).startswith("/home/runner"): # The size is different in GitHub Actions, maybe due to different metadata assert nc_file.stat().st_size == EXPECTED_SIZE_GITHUB # noqa: S101