diff --git a/.vscode/launch.json b/.vscode/launch.json index 019a6db..6f95cea 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -134,7 +134,10 @@ // Conflicting sizes for nudged_time and data - fixed by filtering GPS fixes to be monotonically increasing //"args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot"] // Fails in nudge_positions, maybe bad GPS data? - "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250911T073640/202509110736_202509120809.nc4", "--plot"] + //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250911T073640/202509110736_202509120809.nc4", "--plot"] + // Nudged coordinate is way out of reasonable range - segment 15 + "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250913T080940/202509130809_202509140809.nc4", "--plot"] + }, { @@ -367,9 +370,13 @@ //"args": ["-v", "1", "--auv_name", "ahi", "--start", "20250401T000000", "--end", "20250502T000000", "--noinput", "--num_cores", "1", "--no_cleanup"] // Fails with ValueError: different number of dimensions on data and dims: 2 vs 1 for wetlabsubat_digitized_raw_ad_counts variable //"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250604_20250616/20250608T020852/202506080209_202506081934.nc4", "--no_cleanup", "--clobber"] + // ValueError: Dimension mismatch: wetlabsubat_digitized_raw_ad_counts_time has 33154 elements but wetlabsubat_hv_step_calibration_coefficient_time has 33155 elements + //"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250626T041517/202506260415_202506261400.nc4", "--no_cleanup"] + // ValueError: coords is not dict-like, but it has 1 items, which does not match the 2 dimensions of the data + "args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250626T140000/202506261400_202506262031.nc4", "--no_cleanup"] // Full month of June 2025 for Pontus with WetLabsUBAT Group data - //"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--noinput", "--num_cores", "1", "--no_cleanup"] - //"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250702T000000", "--noinput", "--num_cores", "1", "--no_cleanup", "--clobber"] + //"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--no_cleanup"] + //"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--noinput", "--num_cores", "1", "--no_cleanup", "--clobber"] //"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250707T043011/slate.nc4", "--no_cleanup"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250903_20250903/20250903T202626/202509032026_202509032030.nc4", "--no_cleanup"] // Does not have ctdseabird data, hence no coordinates for resampling @@ -380,7 +387,8 @@ //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250921T001456/202509210014_202509221430.nc4", "--no_cleanup"] // A depth coordinate was not found for instrument navigation in //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250903_20250903/20250903T175939/202509031759_202509032026.nc4", "--no_cleanup"] - "args": ["-v", "1", "--auv_name", "brizo", "--start", "20250901T000000", "--end", "20251001T000000", "--no_cleanup"] + // For loading stoqs_lrauv_sep2025 - which has really bad nav in _2S_scieng.nc files + //"args": ["-v", "1", "--auv_name", "brizo", "--start", "20250901T000000", "--end", "20251001T000000", "--no_cleanup"] }, ] diff --git a/src/data/align.py b/src/data/align.py index 046954a..5eeeb2b 100755 --- a/src/data/align.py +++ b/src/data/align.py @@ -522,7 +522,7 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 # Process group-based variables (skip coordinate variables) for variable in self.combined_nc: # Skip time coordinate variables - if variable.endswith("_time"): + if variable.endswith(("_time", "_time_60hz")): continue # Skip the navigation coordinate variables themselves @@ -551,6 +551,7 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 # Extract group name from time coordinate if timevar.endswith("_time_60hz"): group_name = timevar[:-10] # Remove "_time_60hz" (10 chars) + group_name += "_60hz" elif timevar.endswith("_time"): group_name = timevar[:-5] # Remove "_time" else: @@ -563,9 +564,11 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 # Get the time index for this variable var_time = self.combined_nc[variable].get_index(timevar).view(np.int64).tolist() - # Calculate sampling rate + # Calculate sampling rate using a small sample of time differences + SAMPLE_RATE_SAMPLE = 10 sample_rate = 1.0 / ( - np.mean(np.diff(self.combined_nc[timevar])) / np.timedelta64(1, "s") + np.mean(np.diff(self.combined_nc[timevar][:SAMPLE_RATE_SAMPLE])) + / np.timedelta64(1, "s") ) # Determine coordinate variable names based on group diff --git a/src/data/combine.py b/src/data/combine.py index 2bc4aa9..663c712 100755 --- a/src/data/combine.py +++ b/src/data/combine.py @@ -92,6 +92,7 @@ class Combine_NetCDF: logger.addHandler(_handler) _log_levels = (logging.WARN, logging.INFO, logging.DEBUG) variable_time_coord_mapping: dict = {} + TIME_MATCH_TOLERANCE = 1e-6 # seconds tolerance for matching time values def __init__( self, @@ -652,11 +653,49 @@ def _add_consolidation_comment(self, time_info: dict) -> None: f"Consolidated time coordinate from: {mapping_info}" ) + def _align_ubat_time_coordinates(self, ubat_2d, calib_coeff, time_dim, calib_time_dim): + """Align UBAT and calibration coefficient time coordinates by finding common times.""" + ubat_time = self.combined_nc[time_dim].to_numpy() + calib_time = self.combined_nc[calib_time_dim].to_numpy() + + # Find intersection of time values + common_indices_ubat = [] + common_indices_calib = [] + + for i, t_ubat in enumerate(ubat_time): + # Find matching time in calib_time + matches = np.where(np.abs(calib_time - t_ubat) < self.TIME_MATCH_TOLERANCE)[0] + if len(matches) > 0: + common_indices_ubat.append(i) + common_indices_calib.append(matches[0]) + + if len(common_indices_ubat) == 0: + error_message = f"No common time values found between {time_dim} and {calib_time_dim}" + raise ValueError(error_message) + + self.logger.info( + "Found %d common time values out of %d in %s and %d in %s", + len(common_indices_ubat), + len(ubat_time), + time_dim, + len(calib_time), + calib_time_dim, + ) + + # Subset both arrays to common times + ubat_2d_aligned = ubat_2d.isel({time_dim: common_indices_ubat}) + calib_coeff_aligned = calib_coeff.isel({calib_time_dim: common_indices_calib}) + + return ubat_2d_aligned, calib_coeff_aligned + def _expand_ubat_to_60hz(self) -> None: """Expand UBAT digitized_raw_ad_counts 2D array into 60hz time series. Replaces the 2D array with a 1D 60Hz time series, analogous to how Dorado biolume_raw is stored with a time60hz coordinate. + + If expansion cannot be performed (e.g., not 2D data), the 2D variable + is removed to avoid confusing downstream consumers. """ ubat_var = "wetlabsubat_digitized_raw_ad_counts" @@ -678,8 +717,65 @@ def _expand_ubat_to_60hz(self) -> None: time_dim = ubat_2d.dims[0] n_samples = ubat_2d.shape[1] - # Get the time coordinate - time_coord = self.combined_nc[time_dim] + self.logger.info( + "UBAT data shape: %s, time dimension: %s with %d points", + ubat_2d.shape, + time_dim, + len(self.combined_nc[time_dim]), + ) + + if "wetlabsubat_hv_step_calibration_coefficient" not in self.combined_nc: + self.logger.warning( + "No UBAT calibration coefficient found, removing 2D variable to avoid confusion" + ) + del self.combined_nc[ubat_var] + return + + # Get calibration coefficient and verify dimensions match + calib_coeff = self.combined_nc["wetlabsubat_hv_step_calibration_coefficient"] + calib_time_dim = calib_coeff.dims[0] + + # Handle dimension mismatch by finding common time values + ubat_time = self.combined_nc[time_dim].to_numpy() + calib_time = self.combined_nc[calib_time_dim].to_numpy() + + if len(ubat_time) != len(calib_time): + self.logger.warning( + "Dimension mismatch: %s has %d elements but %s has %d elements - " + "finding common time values", + time_dim, + len(ubat_time), + calib_time_dim, + len(calib_time), + ) + ubat_2d, calib_coeff = self._align_ubat_time_coordinates( + ubat_2d, calib_coeff, time_dim, calib_time_dim + ) + ubat_time = ubat_2d.coords[time_dim].to_numpy() + calib_time = calib_coeff.coords[calib_time_dim].to_numpy() + + # Verify the time coordinate values are now identical + if not np.allclose(ubat_time, calib_time, rtol=1e-9): + error_message = ( + f"Time coordinates {time_dim} and {calib_time_dim} have different values " + "even after alignment" + ) + raise ValueError(error_message) + + self.logger.info( + "Verified dimensions match: %s and %s both have %d elements", + time_dim, + calib_time_dim, + len(ubat_time), + ) + + # Multiply raw 60 hz values by the calibration coefficient + # Broadcasting: calib_coeff is (m,) and ubat_2d is (m, 60) + # This multiplies each row of ubat_2d by the corresponding coefficient + ubat_2d_calibrated = ubat_2d * calib_coeff.to_numpy()[:, np.newaxis] + + # Get the time coordinate (use ubat_2d's time coordinate after alignment) + time_coord = ubat_2d.coords[time_dim] n_times = len(time_coord) # Save original attributes before removing @@ -687,7 +783,8 @@ def _expand_ubat_to_60hz(self) -> None: # Calculate 60hz time offsets (assuming samples span 1 second) # Each sample is 1/60th of a second apart - sample_offsets = np.arange(n_samples) / 60.0 + # Subtract 0.5 seconds because 60Hz data are logged at the end of the 1-second period + sample_offsets = np.arange(n_samples) / 60.0 - 0.5 # Create 60hz time series by adding offsets to each 1Hz time time_60hz_list = [] @@ -699,32 +796,30 @@ def _expand_ubat_to_60hz(self) -> None: # Flatten the arrays time_60hz = np.concatenate(time_60hz_list) - data_60hz = ubat_2d.to_numpy().flatten() + data_60hz = ubat_2d_calibrated.to_numpy().flatten() # Remove the old 2D variable del self.combined_nc[ubat_var] - # Create new 60hz time coordinate with attributes + # Create new 60hz time coordinate name time_60hz_name = f"{time_dim}_60hz" - time_60hz_coord = xr.DataArray( - time_60hz, - dims=[time_60hz_name], - name=time_60hz_name, - attrs={ - "units": "seconds since 1970-01-01T00:00:00Z", - "standard_name": "time", - "long_name": "Time at 60Hz sampling rate", - }, - ) # Create replacement 1D variable with 60hz time coordinate + # Pass the numpy array as the coordinate value, not a DataArray self.combined_nc[ubat_var] = xr.DataArray( data_60hz, - coords={time_60hz_name: time_60hz_coord}, + coords={time_60hz_name: time_60hz}, dims=[time_60hz_name], name=ubat_var, ) + # Add attributes to the time coordinate + self.combined_nc[ubat_var].coords[time_60hz_name].attrs = { + "units": "seconds since 1970-01-01T00:00:00Z", + "standard_name": "time", + "long_name": "Time at 60Hz sampling rate", + } + # Restore and update attributes self.combined_nc[ubat_var].attrs = original_attrs self.combined_nc[ubat_var].attrs["long_name"] = "UBAT digitized raw AD counts at 60Hz" @@ -744,6 +839,7 @@ def _initial_coordinate_qc(self) -> None: """Perform initial QC on core coordinate variables for specific log files.""" if self.log_file in ( "tethys/missionlogs/2012/20120908_20120920/20120909T010636/201209090106_201209091521.nc4", + "brizo/missionlogs/2025/20250909_20250915/20250913T080940/202509130809_202509140809.nc4", ): self.logger.info("Performing initial coordinate QC for %s", self.log_file) self._range_qc_combined_nc( diff --git a/src/data/nc42netcdfs.py b/src/data/nc42netcdfs.py index 844df80..925dc1c 100755 --- a/src/data/nc42netcdfs.py +++ b/src/data/nc42netcdfs.py @@ -77,6 +77,8 @@ {"name": "average_bioluminescence"}, {"name": "flow_rate"}, {"name": "digitized_raw_ad_counts"}, + {"name": "hv_step_calibration_coefficient"}, + {"name": "flowrateCalibCoeff"}, ], } diff --git a/src/data/resample.py b/src/data/resample.py index b4f7569..d47bf1b 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -1006,6 +1006,10 @@ def add_wetlabsubat_proxies( # noqa: PLR0913, PLR0915, C901, PLR0912 # s_ubat_raw includes daytime data - see below for nighttime data s_ubat_raw = self.ds["wetlabsubat_digitized_raw_ad_counts"].to_pandas().dropna() + if s_ubat_raw.empty: + self.logger.info("No wetlabsubat_digitized_raw_ad_counts data to compute proxies") + return pd.Series(dtype="float64"), [], [] + # Compute background biolumenesence envelope self.logger.debug("Applying rolling min filter") min_bg_unsmoothed = s_ubat_raw.rolling( @@ -1220,7 +1224,8 @@ def add_wetlabsubat_proxies( # noqa: PLR0913, PLR0915, C901, PLR0912 nighttime_bg_biolume = ( pd.Series(s_min_bg, index=nighttime_ubat_raw.index).resample("1s").mean() ) - nighttime_bg_biolume_perliter = nighttime_bg_biolume.divide(flow) * 1000 + # wetlabsubat_flow_rate is in l/s, so no * 1000 needed here + nighttime_bg_biolume_perliter = nighttime_bg_biolume.divide(flow) pseudo_fluorescence = nighttime_bg_biolume_perliter / proxy_ratio_adinos self.df_r["wetlabsubat_proxy_adinos"] = ( np.minimum(fluo, pseudo_fluorescence) / proxy_cal_factor @@ -1299,11 +1304,11 @@ def select_nighttime_ubat_raw( sunset, sunrise, ) + time_coord_name = self.ds["wetlabsubat_digitized_raw_ad_counts"].dims[0] nighttime_data = ( self.ds["wetlabsubat_digitized_raw_ad_counts"] .where( - (self.ds["wetlabsubat_time_60hz"] > sunset) - & (self.ds["wetlabsubat_time_60hz"] < sunrise), + (self.ds[time_coord_name] > sunset) & (self.ds[time_coord_name] < sunrise), ) .to_pandas() .dropna() diff --git a/src/data/test_process_dorado.py b/src/data/test_process_dorado.py index 35520db..88217bb 100644 --- a/src/data/test_process_dorado.py +++ b/src/data/test_process_dorado.py @@ -50,7 +50,7 @@ def test_process_dorado(complete_dorado_processing): check_md5 = True if check_md5: # Check that the MD5 hash has not changed - EXPECTED_MD5_GITHUB = "432690b72faf604e9845cfe6c3eb5b3e" + EXPECTED_MD5_GITHUB = "fec067d16eb5280a8bc2b6ef132821b8" EXPECTED_MD5_ACT = "316955fd489862ad9ed5b63df8aa7db8" EXPECTED_MD5_LOCAL = "f635cee8760aa0f40bd2070bc0c5fa65" if str(proc.args.base_path).startswith("/home/runner"): diff --git a/src/data/test_process_lrauv.py b/src/data/test_process_lrauv.py index 06e298b..49b44d2 100644 --- a/src/data/test_process_lrauv.py +++ b/src/data/test_process_lrauv.py @@ -227,6 +227,10 @@ def test_ubat_60hz_expansion(tmp_path): ["wetlabsubat_time", "sample"], np.random.randint(0, 1000, (len(time_vals), 60)), ), + "wetlabsubat_hv_step_calibration_coefficient": ( + ["wetlabsubat_time"], + np.ones(len(time_vals)) * 1.5, # Example calibration coefficient + ), }, coords={"wetlabsubat_time": time_seconds}, ) @@ -236,6 +240,10 @@ def test_ubat_60hz_expansion(tmp_path): "long_name": "Digitized raw AD counts", "comment": "Test UBAT data", } + combine.combined_nc["wetlabsubat_hv_step_calibration_coefficient"].attrs = { + "long_name": "HV step calibration coefficient", + "comment": "Test calibration coefficient", + } # Call the expansion method combine._expand_ubat_to_60hz() diff --git a/src/data/utils.py b/src/data/utils.py index 5c74ae2..45ebbc5 100644 --- a/src/data/utils.py +++ b/src/data/utils.py @@ -332,11 +332,11 @@ def nudge_positions( # noqa: C901, PLR0912, PLR0913, PLR0915 ) logger.warning( " max(abs(lon)) = %s", - np.max(np.abs(lon[segi] + lon_nudge)), + float(np.max(np.abs(lon[segi] + lon_nudge))), ) logger.warning( " max(abs(lat)) = %s", - np.max(np.abs(lat[segi] + lat_nudge)), + float(np.max(np.abs(lat[segi] + lat_nudge))), ) lon_nudged_array = np.append(lon_nudged_array, lon[segi] + lon_nudge)