Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@
// Conflicting sizes for nudged_time and data - fixed by filtering GPS fixes to be monotonically increasing
//"args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot"]
// Fails in nudge_positions, maybe bad GPS data?
"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250911T073640/202509110736_202509120809.nc4", "--plot"]
//"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250911T073640/202509110736_202509120809.nc4", "--plot"]
// Nudged coordinate is way out of reasonable range - segment 15
"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250913T080940/202509130809_202509140809.nc4", "--plot"]


},
{
Expand Down Expand Up @@ -367,9 +370,13 @@
//"args": ["-v", "1", "--auv_name", "ahi", "--start", "20250401T000000", "--end", "20250502T000000", "--noinput", "--num_cores", "1", "--no_cleanup"]
// Fails with ValueError: different number of dimensions on data and dims: 2 vs 1 for wetlabsubat_digitized_raw_ad_counts variable
//"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250604_20250616/20250608T020852/202506080209_202506081934.nc4", "--no_cleanup", "--clobber"]
// ValueError: Dimension mismatch: wetlabsubat_digitized_raw_ad_counts_time has 33154 elements but wetlabsubat_hv_step_calibration_coefficient_time has 33155 elements
//"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250626T041517/202506260415_202506261400.nc4", "--no_cleanup"]
// ValueError: coords is not dict-like, but it has 1 items, which does not match the 2 dimensions of the data
"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250626T140000/202506261400_202506262031.nc4", "--no_cleanup"]
// Full month of June 2025 for Pontus with WetLabsUBAT Group data
//"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--noinput", "--num_cores", "1", "--no_cleanup"]
//"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250702T000000", "--noinput", "--num_cores", "1", "--no_cleanup", "--clobber"]
//"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--no_cleanup"]
//"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--noinput", "--num_cores", "1", "--no_cleanup", "--clobber"]
//"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250707T043011/slate.nc4", "--no_cleanup"]
//"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250903_20250903/20250903T202626/202509032026_202509032030.nc4", "--no_cleanup"]
// Does not have ctdseabird data, hence no coordinates for resampling
Expand All @@ -380,7 +387,8 @@
//"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250921T001456/202509210014_202509221430.nc4", "--no_cleanup"]
// A depth coordinate was not found for instrument navigation in
//"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250903_20250903/20250903T175939/202509031759_202509032026.nc4", "--no_cleanup"]
"args": ["-v", "1", "--auv_name", "brizo", "--start", "20250901T000000", "--end", "20251001T000000", "--no_cleanup"]
// For loading stoqs_lrauv_sep2025 - which has really bad nav in _2S_scieng.nc files
//"args": ["-v", "1", "--auv_name", "brizo", "--start", "20250901T000000", "--end", "20251001T000000", "--no_cleanup"]
},

]
Expand Down
9 changes: 6 additions & 3 deletions src/data/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915
# Process group-based variables (skip coordinate variables)
for variable in self.combined_nc:
# Skip time coordinate variables
if variable.endswith("_time"):
if variable.endswith(("_time", "_time_60hz")):
continue

# Skip the navigation coordinate variables themselves
Expand Down Expand Up @@ -551,6 +551,7 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915
# Extract group name from time coordinate
if timevar.endswith("_time_60hz"):
group_name = timevar[:-10] # Remove "_time_60hz" (10 chars)
group_name += "_60hz"
elif timevar.endswith("_time"):
group_name = timevar[:-5] # Remove "_time"
else:
Expand All @@ -563,9 +564,11 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915
# Get the time index for this variable
var_time = self.combined_nc[variable].get_index(timevar).view(np.int64).tolist()

# Calculate sampling rate
# Calculate sampling rate using a small sample of time differences
SAMPLE_RATE_SAMPLE = 10
sample_rate = 1.0 / (
np.mean(np.diff(self.combined_nc[timevar])) / np.timedelta64(1, "s")
np.mean(np.diff(self.combined_nc[timevar][:SAMPLE_RATE_SAMPLE]))
/ np.timedelta64(1, "s")
)

# Determine coordinate variable names based on group
Expand Down
128 changes: 112 additions & 16 deletions src/data/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class Combine_NetCDF:
logger.addHandler(_handler)
_log_levels = (logging.WARN, logging.INFO, logging.DEBUG)
variable_time_coord_mapping: dict = {}
TIME_MATCH_TOLERANCE = 1e-6 # seconds tolerance for matching time values

def __init__(
self,
Expand Down Expand Up @@ -652,11 +653,49 @@ def _add_consolidation_comment(self, time_info: dict) -> None:
f"Consolidated time coordinate from: {mapping_info}"
)

def _align_ubat_time_coordinates(self, ubat_2d, calib_coeff, time_dim, calib_time_dim):
    """Subset UBAT and calibration arrays down to their shared time values.

    For each value of the UBAT time coordinate, look for a calibration time
    within ``TIME_MATCH_TOLERANCE`` of it; both inputs are then index-selected
    to the matched positions so their time dimensions line up one-to-one.

    Returns:
        Tuple of (ubat_2d, calib_coeff) aligned on the common times.

    Raises:
        ValueError: if the two time coordinates share no values at all.
    """
    ubat_time = self.combined_nc[time_dim].to_numpy()
    calib_time = self.combined_nc[calib_time_dim].to_numpy()

    # Indices into each coordinate where a tolerance-matched pair was found
    keep_ubat = []
    keep_calib = []
    for idx, t in enumerate(ubat_time):
        # First calibration sample within tolerance of this UBAT time, if any
        hits = np.where(np.abs(calib_time - t) < self.TIME_MATCH_TOLERANCE)[0]
        if len(hits) > 0:
            keep_ubat.append(idx)
            keep_calib.append(hits[0])

    if len(keep_ubat) == 0:
        error_message = f"No common time values found between {time_dim} and {calib_time_dim}"
        raise ValueError(error_message)

    self.logger.info(
        "Found %d common time values out of %d in %s and %d in %s",
        len(keep_ubat),
        len(ubat_time),
        time_dim,
        len(calib_time),
        calib_time_dim,
    )

    # Index-select both arrays so their time dimensions correspond pairwise
    return (
        ubat_2d.isel({time_dim: keep_ubat}),
        calib_coeff.isel({calib_time_dim: keep_calib}),
    )

def _expand_ubat_to_60hz(self) -> None:
"""Expand UBAT digitized_raw_ad_counts 2D array into 60hz time series.

Replaces the 2D array with a 1D 60Hz time series, analogous to how
Dorado biolume_raw is stored with a time60hz coordinate.

If expansion cannot be performed (e.g., not 2D data), the 2D variable
is removed to avoid confusing downstream consumers.
"""
ubat_var = "wetlabsubat_digitized_raw_ad_counts"

Expand All @@ -678,16 +717,74 @@ def _expand_ubat_to_60hz(self) -> None:
time_dim = ubat_2d.dims[0]
n_samples = ubat_2d.shape[1]

# Get the time coordinate
time_coord = self.combined_nc[time_dim]
self.logger.info(
"UBAT data shape: %s, time dimension: %s with %d points",
ubat_2d.shape,
time_dim,
len(self.combined_nc[time_dim]),
)

if "wetlabsubat_hv_step_calibration_coefficient" not in self.combined_nc:
self.logger.warning(
"No UBAT calibration coefficient found, removing 2D variable to avoid confusion"
)
del self.combined_nc[ubat_var]
return

# Get calibration coefficient and verify dimensions match
calib_coeff = self.combined_nc["wetlabsubat_hv_step_calibration_coefficient"]
calib_time_dim = calib_coeff.dims[0]

# Handle dimension mismatch by finding common time values
ubat_time = self.combined_nc[time_dim].to_numpy()
calib_time = self.combined_nc[calib_time_dim].to_numpy()

if len(ubat_time) != len(calib_time):
self.logger.warning(
"Dimension mismatch: %s has %d elements but %s has %d elements - "
"finding common time values",
time_dim,
len(ubat_time),
calib_time_dim,
len(calib_time),
)
ubat_2d, calib_coeff = self._align_ubat_time_coordinates(
ubat_2d, calib_coeff, time_dim, calib_time_dim
)
ubat_time = ubat_2d.coords[time_dim].to_numpy()
calib_time = calib_coeff.coords[calib_time_dim].to_numpy()

# Verify the time coordinate values are now identical
if not np.allclose(ubat_time, calib_time, rtol=1e-9):
error_message = (
f"Time coordinates {time_dim} and {calib_time_dim} have different values "
"even after alignment"
)
raise ValueError(error_message)

self.logger.info(
"Verified dimensions match: %s and %s both have %d elements",
time_dim,
calib_time_dim,
len(ubat_time),
)

# Multiply raw 60 hz values by the calibration coefficient
# Broadcasting: calib_coeff is (m,) and ubat_2d is (m, 60)
# This multiplies each row of ubat_2d by the corresponding coefficient
ubat_2d_calibrated = ubat_2d * calib_coeff.to_numpy()[:, np.newaxis]

# Get the time coordinate (use ubat_2d's time coordinate after alignment)
time_coord = ubat_2d.coords[time_dim]
n_times = len(time_coord)

# Save original attributes before removing
original_attrs = ubat_2d.attrs.copy()

# Calculate 60hz time offsets (assuming samples span 1 second)
# Each sample is 1/60th of a second apart
sample_offsets = np.arange(n_samples) / 60.0
# Subtract 0.5 seconds because 60Hz data are logged at the end of the 1-second period
sample_offsets = np.arange(n_samples) / 60.0 - 0.5

# Create 60hz time series by adding offsets to each 1Hz time
time_60hz_list = []
Expand All @@ -699,32 +796,30 @@ def _expand_ubat_to_60hz(self) -> None:

# Flatten the arrays
time_60hz = np.concatenate(time_60hz_list)
data_60hz = ubat_2d.to_numpy().flatten()
data_60hz = ubat_2d_calibrated.to_numpy().flatten()

# Remove the old 2D variable
del self.combined_nc[ubat_var]

# Create new 60hz time coordinate with attributes
# Create new 60hz time coordinate name
time_60hz_name = f"{time_dim}_60hz"
time_60hz_coord = xr.DataArray(
time_60hz,
dims=[time_60hz_name],
name=time_60hz_name,
attrs={
"units": "seconds since 1970-01-01T00:00:00Z",
"standard_name": "time",
"long_name": "Time at 60Hz sampling rate",
},
)

# Create replacement 1D variable with 60hz time coordinate
# Pass the numpy array as the coordinate value, not a DataArray
self.combined_nc[ubat_var] = xr.DataArray(
data_60hz,
coords={time_60hz_name: time_60hz_coord},
coords={time_60hz_name: time_60hz},
dims=[time_60hz_name],
name=ubat_var,
)

# Add attributes to the time coordinate
self.combined_nc[ubat_var].coords[time_60hz_name].attrs = {
"units": "seconds since 1970-01-01T00:00:00Z",
"standard_name": "time",
"long_name": "Time at 60Hz sampling rate",
}

# Restore and update attributes
self.combined_nc[ubat_var].attrs = original_attrs
self.combined_nc[ubat_var].attrs["long_name"] = "UBAT digitized raw AD counts at 60Hz"
Expand All @@ -744,6 +839,7 @@ def _initial_coordinate_qc(self) -> None:
"""Perform initial QC on core coordinate variables for specific log files."""
if self.log_file in (
"tethys/missionlogs/2012/20120908_20120920/20120909T010636/201209090106_201209091521.nc4",
"brizo/missionlogs/2025/20250909_20250915/20250913T080940/202509130809_202509140809.nc4",
):
self.logger.info("Performing initial coordinate QC for %s", self.log_file)
self._range_qc_combined_nc(
Expand Down
2 changes: 2 additions & 0 deletions src/data/nc42netcdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@
{"name": "average_bioluminescence"},
{"name": "flow_rate"},
{"name": "digitized_raw_ad_counts"},
{"name": "hv_step_calibration_coefficient"},
{"name": "flowrateCalibCoeff"},
],
}

Expand Down
11 changes: 8 additions & 3 deletions src/data/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,10 @@ def add_wetlabsubat_proxies( # noqa: PLR0913, PLR0915, C901, PLR0912
# s_ubat_raw includes daytime data - see below for nighttime data
s_ubat_raw = self.ds["wetlabsubat_digitized_raw_ad_counts"].to_pandas().dropna()

if s_ubat_raw.empty:
self.logger.info("No wetlabsubat_digitized_raw_ad_counts data to compute proxies")
return pd.Series(dtype="float64"), [], []

# Compute background biolumenesence envelope
self.logger.debug("Applying rolling min filter")
min_bg_unsmoothed = s_ubat_raw.rolling(
Expand Down Expand Up @@ -1220,7 +1224,8 @@ def add_wetlabsubat_proxies( # noqa: PLR0913, PLR0915, C901, PLR0912
nighttime_bg_biolume = (
pd.Series(s_min_bg, index=nighttime_ubat_raw.index).resample("1s").mean()
)
nighttime_bg_biolume_perliter = nighttime_bg_biolume.divide(flow) * 1000
# wetlabsubat_flow_rate is in l/s, so no * 1000 needed here
nighttime_bg_biolume_perliter = nighttime_bg_biolume.divide(flow)
pseudo_fluorescence = nighttime_bg_biolume_perliter / proxy_ratio_adinos
self.df_r["wetlabsubat_proxy_adinos"] = (
np.minimum(fluo, pseudo_fluorescence) / proxy_cal_factor
Expand Down Expand Up @@ -1299,11 +1304,11 @@ def select_nighttime_ubat_raw(
sunset,
sunrise,
)
time_coord_name = self.ds["wetlabsubat_digitized_raw_ad_counts"].dims[0]
nighttime_data = (
self.ds["wetlabsubat_digitized_raw_ad_counts"]
.where(
(self.ds["wetlabsubat_time_60hz"] > sunset)
& (self.ds["wetlabsubat_time_60hz"] < sunrise),
(self.ds[time_coord_name] > sunset) & (self.ds[time_coord_name] < sunrise),
)
.to_pandas()
.dropna()
Expand Down
2 changes: 1 addition & 1 deletion src/data/test_process_dorado.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_process_dorado(complete_dorado_processing):
check_md5 = True
if check_md5:
# Check that the MD5 hash has not changed
EXPECTED_MD5_GITHUB = "432690b72faf604e9845cfe6c3eb5b3e"
EXPECTED_MD5_GITHUB = "fec067d16eb5280a8bc2b6ef132821b8"
EXPECTED_MD5_ACT = "316955fd489862ad9ed5b63df8aa7db8"
EXPECTED_MD5_LOCAL = "f635cee8760aa0f40bd2070bc0c5fa65"
if str(proc.args.base_path).startswith("/home/runner"):
Expand Down
8 changes: 8 additions & 0 deletions src/data/test_process_lrauv.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,10 @@ def test_ubat_60hz_expansion(tmp_path):
["wetlabsubat_time", "sample"],
np.random.randint(0, 1000, (len(time_vals), 60)),
),
"wetlabsubat_hv_step_calibration_coefficient": (
["wetlabsubat_time"],
np.ones(len(time_vals)) * 1.5, # Example calibration coefficient
),
},
coords={"wetlabsubat_time": time_seconds},
)
Expand All @@ -236,6 +240,10 @@ def test_ubat_60hz_expansion(tmp_path):
"long_name": "Digitized raw AD counts",
"comment": "Test UBAT data",
}
combine.combined_nc["wetlabsubat_hv_step_calibration_coefficient"].attrs = {
"long_name": "HV step calibration coefficient",
"comment": "Test calibration coefficient",
}

# Call the expansion method
combine._expand_ubat_to_60hz()
Expand Down
4 changes: 2 additions & 2 deletions src/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,11 +332,11 @@ def nudge_positions( # noqa: C901, PLR0912, PLR0913, PLR0915
)
logger.warning(
" max(abs(lon)) = %s",
np.max(np.abs(lon[segi] + lon_nudge)),
float(np.max(np.abs(lon[segi] + lon_nudge))),
)
logger.warning(
" max(abs(lat)) = %s",
np.max(np.abs(lat[segi] + lat_nudge)),
float(np.max(np.abs(lat[segi] + lat_nudge))),
)

lon_nudged_array = np.append(lon_nudged_array, lon[segi] + lon_nudge)
Expand Down