From a791bbb53af1d60d9f1231f0093c962b5578732e Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Fri, 5 Dec 2025 11:14:35 -0800 Subject: [PATCH 1/8] Remove the despiking step. --- .vscode/launch.json | 8 +++--- src/data/nc42netcdfs.py | 54 +++++++++++++---------------------------- 2 files changed, 21 insertions(+), 41 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index b1c27c1e..0400b66d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -58,14 +58,14 @@ //"args": ["-v", "1", "--log_file", "ahi/missionlogs/2025/20250908_20250912/20250911T201546/202509112015_202509112115.nc4", "--known_hash", "d1235ead55023bea05e9841465d54a45dfab007a283320322e28b84438fb8a85"] // brizo 20250914T080941 has bad latitude and longitude values and lots of bad Universal latitude_time and longitude_time values //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] - //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/longitude_time"] + "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/longitude_time"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/latitude_time"] // brizo 20250916T230652 has several ESP Samples from stoqs_lrauv_sep2025 //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--plot_time", "/longitude_time"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109_cleaned_by_quinn.nc4", "--plot_time", "/longitude_time"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109_cleaned_by_quinn_latlon.nc4", "--plot_time", "/longitude_time"] // Conflicting sizes for nudged_time and data - fixed by filtering GPS fixes to be monotonically increasing - "args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot_time", "/longitude_time"] + //"args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot_time", "/longitude_time"] }, { "name": "2.0 - calibrate.py", @@ -345,7 +345,7 @@ // Lots bad time values in brizo 20250914T080941 due to memory corruption on the vehicle //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--clobber"] - //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--clobber", "--no_cleanup"] + "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--clobber", "--no_cleanup"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--no_cleanup"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--no_cleanup", "--clobber"] // Has different universals time coodinates for longitude/latitude and depth @@ -360,7 +360,7 @@ // Fails with ValueError: different number of dimensions on data and dims: 2 vs 1 for wetlabsubat_digitized_raw_ad_counts variable //"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250604_20250616/20250608T020852/202506080209_202506081934.nc4", "--no_cleanup", "--clobber"] // Full month of June 2025 for Pontus with WetLabsUBAT Group data - "args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--noinput", "--num_cores", "1", "--no_cleanup"] + //"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250721T000000", "--noinput", "--num_cores", "1", "--no_cleanup"] //"args": ["-v", "1", "--auv_name", "pontus", "--start", "20250601T000000", "--end", "20250702T000000", "--noinput", "--num_cores", "1", "--no_cleanup", "--clobber"] //"args": ["-v", "1", "--log_file", "pontus/missionlogs/2025/20250623_20250707/20250707T043011/slate.nc4", "--no_cleanup"] }, diff --git a/src/data/nc42netcdfs.py b/src/data/nc42netcdfs.py index ddb77b8c..505b2830 100755 --- a/src/data/nc42netcdfs.py +++ b/src/data/nc42netcdfs.py @@ -345,7 +345,8 @@ def _analyze_original_time_coordinates( if overlap_equal and len(lat_time) == len(lon_time): self.logger.info( - "Dead Reckoned timing: latitude_time and longitude_time are properly synchronized" + "Dead Reckoned timing: original latitude_time and longitude_time " + "are properly synchronized" ) return @@ -502,8 +503,6 @@ def _process_single_time_coordinate( # noqa: PLR0913 plot_time_coord_name: str | None, ) -> dict: """Process filtering for a single time coordinate.""" - from scipy.signal import medfilt - time_var = src_group.variables[time_coord_name] original_time_data = time_var[:] self.logger.info("Time coordinate %s: %d points", time_coord_name, len(original_time_data)) @@ -523,27 +522,23 @@ def _process_single_time_coordinate( # noqa: PLR0913 # First filter out values that fall outside of reasonable bounds valid_indices = self._filter_valid_time_indices(original_time_data) - # Despike to remove single point outliers before getting monotonic indices - time_data = medfilt(original_time_data[valid_indices], kernel_size=3) - - # Store valid indices and despiked data for plotting - if plot_data is not None: - plot_data["valid_indices"] = valid_indices - plot_data["valid_data"] = original_time_data[valid_indices] - plot_data["despiked"] = time_data.copy() + # Get the valid time subset + valid_time_data = original_time_data[valid_indices] - # Now apply monotonic filtering to the valid subset - mono_indices_in_filtered = self._get_monotonic_indices(time_data) + # Apply monotonic filtering to the valid subset + mono_indices_in_filtered = self._get_monotonic_indices(valid_time_data) # Convert monotonic indices back to original array indices # mono_indices_in_filtered are indices into the valid_indices subset # We need to map them back to indices in the original time array final_indices = [valid_indices[i] for i in mono_indices_in_filtered] - # Generate plot if requested for this variable + # Store data for plotting if requested if plot_data is not None: + plot_data["valid_indices"] = valid_indices + plot_data["valid_data"] = valid_time_data plot_data["final_indices"] = mono_indices_in_filtered - plot_data["final_data"] = time_data[mono_indices_in_filtered] + plot_data["final_data"] = valid_time_data[mono_indices_in_filtered] self._plot_time_filtering(plot_data) return self._create_time_filter_result( @@ -641,12 +636,11 @@ def _plot_time_filtering(self, plot_data: dict): original = plot_data["original"] valid_indices = plot_data["valid_indices"] valid_data = plot_data["valid_data"] - despiked = plot_data["despiked"] final_indices = plot_data["final_indices"] final_data = plot_data["final_data"] # Create figure with subplots - fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 9), sharex=True) + fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 8), sharex=True) # Plot 1: Original data ax1.plot(original, "b-", label="Original", alpha=0.7) @@ -672,40 +666,26 @@ def _plot_time_filtering(self, plot_data: dict): bbox={"boxstyle": "round", "facecolor": "wheat"}, ) - # Plot 3: After despiking - ax3.plot(despiked, "g-", label="After Median Filter (3-point)", alpha=0.7) + # Plot 3: Final After Monotonic filtered data + ax3.plot(final_indices, final_data, "r.-", label="After Monotonic Filter", alpha=0.7) + ax3.set_xlabel("Index") ax3.set_ylabel("Time Value") ax3.legend() ax3.grid(visible=True, alpha=0.3) - ax3.text( - 0.02, - 0.60, - f"Points removed: {len(valid_data) - len(despiked)}\n", - transform=ax3.transAxes, - verticalalignment="top", - bbox={"boxstyle": "round", "facecolor": "wheat"}, - ) - - # Plot 4: Final After Monotonic filtered data - ax4.plot(final_indices, final_data, "r.-", label="After Monotonic Filter", alpha=0.7) - ax4.set_xlabel("Index") - ax4.set_ylabel("Time Value") - ax4.legend() - ax4.grid(visible=True, alpha=0.3) # Add statistics text stats_text = ( - f"Points removed: {len(despiked) - len(final_data)}\n" + f"Points removed: {len(valid_data) - len(final_data)}\n" f"Original points: {len(original)}\n" f"After final filter: {len(final_data)}\n" f"Total removed: {len(original) - len(final_data)} " f"({100 * (len(original) - len(final_data)) / len(original):.1f}%)" ) - ax4.text( + ax3.text( 0.02, 0.90, stats_text, - transform=ax4.transAxes, + transform=ax3.transAxes, verticalalignment="top", bbox={"boxstyle": "round", "facecolor": "wheat"}, ) From 6638a4a1656b2eb7452b69e35fde24959c176870 Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 10:16:54 -0800 Subject: [PATCH 2/8] Add --plot_universals and do intersection before non-monotonic filtering. --- src/data/nc42netcdfs.py | 299 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 285 insertions(+), 14 deletions(-) diff --git a/src/data/nc42netcdfs.py b/src/data/nc42netcdfs.py index 505b2830..f4ec4485 100755 --- a/src/data/nc42netcdfs.py +++ b/src/data/nc42netcdfs.py @@ -134,6 +134,7 @@ def __init__( # noqa: PLR0913 self, log_file: str = None, plot_time: str = None, + plot_universals: bool = False, # noqa: FBT001, FBT002 filter_monotonic_time: bool = True, # noqa: FBT001, FBT002 verbose: int = 0, commandline: str = "", @@ -143,15 +144,18 @@ def __init__( # noqa: PLR0913 Args: log_file: Log file path for processing plot_time: Optional plot time specification (e.g., /latitude_time) + plot_universals: Plot longitude, latitude, depth time filtering for root group filter_monotonic_time: Filter out non-monotonic time values verbose: Verbosity level (0-2) commandline: Command line string for tracking """ self.log_file = log_file self.plot_time = plot_time + self.plot_universals = plot_universals self.filter_monotonic_time = filter_monotonic_time self.verbose = verbose self.commandline = commandline + self.universals_plot_data = {} # Store plot data for universals def download_with_pooch(self, url, local_dir, known_hash=None): """Download using pooch with caching and verification.""" @@ -310,9 +314,15 @@ def _get_time_filters_for_variables( ) time_filters[time_coord_name] = time_filter - # Align latitude and longitude in root group if needed + # For root group, apply intersection of valid indices before monotonic filtering if group_name == "/": + time_filters = self._apply_universals_intersection( + log_file, src_group, time_filters, vars_to_extract + ) time_filters = self._align_root_group_coordinates(time_filters, vars_to_extract) + # Plot universals if requested + if self.plot_universals: + self._plot_universals_filtering() return time_filters @@ -525,14 +535,40 @@ def _process_single_time_coordinate( # noqa: PLR0913 # Get the valid time subset valid_time_data = original_time_data[valid_indices] - # Apply monotonic filtering to the valid subset + # Apply monotonic filtering mono_indices_in_filtered = self._get_monotonic_indices(valid_time_data) # Convert monotonic indices back to original array indices - # mono_indices_in_filtered are indices into the valid_indices subset - # We need to map them back to indices in the original time array final_indices = [valid_indices[i] for i in mono_indices_in_filtered] + # For root group universals, store data for intersection step + is_universal_coord = group_name == "/" and time_coord_name in [ + "longitude_time", + "latitude_time", + "depth_time", + "time_time", + ] + + if is_universal_coord: + # Store intermediate data for intersection step + if self.plot_universals: + self.universals_plot_data[time_coord_name] = { + "original": original_time_data.copy(), + "valid_indices": valid_indices.copy(), + "valid_data": valid_time_data.copy(), + "mono_indices": final_indices.copy(), + "mono_data": valid_time_data[mono_indices_in_filtered].copy(), + "log_file": log_file, + } + # Return with monotonic-filtered indices - intersection happens later + return { + "indices": final_indices, + "filtered": len(final_indices) < len(original_time_data), + "comment": "Pre-intersection state", + "mono_indices": final_indices, # Store for intersection + "original_time_data": original_time_data, # Store for reprocessing + } + # Store data for plotting if requested if plot_data is not None: plot_data["valid_indices"] = valid_indices @@ -545,6 +581,62 @@ def _process_single_time_coordinate( # noqa: PLR0913 final_indices, len(original_time_data), time_coord_name ) + def _apply_universals_intersection( + self, + log_file: str, + src_group: netCDF4.Group, + time_filters: dict, + vars_to_extract: list[str], + ) -> dict: + """Apply intersection of monotonic-filtered indices for longitude, latitude, and depth. + + After outlier removal and monotonic filtering, we intersect the indices so that only + time points that exist in all three coordinates are kept. + """ + # Identify which universal coordinates are present + universal_coords = ["longitude_time", "latitude_time", "depth_time", "time_time"] + present_coords = [tc for tc in universal_coords if tc in time_filters] + + if len(present_coords) < 2: # noqa: PLR2004 + # Need at least 2 coordinates to do intersection + self.logger.info("Less than 2 universal coordinates found, skipping intersection") + return time_filters + + # Get monotonic-filtered indices for each coordinate + mono_indices_sets = {} + for time_coord in present_coords: + mono_indices_sets[time_coord] = set(time_filters[time_coord]["mono_indices"]) + + # Compute intersection of all monotonic indices + intersected_indices = set.intersection(*mono_indices_sets.values()) + intersected_indices_list = sorted(intersected_indices) + + self.logger.info( + "Intersection of universal coordinates (after monotonic filtering): %s", + ", ".join(f"{tc}: {len(mono_indices_sets[tc])} points" for tc in present_coords), + ) + self.logger.info("After intersection: %d common points", len(intersected_indices_list)) + + # Update each coordinate with intersected indices + for time_coord in present_coords: + original_time_data = time_filters[time_coord]["original_time_data"] + + # Update the filter with final indices + time_filters[time_coord] = self._create_time_filter_result( + intersected_indices_list, len(original_time_data), time_coord + ) + + # Update plot data if collecting for universals + if self.plot_universals and time_coord in self.universals_plot_data: + self.universals_plot_data[time_coord]["final_indices"] = ( + intersected_indices_list.copy() + ) + self.universals_plot_data[time_coord]["final_data"] = original_time_data[ + intersected_indices_list + ].copy() + + return time_filters + def _is_time_variable(self, var_name: str, var) -> bool: """Check if a variable is a time coordinate variable.""" # Check name pattern @@ -564,24 +656,40 @@ def _filter_valid_time_indices(self, time_data) -> list[int]: """Filter out wildly invalid time values before monotonic filtering. Returns indices of time values that are reasonable Unix epoch timestamps. - Uses numpy for efficient vectorized operations. + Uses time bounds from log file name. """ - # LRAUV data bounds: September 2012 to current + 5 years buffer - lrauv_start_date = datetime(2012, 9, 1, tzinfo=UTC) - current_date = datetime.now(UTC) - future_buffer_date = current_date.replace(year=current_date.year + 5) + # Parse filename like: 202509140809_202509150109.nc4 + # Format: YYYYMMDDHHMM_YYYYMMDDHHMM + import re + from datetime import timedelta + from pathlib import Path + + filename = Path(self.log_file).stem + match = re.search(r"(\d{12})_(\d{12})", filename) - MIN_UNIX_TIME = int(lrauv_start_date.timestamp()) # September 1, 2012 UTC - MAX_UNIX_TIME = int(future_buffer_date.timestamp()) # Current + 5 years buffer + start_str = match.group(1) + end_str = match.group(2) + + # Parse YYYYMMDDHHMM format + start_time = datetime.strptime(start_str, "%Y%m%d%H%M").replace(tzinfo=UTC) + end_time = datetime.strptime(end_str, "%Y%m%d%H%M").replace(tzinfo=UTC) + + # Add 10-minute buffer before and after + MIN_UNIX_TIME = int((start_time - timedelta(minutes=10)).timestamp()) + MAX_UNIX_TIME = int((end_time + timedelta(minutes=10)).timestamp()) + + self.logger.debug( + "Using time bounds from log file: %s to %s", + (start_time - timedelta(minutes=10)).isoformat(), + (end_time + timedelta(minutes=10)).isoformat(), + ) # Convert to numpy array for efficient operations time_array = np.asarray(time_data) - # Create boolean masks for valid conditions + # Basic validity checks is_finite = np.isfinite(time_array) is_in_range = (time_array >= MIN_UNIX_TIME) & (time_array <= MAX_UNIX_TIME) - - # Combine all conditions - all must be True for valid indices valid_mask = is_finite & is_in_range # Get indices where all conditions are met @@ -624,6 +732,153 @@ def _get_monotonic_indices(self, time_data) -> list[int]: return mono_indices + def _plot_universals_filtering(self): + """Plot longitude, latitude, and depth time coordinate filtering for root group.""" + if not MATPLOTLIB_AVAILABLE: + self.logger.error("Matplotlib not available. Install with: uv add matplotlib") + return + + if not self.universals_plot_data: + self.logger.warning("No universals plot data collected") + return + + import matplotlib.pyplot as plt + + # Determine which coordinates are available + coords = [] + coord_names = [] + for coord in ["longitude", "latitude", "depth", "time"]: + time_coord = f"{coord}_time" + if time_coord in self.universals_plot_data: + coords.append(coord) + coord_names.append(time_coord) + + if not coords: + self.logger.warning("No universal coordinates found for plotting") + return + + n_coords = len(coords) + fig, axes = plt.subplots(n_coords, 4, figsize=(18, 2.5 * n_coords), sharex="col") + + # Handle case of single coordinate + if n_coords == 1: + axes = axes.reshape(1, -1) + + log_file_name = self.universals_plot_data[coord_names[0]]["log_file"] + fig.suptitle( + f"Universal Coordinates Time Filtering\nFile: {log_file_name}", + fontsize=14, + fontweight="bold", + ) + + for i, (coord, time_coord) in enumerate(zip(coords, coord_names, strict=True)): + data = self.universals_plot_data[time_coord] + original = data["original"] + valid_indices = data["valid_indices"] + valid_data = data["valid_data"] + mono_indices = data["mono_indices"] + mono_data = data["mono_data"] + final_indices = data["final_indices"] + final_data = data["final_data"] + + self._plot_universals_row( + axes[i], + coord, + original, + valid_indices, + valid_data, + mono_indices, + mono_data, + final_indices, + final_data, + is_first_row=(i == 0), + ) + + # Set x-label only on bottom row + for j in range(4): + axes[-1, j].set_xlabel("Index") + + plt.tight_layout() + plt.show() + + self.logger.info("Universals time filtering plot displayed") + + def _plot_universals_row( # noqa: PLR0913 + self, + axes_row, + coord: str, + original, + valid_indices, + valid_data, + mono_indices, + mono_data, + final_indices, + final_data, + is_first_row: bool, # noqa: FBT001 + ): + """Plot a single row of the universals filtering plot.""" + # Column 1: Original data + axes_row[0].plot(original, "b-", alpha=0.7) + axes_row[0].set_ylabel(f"{coord}_time\n(Unix seconds)") + if is_first_row: + axes_row[0].set_title("Original") + axes_row[0].grid(visible=True, alpha=0.3) + axes_row[0].text( + 0.02, + 0.95, + f"Points: {len(original)}", + transform=axes_row[0].transAxes, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "lightblue", "alpha": 0.8}, + ) + + # Column 2: After outlier removal + axes_row[1].plot(valid_indices, valid_data, "m.-", alpha=0.7, markersize=2) + if is_first_row: + axes_row[1].set_title("After Outlier Removal") + axes_row[1].grid(visible=True, alpha=0.3) + removed_outliers = len(original) - len(valid_data) + axes_row[1].text( + 0.02, + 0.95, + f"Points: {len(valid_data)}\nRemoved: {removed_outliers}", + transform=axes_row[1].transAxes, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "plum", "alpha": 0.8}, + ) + + # Column 3: After monotonic removal + axes_row[2].plot(mono_indices, mono_data, "g.-", alpha=0.7, markersize=2) + if is_first_row: + axes_row[2].set_title("After Monotonic Removal") + axes_row[2].grid(visible=True, alpha=0.3) + removed_monotonic = len(valid_data) - len(mono_data) + axes_row[2].text( + 0.02, + 0.95, + f"Points: {len(mono_data)}\nRemoved: {removed_monotonic}", + transform=axes_row[2].transAxes, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "lightgreen", "alpha": 0.8}, + ) + + # Column 4: After intersection + axes_row[3].plot(final_indices, final_data, "r.-", alpha=0.7, markersize=2) + if is_first_row: + axes_row[3].set_title("After Intersection") + axes_row[3].grid(visible=True, alpha=0.3) + removed_intersection = len(mono_data) - len(final_data) + total_removed = len(original) - len(final_data) + axes_row[3].text( + 0.02, + 0.95, + f"Points: {len(final_data)}\nRemoved: {removed_intersection}\n" + f"Total removed: {total_removed} ({100 * total_removed / len(original):.1f}%)", + transform=axes_row[3].transAxes, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "lightcoral", "alpha": 0.8}, + ) + def _plot_time_filtering(self, plot_data: dict): """Plot before and after time coordinate filtering.""" if not MATPLOTLIB_AVAILABLE: @@ -1090,6 +1345,12 @@ def process_command_line(self): + " --log_file brizo/missionlogs/2025/20250909_20250915/20250914T080941/" + "202509140809_202509150109.nc4 --plot_time /latitude_time\n" ) + examples += ( + " " + + sys.argv[0] + + " --log_file brizo/missionlogs/2025/20250909_20250915/20250914T080941/" + + "202509140809_202509150109.nc4 --plot_universals\n" + ) # Use shared parser with nc42netcdfs-specific additions parser = get_standard_lrauv_parser( @@ -1138,12 +1399,22 @@ def process_command_line(self): "Format for is /Group/variable_name." ), ) + parser.add_argument( + "--plot_universals", + action="store_true", + help=( + "Plot time filtering for longitude, latitude, and depth coordinates " + "in the root (/) group. Shows original, after outlier removal, after " + "intersection, and after non-monotonic removal in 4-column layout." + ), + ) self.args = parser.parse_args() # Set instance attributes from parsed arguments self.log_file = self.args.log_file self.plot_time = self.args.plot_time + self.plot_universals = self.args.plot_universals self.filter_monotonic_time = self.args.filter_monotonic_time self.verbose = self.args.verbose self.commandline = " ".join(sys.argv) From 9df9466275944ddfd61f49764c677c989ceef679 Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 10:20:47 -0800 Subject: [PATCH 3/8] Shorten method names & improve formatting of log messages. --- .vscode/launch.json | 7 ++++--- src/data/combine.py | 30 +++++++++++++++++------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 0400b66d..c5cab5ae 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -58,7 +58,8 @@ //"args": ["-v", "1", "--log_file", "ahi/missionlogs/2025/20250908_20250912/20250911T201546/202509112015_202509112115.nc4", "--known_hash", "d1235ead55023bea05e9841465d54a45dfab007a283320322e28b84438fb8a85"] // brizo 20250914T080941 has bad latitude and longitude values and lots of bad Universal latitude_time and longitude_time values //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] - "args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/longitude_time"] + //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/longitude_time"] + "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_universals"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/latitude_time"] // brizo 20250916T230652 has several ESP Samples from stoqs_lrauv_sep2025 //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--plot_time", "/longitude_time"] @@ -122,11 +123,11 @@ "program": "${workspaceFolder}/src/data/combine.py", "console": "integratedTerminal", "justMyCode": false, - //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] + "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--plot"] //"args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120909T010636/201209090106_201209091521.nc4", "--plot"] // Conflicting sizes for nudged_time and data - fixed by filtering GPS fixes to be monotonically increasing - "args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot"] + //"args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot"] }, { diff --git a/src/data/combine.py b/src/data/combine.py index 7d745830..2bc4aa99 100755 --- a/src/data/combine.py +++ b/src/data/combine.py @@ -381,7 +381,7 @@ def _biolume_process(self, sensor): set_to_nan=True, ) - def _consolidate_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dict: + def _cons_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dict: # noqa: C901 """Analyze and consolidate time coordinates for a group. Returns: @@ -391,8 +391,11 @@ def _consolidate_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dic - time_coord_mapping: dict mapping original dims to consolidated dims - variable_time_coord_mapping: dict mapping variables to their time coords """ - # Find all time variables in this group - time_vars = {var: ds[var] for var in ds.variables if var.lower().endswith("time")} + if group_name.lower() == "universals": + # Do not include the "time" record variable from universals group + time_vars = {var: ds[var] for var in ds.variables if var.lower().endswith("_time")} + else: + time_vars = {var: ds[var] for var in ds.variables if var.lower().endswith("time")} if not time_vars: return { @@ -407,7 +410,7 @@ def _consolidate_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dic time_name = list(time_vars.keys())[0] consolidated_name = f"{group_name}_time" self.logger.info( - "Group %s: Single time coordinate '%s' - using as '%s'", + "Group %s: Single time coordinate '%s' -> '%s'", group_name, time_name, consolidated_name, @@ -475,7 +478,7 @@ def _consolidate_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dic time_coord_mapping = dict.fromkeys(time_vars, consolidated_name) self.logger.info( - "%-65s %s", + "%-77s %s", f"Consoliding {len(time_vars)} coordinates to", consolidated_name, ) @@ -512,15 +515,15 @@ def _consolidate_group_time_coords(self, ds: xr.Dataset, group_name: str) -> dic def _add_time_coordinates_to_combined(self, time_info: dict, ds: xr.Dataset) -> None: """Add time coordinates to the combined dataset.""" if time_info["consolidated_time_name"]: - self._add_consolidated_time_coordinate(time_info) + self._add_cons_time_coord(time_info) else: - self._add_separate_time_coordinates(time_info, ds) + self._add_sep_time_coord(time_info, ds) - def _add_consolidated_time_coordinate(self, time_info: dict) -> None: + def _add_cons_time_coord(self, time_info: dict) -> None: """Add a consolidated time coordinate to the combined dataset.""" time_name = time_info["consolidated_time_name"] self.logger.info( - "Adding consolidated time coordinate %-45s %s", + "Adding consolidated time coordinate %-44s %s", f"{time_name} as", time_name, ) @@ -531,7 +534,7 @@ def _add_consolidated_time_coordinate(self, time_info: dict) -> None: ) self.combined_nc[time_name].attrs = time_info["consolidated_time_data"].attrs.copy() - def _add_separate_time_coordinates(self, time_info: dict, ds: xr.Dataset) -> None: + def _add_sep_time_coord(self, time_info: dict, ds: xr.Dataset) -> None: """Add separate time coordinates to the combined dataset.""" for orig_time_var, new_time_var in time_info["time_coord_mapping"].items(): self.logger.info( @@ -603,7 +606,7 @@ def _add_time_metadata_to_variable(self, var_name: str, dim_name: str) -> None: ) self.combined_nc[var_name].coords[dim_name].attrs["standard_name"] = "time" - def _process_group_variables(self, ds: xr.Dataset, group_name: str, time_info: dict) -> None: + def _process_group_vars(self, ds: xr.Dataset, group_name: str, time_info: dict) -> None: """Process all data variables in a group.""" for orig_var in ds.variables: if orig_var.lower().endswith("time"): @@ -877,18 +880,19 @@ def combine_groups(self) -> None: self.combined_nc = xr.Dataset() for group_file in group_files: + self.logger.info("-" * 110) self.logger.info("Group file: %s", group_file.name) # Open group file without decoding to have np.allclose work properly with xr.open_dataset(group_file, decode_cf=False) as ds: # Group name to prepend variable names is lowercase with underscores removed group_name = group_file.stem.split(f"{GROUP}_")[1].replace("_", "").lower() - time_info = self._consolidate_group_time_coords(ds, group_name) + time_info = self._cons_group_time_coords(ds, group_name) # Add time coordinate(s) to combined dataset self._add_time_coordinates_to_combined(time_info, ds) # Process all data variables in the group - self._process_group_variables(ds, group_name, time_info) + self._process_group_vars(ds, group_name, time_info) # Add consolidation comment if applicable self._add_consolidation_comment(time_info) From 9a470061840207d11e9f137885d03dfa04e051ca Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 10:45:02 -0800 Subject: [PATCH 4/8] Remove redundant coord interpolation, fix sample_rate and comment metadata. --- .vscode/launch.json | 4 +-- src/data/align.py | 77 +++++++++++++++------------------------------ 2 files changed, 28 insertions(+), 53 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index c5cab5ae..fcb5c6ed 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -150,8 +150,8 @@ //"args": ["-v", "1", "--auv_name", "dorado", "--mission", "2008.289.03"], //"args": ["-v", "1", "--auv_name", "dorado", "--mission", "2023.192.01"], //"args": ["-v", "1", "--auv_name", "dorado", "--mission", "2024.317.01"], - //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4"] - "args": ["-v", "1", "--auv_name", "dorado", "--mission", "2025.316.02"], + "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4"] + //"args": ["-v", "1", "--auv_name", "dorado", "--mission", "2025.316.02"], }, { "name": "3.1 - align.py for LRAUV --log_file", diff --git a/src/data/align.py b/src/data/align.py index 082bafbe..b2d41dbf 100755 --- a/src/data/align.py +++ b/src/data/align.py @@ -564,53 +564,10 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 var_time = self.combined_nc[variable].get_index(timevar).view(np.int64).tolist() # Calculate sampling rate - sample_rate = np.round( - 1.0 / (np.mean(np.diff(self.combined_nc[timevar])) / np.timedelta64(1, "s")), - decimals=2, + sample_rate = 1.0 / ( + np.mean(np.diff(self.combined_nc[timevar])) / np.timedelta64(1, "s") ) - # Create interpolated coordinate variables for this group - coord_names = ["depth", "latitude", "longitude"] - coord_interps = [depth_interp, lat_interp, lon_interp] - coord_sources = [nav_coords["depth"], nav_coords["latitude"], nav_coords["longitude"]] - - for coord_name, coord_interp, coord_source in zip( - coord_names, coord_interps, coord_sources, strict=True - ): - coord_var_name = f"{group_name}_{coord_name}" - - self.aligned_nc[coord_var_name] = xr.DataArray( - coord_interp(var_time).astype(np.float64).tolist(), - dims={timevar}, - coords=[self.combined_nc[variable].get_index(timevar)], - name=coord_var_name, - ) - - # Copy attributes from source coordinate - if coord_source in self.combined_nc: - self.aligned_nc[coord_var_name].attrs = self.combined_nc[coord_source].attrs - - # Update attributes - self.aligned_nc[coord_var_name].attrs["long_name"] = coord_name.title() - self.aligned_nc[coord_var_name].attrs["instrument_sample_rate_hz"] = sample_rate - - if coord_name in ["longitude", "latitude", "depth"]: - self.aligned_nc[coord_var_name].attrs["comment"] = ( - self.aligned_nc[coord_var_name].attrs.get("comment", "") - + f". Variable {coord_source} from {src_file} file linearly" - f" interpolated onto {group_name} time values." - ) - - # Update spatial temporal bounds for global metadata - if pd.to_datetime(self.aligned_nc[timevar][0].values).tz_localize(UTC) < pd.to_datetime( - self.min_time - ): - self.min_time = pd.to_datetime(self.aligned_nc[timevar][0].values).tz_localize(UTC) - if pd.to_datetime(self.aligned_nc[timevar][-1].values).tz_localize( - UTC - ) > pd.to_datetime(self.max_time): - self.max_time = pd.to_datetime(self.aligned_nc[timevar][-1].values).tz_localize(UTC) - time_coord = variable_time_coord_mapping.get(variable) depth_coord = ( time_coord[:-5] + "_depth" @@ -639,7 +596,10 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 ) self.aligned_nc[depth_coord].attrs["long_name"] = "Depth" self.aligned_nc[depth_coord].attrs["comment"] = "depth from Group_Universals.nc" - self.aligned_nc[depth_coord].attrs["instrument_sample_rate_hz"] = sample_rate + TINY_SAMPLE_RATE = 10e-2 + self.aligned_nc[depth_coord].attrs["instrument_sample_rate_hz"] = ( + f"{sample_rate:.2f}" if sample_rate > TINY_SAMPLE_RATE else f"{sample_rate:.6f}" + ) self.aligned_nc[lat_coord] = xr.DataArray( lat_interp(var_time).astype(np.float64).tolist(), @@ -649,7 +609,7 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 ) self.aligned_nc[lat_coord].attrs = self.combined_nc["nudged_latitude"].attrs self.aligned_nc[lat_coord].attrs["comment"] += ( - f". Variable nudged_latitude from {src_file} file linearly" + f". Variable nudged_latitude linearly" f" interpolated onto {variable.split('_')[0]} time values." ) self.aligned_nc[lat_coord].attrs["long_name"] = "Latitude" @@ -663,13 +623,22 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 ) self.aligned_nc[lon_coord].attrs = self.combined_nc["nudged_longitude"].attrs self.aligned_nc[lon_coord].attrs["comment"] += ( - f". Variable nudged_longitude from {src_file} file linearly" + f". Variable nudged_longitude linearly" f" interpolated onto {variable.split('_')[0]} time values." ) self.aligned_nc[lon_coord].attrs["long_name"] = "Longitude" self.aligned_nc[lon_coord].attrs["instrument_sample_rate_hz"] = sample_rate - # Update bounds using the interpolated coordinates + # Update spatial temporal bounds for global metadata + if pd.to_datetime(self.aligned_nc[timevar][0].values).tz_localize(UTC) < pd.to_datetime( + self.min_time + ): + self.min_time = pd.to_datetime(self.aligned_nc[timevar][0].values).tz_localize(UTC) + if pd.to_datetime(self.aligned_nc[timevar][-1].values).tz_localize( + UTC + ) > pd.to_datetime(self.max_time): + self.max_time = pd.to_datetime(self.aligned_nc[timevar][-1].values).tz_localize(UTC) + if self.aligned_nc[depth_coord].min() < self.min_depth: self.min_depth = self.aligned_nc[depth_coord].min().to_numpy() if self.aligned_nc[depth_coord].max() > self.max_depth: @@ -703,8 +672,14 @@ def process_combined(self) -> Path: # noqa: C901, PLR0912, PLR0915 else: self.logger.info("Skipping setting coordinates attribute for %s", variable) - self.logger.info("%s: instrument_sample_rate_hz = %.2f", variable, sample_rate) - self.aligned_nc[variable].attrs["instrument_sample_rate_hz"] = sample_rate + self.logger.info( + "%s: instrument_sample_rate_hz = %s", + variable, + f"{sample_rate:.2f}" if sample_rate > TINY_SAMPLE_RATE else f"{sample_rate:.6f}", + ) + self.aligned_nc[variable].attrs["instrument_sample_rate_hz"] = ( + f"{sample_rate:.2f}" if sample_rate > TINY_SAMPLE_RATE else f"{sample_rate:.6f}" + ) return netcdfs_dir From 98b4668e66e228fcfc1358a354884f104a3b9cf7 Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 10:55:56 -0800 Subject: [PATCH 5/8] Add _get_pitch_corrected_instrument() for more robustness, fixup coordinate metadata. --- src/data/resample.py | 116 ++++++++++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 35 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index 87a39c3e..fb8bd5c1 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -365,6 +365,36 @@ def instruments_variables(self, nc_file: str) -> dict: instr_vars[instr].append(variable) return instr_vars + def _get_pitch_corrected_instrument(self) -> str: + # Determine which instrument provides the pitch corrected depth + # For dorado/i2map missions, this is usually 'ctd1' or 'seabird25p' + # For LRAUV missions, this is usually 'ctdseabird' or 'ctdneilbrown' + if self.log_file: + pitch_corrected_instr = "ctdseabird" + if f"{pitch_corrected_instr}_depth" in self.ds: + return pitch_corrected_instr + pitch_corrected_instr += "_sea_water_temperature" + if f"{pitch_corrected_instr}_depth" in self.ds: + return pitch_corrected_instr + pitch_corrected_instr = "ctdneilbrown" + if f"{pitch_corrected_instr}_depth" in self.ds: + return pitch_corrected_instr + else: + pitch_corrected_instr = "ctd1" + if f"{pitch_corrected_instr}_depth" in self.ds: + return pitch_corrected_instr + pitch_corrected_instr = "seabird25p" + if f"{pitch_corrected_instr}_depth" in self.ds: + return pitch_corrected_instr + + # If neither instrument found, log error and raise InvalidAlignFile exception. + self.logger.warning("No pitch corrected depth coordinate found") + self.logger.info( + "Cannot continue without a pitch corrected depth coordinate", + ) + msg = f"A pitch corrected depth coordinate was not found in {self.ds.encoding['source']}" + raise InvalidAlignFile(msg) from None + def resample_coordinates(self, instr: str, mf_width: int, freq: str) -> None: self.logger.info( "Resampling coordinates depth, latitude and longitude with" @@ -372,23 +402,32 @@ def resample_coordinates(self, instr: str, mf_width: int, freq: str) -> None: freq, mf_width, ) - # Original - try: - self.df_o[f"{instr}_depth"] = self.ds[f"{instr}_depth"].to_pandas() - except KeyError: - self.logger.warning( - "Variable %s_depth not found in %s align.nc file", - instr, - self.mission, - ) + # Find depth coordinate for the instrument - lrauv ctd may have unique coordinates + # per variable. Check for shared depth coordinate first, dorado/i2map style, then + # check for instrument specific depth coordinate. If neither found, log error and + # raise InvalidAlignFile exception. + if f"{instr}_depth" in self.ds: + depth_coord = f"{instr}_depth" + elif f"{instr}_sea_water_temperature_depth" in self.ds: + depth_coord = f"{instr}_sea_water_temperature_depth" + elif "ctdneilbrown_depth" in self.ds: + depth_coord = "ctdneilbrown_depth" + else: + self.logger.warning("No depth coordinate found for instrument %s", instr) self.logger.info( "Cannot continue without a pitch corrected depth coordinate", ) if self.log_file: - msg = f"A CTD depth was not found in {self.ds.encoding['source']}" + msg = ( + f"A depth coordinate was not found for instrument {instr} in " + f"{self.ds.encoding['source']}" + ) else: msg = f"{instr}_depth not found in {self.auv_name}_{self.mission}_align.nc4" raise InvalidAlignFile(msg) from None + + # Original + self.df_o[f"{instr}_depth"] = self.ds[depth_coord].to_pandas() try: self.df_o[f"{instr}_latitude"] = self.ds[f"{instr}_latitude"].to_pandas() self.df_o[f"{instr}_longitude"] = self.ds[f"{instr}_longitude"].to_pandas() @@ -399,6 +438,7 @@ def resample_coordinates(self, instr: str, mf_width: int, freq: str) -> None: ) self.logger.warning(msg) raise InvalidAlignFile(msg) from None + # Median Filtered - back & forward filling nan values at ends self.df_o[f"{instr}_depth_mf"] = ( self.ds[f"{instr}_depth"] @@ -450,36 +490,46 @@ def resample_coordinates(self, instr: str, mf_width: int, freq: str) -> None: ) return aggregator - def save_coordinates( + def save_coordinates( # noqa: PLR0913 self, instr: str, + pitch_corrected_instr: str, mf_width: int, freq: str, aggregator: str, ) -> None: - in_fn = self.ds.encoding["source"].split("/")[-1] self.df_r["depth"].index.rename("time", inplace=True) # noqa: PD002 self.resampled_nc["depth"] = self.df_r["depth"].to_xarray() - self.df_r["latitude"].index.rename("time", inplace=True) # noqa: PD002 - self.resampled_nc["latitude"] = self.df_r["latitude"].to_xarray() - self.df_r["longitude"].index.rename("time", inplace=True) # noqa: PD002 - self.resampled_nc["longitude"] = self.df_r["longitude"].to_xarray() - self.resampled_nc["depth"].attrs = self.ds[f"{instr}_depth"].attrs + self.resampled_nc["depth"].attrs = {} + self.resampled_nc["depth"].attrs["units"] = "meters" + self.resampled_nc["depth"].attrs["long_name"] = "Depth" self.resampled_nc["depth"].attrs["standard_name"] = "depth" - self.resampled_nc["depth"].attrs["comment"] += ( - f". {self.ds[f'{instr}_depth'].attrs['comment']}" + self.resampled_nc["depth"].attrs["comment"] = ( + f"{self.ds[f'{pitch_corrected_instr}_depth'].attrs['comment']}" f" mean sampled at {self.freq} intervals following" f" {self.mf_width} point median filter." ) - self.resampled_nc["latitude"].attrs = self.ds[f"{instr}_latitude"].attrs - self.resampled_nc["latitude"].attrs["comment"] += ( - f" Variable {instr}_latitude from {in_fn}" + + self.df_r["latitude"].index.rename("time", inplace=True) # noqa: PD002 + self.resampled_nc["latitude"] = self.df_r["latitude"].to_xarray() + self.resampled_nc["latitude"].attrs = {} + self.resampled_nc["latitude"].attrs["units"] = "degrees_north" + self.resampled_nc["latitude"].attrs["long_name"] = "Latitude" + self.resampled_nc["latitude"].attrs["standard_name"] = "latitude" + self.resampled_nc["latitude"].attrs["comment"] = ( + f"{self.ds[f'{pitch_corrected_instr}_latitude'].attrs['comment']}" f" median filtered with {mf_width} samples" f" and resampled with {aggregator} to {freq} intervals." ) - self.resampled_nc["longitude"].attrs = self.ds[f"{instr}_longitude"].attrs - self.resampled_nc["longitude"].attrs["comment"] += ( - f" Variable {instr}_longitude from {in_fn}" + + self.df_r["longitude"].index.rename("time", inplace=True) # noqa: PD002 + self.resampled_nc["longitude"] = self.df_r["longitude"].to_xarray() + self.resampled_nc["longitude"].attrs = {} + self.resampled_nc["longitude"].attrs["units"] = "degrees_east" + self.resampled_nc["longitude"].attrs["long_name"] = "Longitude" + self.resampled_nc["longitude"].attrs["standard_name"] = "longitude" + self.resampled_nc["longitude"].attrs["comment"] = ( + f"{self.ds[f'{pitch_corrected_instr}_longitude'].attrs['comment']}" f" median filtered with {mf_width} samples" f" and resampled with {aggregator} to {freq} intervals." ) @@ -1755,21 +1805,17 @@ def resample_mission( # noqa: C901, PLR0912, PLR0915, PLR0913 # use 'ctddseabird', otherwise start with 'ctd1' and fall back to # 'seabird25p' if needed for i2map missions. Early LRAUV missions # had only CTD_NeilBrown instruments, later ones had CTD_Seabird. - pitch_corrected_instr = "ctdseabird" if self.log_file else "ctd1" - if f"{pitch_corrected_instr}_depth" not in self.ds: - pitch_corrected_instr = "ctdneilbrown" - if f"{pitch_corrected_instr}_depth" not in self.ds: - pitch_corrected_instr = "seabird25p" - if pitch_corrected_instr in instrs_to_pad: - # Use navigation if seabird25p failed to record - # data all the way to the end of the mission - pitch_corrected_instr = "navigation" + pitch_corrected_instr = self._get_pitch_corrected_instrument() + if pitch_corrected_instr in instrs_to_pad: + # Use navigation if seabird25p failed to record + # data all the way to the end of the mission + pitch_corrected_instr = "navigation" aggregator = self.resample_coordinates( pitch_corrected_instr, mf_width, freq, ) - self.save_coordinates(instr, mf_width, freq, aggregator) + self.save_coordinates(instr, pitch_corrected_instr, mf_width, freq, aggregator) if self.plot: self.plot_coordinates(instr, freq, plot_seconds) self.add_profile(depth_threshold=depth_threshold) From 8e1fc9b78e451b797c65dc713a9778050f014c21 Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 13:01:59 -0800 Subject: [PATCH 6/8] Add _remove_time_spikes() and show its results in the plots. --- .vscode/launch.json | 6 +- src/data/nc42netcdfs.py | 276 +++++++++++++++++++++++++++++++++------- 2 files changed, 233 insertions(+), 49 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index fcb5c6ed..0a71902c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -59,14 +59,16 @@ // brizo 20250914T080941 has bad latitude and longitude values and lots of bad Universal latitude_time and longitude_time values //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4"] //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/longitude_time"] - "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_universals"] - //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/latitude_time"] + //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_universals"] + "args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/latitude_time"] // brizo 20250916T230652 has several ESP Samples from stoqs_lrauv_sep2025 //"args": ["-v", "2", "--log_file", "brizo/missionlogs/2025/20250916_20250922/20250916T230652/202509162306_202509180305.nc4", "--plot_time", "/longitude_time"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109_cleaned_by_quinn.nc4", "--plot_time", "/longitude_time"] //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109_cleaned_by_quinn_latlon.nc4", "--plot_time", "/longitude_time"] // Conflicting sizes for nudged_time and data - fixed by filtering GPS fixes to be monotonically increasing //"args": ["-v", "1", "--log_file", "tethys/missionlogs/2012/20120908_20120920/20120917T025522/201209170255_201209171110.nc4", "--plot_time", "/longitude_time"] + // Single spike in longitude_time + //"args": ["-v", "1", "--log_file", "brizo/missionlogs/2025/20250909_20250915/20250914T080941/202509140809_202509150109.nc4", "--plot_time", "/NAL9602/longitude_fix_time"] }, { "name": "2.0 - calibrate.py", diff --git a/src/data/nc42netcdfs.py b/src/data/nc42netcdfs.py index f4ec4485..e463f342 100755 --- a/src/data/nc42netcdfs.py +++ b/src/data/nc42netcdfs.py @@ -512,7 +512,25 @@ def _process_single_time_coordinate( # noqa: PLR0913 plot_group_name: str | None, plot_time_coord_name: str | None, ) -> dict: - """Process filtering for a single time coordinate.""" + """Process filtering for a single time coordinate. + + Applies a pipeline of filtering steps to clean time coordinate data: + 1. Range-based outlier removal (using filename time bounds ±10 min) + 2. Single-point spike detection (removes anomalous forward-then-back jumps) + 3. Monotonic filtering (ensures strictly increasing time values) + 4. Universal coordinates intersection (applied separately for lon/lat/depth/time) + + Args: + log_file: Name of the log file being processed + group_name: NetCDF group name + src_group: NetCDF group object containing the time coordinate + time_coord_name: Name of the time coordinate variable + plot_group_name: Group name for plotting (if requested) + plot_time_coord_name: Time coordinate name for plotting (if requested) + + Returns: + Dictionary with filtering results including indices and metadata + """ time_var = src_group.variables[time_coord_name] original_time_data = time_var[:] self.logger.info("Time coordinate %s: %d points", time_coord_name, len(original_time_data)) @@ -532,14 +550,28 @@ def _process_single_time_coordinate( # noqa: PLR0913 # First filter out values that fall outside of reasonable bounds valid_indices = self._filter_valid_time_indices(original_time_data) + # Remove single-point time spikes + spike_removed_indices = self._remove_time_spikes(original_time_data, valid_indices) + # Get the valid time subset - valid_time_data = original_time_data[valid_indices] + valid_time_data = original_time_data[spike_removed_indices] # Apply monotonic filtering mono_indices_in_filtered = self._get_monotonic_indices(valid_time_data) # Convert monotonic indices back to original array indices - final_indices = [valid_indices[i] for i in mono_indices_in_filtered] + final_indices = [spike_removed_indices[i] for i in mono_indices_in_filtered] + + # Store data for plotting if requested (do this before early return) + if plot_data is not None: + outlier_removed_data = original_time_data[valid_indices] + plot_data["outlier_indices"] = valid_indices + plot_data["outlier_data"] = outlier_removed_data + plot_data["spike_indices"] = spike_removed_indices + plot_data["spike_data"] = valid_time_data + plot_data["final_indices"] = mono_indices_in_filtered + plot_data["final_data"] = valid_time_data[mono_indices_in_filtered] + self._plot_time_filtering(plot_data) # For root group universals, store data for intersection step is_universal_coord = group_name == "/" and time_coord_name in [ @@ -552,10 +584,14 @@ def _process_single_time_coordinate( # noqa: PLR0913 if is_universal_coord: # Store intermediate data for intersection step if self.plot_universals: + outlier_removed_data = original_time_data[valid_indices] + spike_removed_data = original_time_data[spike_removed_indices] self.universals_plot_data[time_coord_name] = { "original": original_time_data.copy(), - "valid_indices": valid_indices.copy(), - "valid_data": valid_time_data.copy(), + "outlier_indices": valid_indices.copy(), + "outlier_data": outlier_removed_data.copy(), + "spike_indices": spike_removed_indices.copy(), + "spike_data": spike_removed_data.copy(), "mono_indices": final_indices.copy(), "mono_data": valid_time_data[mono_indices_in_filtered].copy(), "log_file": log_file, @@ -569,14 +605,6 @@ def _process_single_time_coordinate( # noqa: PLR0913 "original_time_data": original_time_data, # Store for reprocessing } - # Store data for plotting if requested - if plot_data is not None: - plot_data["valid_indices"] = valid_indices - plot_data["valid_data"] = valid_time_data - plot_data["final_indices"] = mono_indices_in_filtered - plot_data["final_data"] = valid_time_data[mono_indices_in_filtered] - self._plot_time_filtering(plot_data) - return self._create_time_filter_result( final_indices, len(original_time_data), time_coord_name ) @@ -712,6 +740,88 @@ def _filter_valid_time_indices(self, time_data) -> list[int]: return valid_indices + def _remove_time_spikes(self, time_data, indices: list[int]) -> list[int]: + """Remove single-point time spikes from a list of indices. + + A spike is detected when a time value jumps significantly forward or backward + and then returns close to the expected progression within the next few points. + + Args: + time_data: Original time data array + indices: List of indices that have passed other filters + + Returns: + List of indices with spikes removed + """ + MIN_POINTS_FOR_SPIKE_DETECTION = 3 + if len(indices) < MIN_POINTS_FOR_SPIKE_DETECTION: + return indices + + # Get time values at the filtered indices + time_values = time_data[indices] + + # Calculate time differences between consecutive points + time_diffs = np.diff(time_values) + + # Calculate median and MAD of time differences for robust statistics + median_diff = np.median(time_diffs) + abs_deviations = np.abs(time_diffs - median_diff) + mad = np.median(abs_deviations) + + if mad == 0: + # All differences are identical, no spikes to remove + return indices + + # Use modified z-score to identify anomalous jumps + # A spike threshold of 10 is quite conservative + SPIKE_THRESHOLD = 10.0 + modified_z_scores = 0.6745 * abs_deviations / mad + + # Find potential spike points where jump is anomalous + spike_candidates = np.where(modified_z_scores > SPIKE_THRESHOLD)[0] + + if len(spike_candidates) == 0: + return indices + + # Check each candidate to see if it's a single-point spike + indices_to_remove = set() + + for spike_idx in spike_candidates: + # spike_idx is the index in time_diffs array + # The spike affects index spike_idx+1 in the indices array + point_idx = spike_idx + 1 + + # Check if this is a single-point spike by looking at the next difference + if point_idx < len(time_diffs): + # If the next jump is in the opposite direction and also large, + # this is likely a spike + current_jump = time_diffs[spike_idx] + next_jump = time_diffs[point_idx] + + # Spike pattern: large jump in one direction, then large jump back + if current_jump * next_jump < 0: # Opposite signs + next_z_score = 0.6745 * abs(next_jump - median_diff) / mad + if next_z_score > SPIKE_THRESHOLD / 2: # Less strict for return jump + indices_to_remove.add(point_idx) + self.logger.debug( + "Detected time spike at index %d: %.2f -> %.2f -> %.2f", + indices[point_idx], + time_values[point_idx - 1] if point_idx > 0 else 0, + time_values[point_idx], + time_values[point_idx + 1] if point_idx < len(time_values) - 1 else 0, + ) + + if indices_to_remove: + filtered_indices = [idx for i, idx in enumerate(indices) if i not in indices_to_remove] + self.logger.info( + "Removed %d time spike(s) from %d points", + len(indices_to_remove), + len(indices), + ) + return filtered_indices + + return indices + def _get_monotonic_indices(self, time_data) -> list[int]: """Get indices for monotonic time values from time data array.""" mono_indices = [] @@ -758,7 +868,7 @@ def _plot_universals_filtering(self): return n_coords = len(coords) - fig, axes = plt.subplots(n_coords, 4, figsize=(18, 2.5 * n_coords), sharex="col") + fig, axes = plt.subplots(n_coords, 5, figsize=(22, 2.5 * n_coords), sharex="col") # Handle case of single coordinate if n_coords == 1: @@ -774,8 +884,10 @@ def _plot_universals_filtering(self): for i, (coord, time_coord) in enumerate(zip(coords, coord_names, strict=True)): data = self.universals_plot_data[time_coord] original = data["original"] - valid_indices = data["valid_indices"] - valid_data = data["valid_data"] + outlier_indices = data["outlier_indices"] + outlier_data = data["outlier_data"] + spike_indices = data["spike_indices"] + spike_data = data["spike_data"] mono_indices = data["mono_indices"] mono_data = data["mono_data"] final_indices = data["final_indices"] @@ -785,8 +897,10 @@ def _plot_universals_filtering(self): axes[i], coord, original, - valid_indices, - valid_data, + outlier_indices, + outlier_data, + spike_indices, + spike_data, mono_indices, mono_data, final_indices, @@ -795,7 +909,7 @@ def _plot_universals_filtering(self): ) # Set x-label only on bottom row - for j in range(4): + for j in range(5): axes[-1, j].set_xlabel("Index") plt.tight_layout() @@ -808,8 +922,10 @@ def _plot_universals_row( # noqa: PLR0913 axes_row, coord: str, original, - valid_indices, - valid_data, + outlier_indices, + outlier_data, + spike_indices, + spike_data, mono_indices, mono_data, final_indices, @@ -833,48 +949,82 @@ def _plot_universals_row( # noqa: PLR0913 ) # Column 2: After outlier removal - axes_row[1].plot(valid_indices, valid_data, "m.-", alpha=0.7, markersize=2) + axes_row[1].plot( + outlier_indices, outlier_data, "m.-", alpha=0.7, markersize=2, label="Data" + ) + + # Highlight spikes that will be removed + spike_indices_set = set(spike_indices) + removed_spike_indices = [idx for idx in outlier_indices if idx not in spike_indices_set] + if removed_spike_indices: + removed_spike_values = [original[idx] for idx in removed_spike_indices] + axes_row[1].plot( + removed_spike_indices, + removed_spike_values, + "rs", + markersize=8, + markerfacecolor="none", + markeredgewidth=1.5, + label=f"Spikes to remove ({len(removed_spike_indices)})", + ) + if is_first_row: axes_row[1].set_title("After Outlier Removal") + axes_row[1].legend(loc="upper right", fontsize=8) axes_row[1].grid(visible=True, alpha=0.3) - removed_outliers = len(original) - len(valid_data) + removed_outliers = len(original) - len(outlier_data) axes_row[1].text( 0.02, 0.95, - f"Points: {len(valid_data)}\nRemoved: {removed_outliers}", + f"Points: {len(outlier_data)}\nRemoved: {removed_outliers}", transform=axes_row[1].transAxes, verticalalignment="top", bbox={"boxstyle": "round", "facecolor": "plum", "alpha": 0.8}, ) - # Column 3: After monotonic removal - axes_row[2].plot(mono_indices, mono_data, "g.-", alpha=0.7, markersize=2) + # Column 3: After spike removal + axes_row[2].plot(spike_indices, spike_data, "c.-", alpha=0.7, markersize=2) if is_first_row: - axes_row[2].set_title("After Monotonic Removal") + axes_row[2].set_title("After Spike Removal") axes_row[2].grid(visible=True, alpha=0.3) - removed_monotonic = len(valid_data) - len(mono_data) + removed_spikes = len(outlier_data) - len(spike_data) axes_row[2].text( 0.02, 0.95, - f"Points: {len(mono_data)}\nRemoved: {removed_monotonic}", + f"Points: {len(spike_data)}\nRemoved: {removed_spikes}", transform=axes_row[2].transAxes, verticalalignment="top", - bbox={"boxstyle": "round", "facecolor": "lightgreen", "alpha": 0.8}, + bbox={"boxstyle": "round", "facecolor": "cyan", "alpha": 0.8}, ) - # Column 4: After intersection - axes_row[3].plot(final_indices, final_data, "r.-", alpha=0.7, markersize=2) + # Column 4: After monotonic removal + axes_row[3].plot(mono_indices, mono_data, "g.-", alpha=0.7, markersize=2) if is_first_row: - axes_row[3].set_title("After Intersection") + axes_row[3].set_title("After Monotonic Removal") axes_row[3].grid(visible=True, alpha=0.3) + removed_monotonic = len(spike_data) - len(mono_data) + axes_row[3].text( + 0.02, + 0.95, + f"Points: {len(mono_data)}\nRemoved: {removed_monotonic}", + transform=axes_row[3].transAxes, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "lightgreen", "alpha": 0.8}, + ) + + # Column 5: After intersection + axes_row[4].plot(final_indices, final_data, "r.-", alpha=0.7, markersize=2) + if is_first_row: + axes_row[4].set_title("After Intersection") + axes_row[4].grid(visible=True, alpha=0.3) removed_intersection = len(mono_data) - len(final_data) total_removed = len(original) - len(final_data) - axes_row[3].text( + axes_row[4].text( 0.02, 0.95, f"Points: {len(final_data)}\nRemoved: {removed_intersection}\n" f"Total removed: {total_removed} ({100 * total_removed / len(original):.1f}%)", - transform=axes_row[3].transAxes, + transform=axes_row[4].transAxes, verticalalignment="top", bbox={"boxstyle": "round", "facecolor": "lightcoral", "alpha": 0.8}, ) @@ -889,13 +1039,15 @@ def _plot_time_filtering(self, plot_data: dict): import matplotlib.pyplot as plt # noqa: F401 original = plot_data["original"] - valid_indices = plot_data["valid_indices"] - valid_data = plot_data["valid_data"] + outlier_indices = plot_data["outlier_indices"] + outlier_data = plot_data["outlier_data"] + spike_indices = plot_data["spike_indices"] + spike_data = plot_data["spike_data"] final_indices = plot_data["final_indices"] final_data = plot_data["final_data"] # Create figure with subplots - fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 8), sharex=True) + fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 10), sharex=True) # Plot 1: Original data ax1.plot(original, "b-", label="Original", alpha=0.7) @@ -907,40 +1059,70 @@ def _plot_time_filtering(self, plot_data: dict): ax1.legend() ax1.grid(visible=True, alpha=0.3) - # Plot 2: After valid Values filtering - ax2.plot(valid_indices, valid_data, "m.-", label="After Valid Values Filter", alpha=0.7) + # Plot 2: After outlier removal + ax2.plot(outlier_indices, outlier_data, "m.-", label="After Outlier Removal", alpha=0.7) + + # Highlight spikes that will be removed (points in outlier_indices but not in spike_indices) + spike_indices_set = set(spike_indices) + removed_spike_indices = [idx for idx in outlier_indices if idx not in spike_indices_set] + if removed_spike_indices: + removed_spike_values = [original[idx] for idx in removed_spike_indices] + ax2.plot( + removed_spike_indices, + removed_spike_values, + "rs", + markersize=10, + markerfacecolor="none", + markeredgewidth=2, + label=f"Spikes to remove ({len(removed_spike_indices)})", + ) + ax2.set_ylabel("Time Value") ax2.legend() ax2.grid(visible=True, alpha=0.3) ax2.text( 0.02, 0.60, - f"Points removed: {len(original) - len(valid_data)}\n", + f"Points removed: {len(original) - len(outlier_data)}\n", transform=ax2.transAxes, verticalalignment="top", bbox={"boxstyle": "round", "facecolor": "wheat"}, ) - # Plot 3: Final After Monotonic filtered data - ax3.plot(final_indices, final_data, "r.-", label="After Monotonic Filter", alpha=0.7) - ax3.set_xlabel("Index") + # Plot 3: After spike removal + ax3.plot(spike_indices, spike_data, "c.-", label="After Spike Removal", alpha=0.7) ax3.set_ylabel("Time Value") ax3.legend() ax3.grid(visible=True, alpha=0.3) + ax3.text( + 0.02, + 0.60, + f"Points removed: {len(outlier_data) - len(spike_data)}\n", + transform=ax3.transAxes, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "lightcyan"}, + ) + + # Plot 4: Final After Monotonic filtered data + ax4.plot(final_indices, final_data, "r.-", label="After Monotonic Filter", alpha=0.7) + ax4.set_xlabel("Index") + ax4.set_ylabel("Time Value") + ax4.legend() + ax4.grid(visible=True, alpha=0.3) # Add statistics text stats_text = ( - f"Points removed: {len(valid_data) - len(final_data)}\n" + f"Points removed: {len(spike_data) - len(final_data)}\n" f"Original points: {len(original)}\n" f"After final filter: {len(final_data)}\n" f"Total removed: {len(original) - len(final_data)} " f"({100 * (len(original) - len(final_data)) / len(original):.1f}%)" ) - ax3.text( + ax4.text( 0.02, 0.90, stats_text, - transform=ax3.transAxes, + transform=ax4.transAxes, verticalalignment="top", bbox={"boxstyle": "round", "facecolor": "wheat"}, ) From 5c80408fb79911a1085d4de73c7e05f7406ee30e Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 13:23:57 -0800 Subject: [PATCH 7/8] Update test values. --- src/data/test_process_dorado.py | 10 +++++----- src/data/test_process_i2map.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/data/test_process_dorado.py b/src/data/test_process_dorado.py index aa8c06b3..a225a1e8 100644 --- a/src/data/test_process_dorado.py +++ b/src/data/test_process_dorado.py @@ -31,9 +31,9 @@ def test_process_dorado(complete_dorado_processing): # but it will alert us if a code change unexpectedly changes the file size. # If code changes are expected to change the file size then we should # update the expected size here. - EXPECTED_SIZE_GITHUB = 621716 - EXPECTED_SIZE_ACT = 621714 - EXPECTED_SIZE_LOCAL = 621764 + EXPECTED_SIZE_GITHUB = 621410 + EXPECTED_SIZE_ACT = 621408 + EXPECTED_SIZE_LOCAL = 621458 if str(proc.args.base_path).startswith("/home/runner"): # The size is different in GitHub Actions, maybe due to different metadata assert nc_file.stat().st_size == EXPECTED_SIZE_GITHUB # noqa: S101 @@ -51,8 +51,8 @@ def test_process_dorado(complete_dorado_processing): if check_md5: # Check that the MD5 hash has not changed EXPECTED_MD5_GITHUB = "bbb9e7e9129c19e39895218d35cc474f" - EXPECTED_MD5_ACT = "1fb192c144f6f4f8f06b555a339587ed" - EXPECTED_MD5_LOCAL = "aec48d0abb58d9e831e5b22180eebd8f" + EXPECTED_MD5_ACT = "316955fd489862ad9ed5b63df8aa7db8" + EXPECTED_MD5_LOCAL = "f635cee8760aa0f40bd2070bc0c5fa65" if str(proc.args.base_path).startswith("/home/runner"): # The MD5 hash is different in GitHub Actions, maybe due to different metadata assert hashlib.md5(open(nc_file, "rb").read()).hexdigest() == EXPECTED_MD5_GITHUB # noqa: PTH123, S101, S324, SIM115 diff --git a/src/data/test_process_i2map.py b/src/data/test_process_i2map.py index 121382a1..dc972932 100644 --- a/src/data/test_process_i2map.py +++ b/src/data/test_process_i2map.py @@ -30,9 +30,9 @@ def test_process_i2map(complete_i2map_processing): # but it will alert us if a code change unexpectedly changes the file size. # If code changes are expected to change the file size then we should # update the expected size here. - EXPECTED_SIZE_GITHUB = 52994 - EXPECTED_SIZE_ACT = 52964 - EXPECTED_SIZE_LOCAL = 53094 + EXPECTED_SIZE_GITHUB = 52758 + EXPECTED_SIZE_ACT = 52728 + EXPECTED_SIZE_LOCAL = 52858 if str(proc.args.base_path).startswith("/home/runner"): # The size is different in GitHub Actions, maybe due to different metadata assert nc_file.stat().st_size == EXPECTED_SIZE_GITHUB # noqa: S101 From 9f308077a2be5bd97bea3157696562abedc7c0a9 Mon Sep 17 00:00:00 2001 From: Mike McCann Date: Sun, 7 Dec 2025 13:25:54 -0800 Subject: [PATCH 8/8] Update EXPECTED_MD5_GITHUB. --- src/data/test_process_dorado.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/test_process_dorado.py b/src/data/test_process_dorado.py index a225a1e8..88217bbc 100644 --- a/src/data/test_process_dorado.py +++ b/src/data/test_process_dorado.py @@ -50,7 +50,7 @@ def test_process_dorado(complete_dorado_processing): check_md5 = True if check_md5: # Check that the MD5 hash has not changed - EXPECTED_MD5_GITHUB = "bbb9e7e9129c19e39895218d35cc474f" + EXPECTED_MD5_GITHUB = "fec067d16eb5280a8bc2b6ef132821b8" EXPECTED_MD5_ACT = "316955fd489862ad9ed5b63df8aa7db8" EXPECTED_MD5_LOCAL = "f635cee8760aa0f40bd2070bc0c5fa65" if str(proc.args.base_path).startswith("/home/runner"):