From 9ebd42bf43c68ee7e5a955396f668c9fa7259666 Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Mon, 14 Apr 2025 19:14:11 -0700 Subject: [PATCH 1/9] draft adding proxy corrections Signed-off-by: Michael Anderson --- src/data/resample.py | 86 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/data/resample.py b/src/data/resample.py index 162a5028..639e5d93 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -664,6 +664,92 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 f" = {proxy_ratio_adinos:.4e} and proxy_cal_factor = {proxy_cal_factor:.6f}" ) + def correct_biolume_proxies( + self, + adinos_threshold: float = 0.1, + correction_threshold: int = 3, + fluoBL_threshold: float = 0.25, # use 0.45 for pearson corr + corr_type: str = 'spearman', # or pearson + depth_threshold: float = 2.0, + minutes_from_surface_threshold: int = 5 + ) -> None: + df = self.df_r[ + [ + 'depth', + 'biolum_proxy_diatoms', 'biolum_proxy_adinos', 'biolum_proxy_hdinos', + 'fluo', 'bg_biolum' + ] + ].copy(deep=True) + df['fluoBL_corr'] = \ + ('time', np.full_like(self.resampled_nc["profile_number"], np.nan)) + + profile_series = self.resampled_nc['profile_number'].to_series() + df['profile_number'] = profile_series.reindex(self.df_r.index, method='ffill') + # make unique profiles across all surveys + profil = np.cumsum(np.abs(np.diff(df['profile_number'], prepend=0))) + + # new proxies are the "N" fields + for new, old in zip( + ['diatomsN','adinosN','hdinosN'], + ['biolum_proxy_diatoms','biolum_proxy_adinos','biolum_proxy_hdinos'] + ): + df[new] = ('time', np.full_like(df[old], np.nan)) + + # compute correlation per profil and then correct proxies + dt_5mins = np.timedelta64(timedelta(minutes=minutes_from_surface_threshold)) + for iprofil_ in tqdm(range(1, int(np.max(profil)))): + iprofil = (profil == iprofil_) + # excludes surface, must be within 5 min of it + ideep = iprofil & (df.depth > depth_threshold) + itime = (df.index > (df.index[ideep].min() - dt_5mins)) & \ + (df.index < (df.index[ideep].max() + dt_5mins)) + iprofil = iprofil & itime + if not np.any(iprofil): + #print(f'no corrections possible for {iprofil_=}') + continue + auv_profil = df.loc[iprofil] + if auv_profil.shape[0] > correction_threshold: + if np.sum(auv_profil.adinos > adinos_threshold) < correction_threshold: + # no correction for low fluo & biolum values + fluoBL_corr = 1.0 + else: + # correlation between fluo and bg_biolum computed on high + # adino values for each profile + idepth = auv_profil.depth < auv_profil.depth[ + auv_profil.adinos > adinos_threshold + ].max() + # pandas' corr ignores NaN + auv_profil_idepth = auv_profil[['fluo', 'bg_biolum']].loc[idepth] + fluoBL_corr = auv_profil_idepth.fluo.corr(auv_profil_idepth.bg_biolum, + method=corr_type) + + # save correlation + df.fluoBL_corr[iprofil] = fluoBL_corr + + # scale between 0 and 1 first + fluoBL_correctionfactor = (fluoBL_corr + 1.0) / 2.0 + + # then scale between fluoBL_threshold and 1 + fluoBL_correctionfactor = \ + fluoBL_correctionfactor * (1.0 - fluoBL_threshold) + fluoBL_threshold + + # can happen if fluoBL_threshold is negative + fluoBL_correctionfactor = max(fluoBL_correctionfactor, 0.0) + + df.adinosN[iprofil] = df.adinos[iprofil] * fluoBL_correctionfactor + + # preserving adinos+diatoms + df.diatomsN[iprofil] = \ + df.adinos[iprofil] + df.diatoms[iprofil] - df.adinosN[iprofil] + + # preserving adinos+hdinos + df.hdinosN[iprofil] = \ + df.adinos[iprofil] + df.hdinos[iprofil] - df.adinosN[iprofil] + + self.df_r.biolum_proxy_adinos[iprofil] = df.adinosN[iprofil] + self.df_r.biolum_proxy_diatoms[iprofil] = df.diatomsN[iprofil] + self.df_r.biolum_proxy_hdinos[iprofil] = df.hdinosN[iprofil] + def resample_variable( # noqa: PLR0913 self, instr: str, From 7e1453f47be56b2d869b2df40ce26e4ff32ceed3 Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Mon, 14 Apr 2025 19:24:32 -0700 Subject: [PATCH 2/9] linters Signed-off-by: Michael Anderson --- src/data/resample.py | 60 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index 639e5d93..b85d6165 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -664,50 +664,50 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 f" = {proxy_ratio_adinos:.4e} and proxy_cal_factor = {proxy_cal_factor:.6f}" ) - def correct_biolume_proxies( + def correct_biolume_proxies( # noqa: PLR0913 self, adinos_threshold: float = 0.1, correction_threshold: int = 3, - fluoBL_threshold: float = 0.25, # use 0.45 for pearson corr + fluo_bl_threshold: float = 0.25, # use 0.45 for pearson corr corr_type: str = 'spearman', # or pearson depth_threshold: float = 2.0, minutes_from_surface_threshold: int = 5 ) -> None: - df = self.df_r[ - [ - 'depth', - 'biolum_proxy_diatoms', 'biolum_proxy_adinos', 'biolum_proxy_hdinos', - 'fluo', 'bg_biolum' - ] - ].copy(deep=True) - df['fluoBL_corr'] = \ + df_p = self.df_r[ + [ + 'depth', + 'biolum_proxy_diatoms', 'biolum_proxy_adinos', 'biolum_proxy_hdinos', + 'fluo', 'bg_biolum' + ] + ].copy(deep=True) + df_p['fluoBL_corr'] = \ ('time', np.full_like(self.resampled_nc["profile_number"], np.nan)) profile_series = self.resampled_nc['profile_number'].to_series() - df['profile_number'] = profile_series.reindex(self.df_r.index, method='ffill') + df_p['profile_number'] = profile_series.reindex(df_p.index, method='ffill') # make unique profiles across all surveys - profil = np.cumsum(np.abs(np.diff(df['profile_number'], prepend=0))) + profil = np.cumsum(np.abs(np.diff(df_p['profile_number'], prepend=0))) # new proxies are the "N" fields for new, old in zip( ['diatomsN','adinosN','hdinosN'], ['biolum_proxy_diatoms','biolum_proxy_adinos','biolum_proxy_hdinos'] ): - df[new] = ('time', np.full_like(df[old], np.nan)) + df_p[new] = ('time', np.full_like(df_p[old], np.nan)) # compute correlation per profil and then correct proxies dt_5mins = np.timedelta64(timedelta(minutes=minutes_from_surface_threshold)) - for iprofil_ in tqdm(range(1, int(np.max(profil)))): + for iprofil_ in range(1, int(np.max(profil))): iprofil = (profil == iprofil_) # excludes surface, must be within 5 min of it - ideep = iprofil & (df.depth > depth_threshold) - itime = (df.index > (df.index[ideep].min() - dt_5mins)) & \ - (df.index < (df.index[ideep].max() + dt_5mins)) + ideep = iprofil & (df_p.depth > depth_threshold) + itime = (df_p.index > (df_p.index[ideep].min() - dt_5mins)) & \ + (df_p.index < (df_p.index[ideep].max() + dt_5mins)) iprofil = iprofil & itime if not np.any(iprofil): #print(f'no corrections possible for {iprofil_=}') continue - auv_profil = df.loc[iprofil] + auv_profil = df_p.loc[iprofil] if auv_profil.shape[0] > correction_threshold: if np.sum(auv_profil.adinos > adinos_threshold) < correction_threshold: # no correction for low fluo & biolum values @@ -724,31 +724,31 @@ def correct_biolume_proxies( method=corr_type) # save correlation - df.fluoBL_corr[iprofil] = fluoBL_corr + df_p.fluoBL_corr[iprofil] = fluoBL_corr # scale between 0 and 1 first fluoBL_correctionfactor = (fluoBL_corr + 1.0) / 2.0 - # then scale between fluoBL_threshold and 1 + # then scale between fluo_bl_threshold and 1 fluoBL_correctionfactor = \ - fluoBL_correctionfactor * (1.0 - fluoBL_threshold) + fluoBL_threshold + fluoBL_correctionfactor * (1.0 - fluo_bl_threshold) + fluo_bl_threshold - # can happen if fluoBL_threshold is negative + # can happen if fluo_bl_threshold is negative fluoBL_correctionfactor = max(fluoBL_correctionfactor, 0.0) - df.adinosN[iprofil] = df.adinos[iprofil] * fluoBL_correctionfactor + df_p.adinosN[iprofil] = df_p.adinos[iprofil] * fluoBL_correctionfactor # preserving adinos+diatoms - df.diatomsN[iprofil] = \ - df.adinos[iprofil] + df.diatoms[iprofil] - df.adinosN[iprofil] + df_p.diatomsN[iprofil] = \ + df_p.adinos[iprofil] + df_p.diatoms[iprofil] - df_p.adinosN[iprofil] # preserving adinos+hdinos - df.hdinosN[iprofil] = \ - df.adinos[iprofil] + df.hdinos[iprofil] - df.adinosN[iprofil] + df_p.hdinosN[iprofil] = \ + df_p.adinos[iprofil] + df_p.hdinos[iprofil] - df_p.adinosN[iprofil] - self.df_r.biolum_proxy_adinos[iprofil] = df.adinosN[iprofil] - self.df_r.biolum_proxy_diatoms[iprofil] = df.diatomsN[iprofil] - self.df_r.biolum_proxy_hdinos[iprofil] = df.hdinosN[iprofil] + self.df_r.biolum_proxy_adinos[iprofil] = df_p.adinosN[iprofil] + self.df_r.biolum_proxy_diatoms[iprofil] = df_p.diatomsN[iprofil] + self.df_r.biolum_proxy_hdinos[iprofil] = df_p.hdinosN[iprofil] def resample_variable( # noqa: PLR0913 self, From 0463f8064e9c368a3e4ca9f3c818f8559ddc1473 Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Mon, 14 Apr 2025 19:43:57 -0700 Subject: [PATCH 3/9] ruff linter Signed-off-by: Michael Anderson --- src/data/resample.py | 69 +++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index b85d6165..822145d5 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -669,43 +669,47 @@ def correct_biolume_proxies( # noqa: PLR0913 adinos_threshold: float = 0.1, correction_threshold: int = 3, fluo_bl_threshold: float = 0.25, # use 0.45 for pearson corr - corr_type: str = 'spearman', # or pearson + corr_type: str = "spearman", # or pearson depth_threshold: float = 2.0, - minutes_from_surface_threshold: int = 5 + minutes_from_surface_threshold: int = 5, ) -> None: df_p = self.df_r[ - [ - 'depth', - 'biolum_proxy_diatoms', 'biolum_proxy_adinos', 'biolum_proxy_hdinos', - 'fluo', 'bg_biolum' - ] - ].copy(deep=True) - df_p['fluoBL_corr'] = \ - ('time', np.full_like(self.resampled_nc["profile_number"], np.nan)) - - profile_series = self.resampled_nc['profile_number'].to_series() - df_p['profile_number'] = profile_series.reindex(df_p.index, method='ffill') + [ + "depth", + "biolum_proxy_diatoms", + "biolum_proxy_adinos", + "biolum_proxy_hdinos", + "fluo", + "bg_biolum", + ] + ].copy(deep=True) + df_p["fluoBL_corr"] = ("time", np.full_like(self.resampled_nc["profile_number"], np.nan)) + + profile_series = self.resampled_nc["profile_number"].to_series() + df_p["profile_number"] = profile_series.reindex(df_p.index, method="ffill") # make unique profiles across all surveys - profil = np.cumsum(np.abs(np.diff(df_p['profile_number'], prepend=0))) + profil = np.cumsum(np.abs(np.diff(df_p["profile_number"], prepend=0))) # new proxies are the "N" fields for new, old in zip( - ['diatomsN','adinosN','hdinosN'], - ['biolum_proxy_diatoms','biolum_proxy_adinos','biolum_proxy_hdinos'] + ["diatomsN", "adinosN", "hdinosN"], + ["biolum_proxy_diatoms", "biolum_proxy_adinos", "biolum_proxy_hdinos"], + strict=False, ): - df_p[new] = ('time', np.full_like(df_p[old], np.nan)) + df_p[new] = ("time", np.full_like(df_p[old], np.nan)) # compute correlation per profil and then correct proxies dt_5mins = np.timedelta64(timedelta(minutes=minutes_from_surface_threshold)) for iprofil_ in range(1, int(np.max(profil))): - iprofil = (profil == iprofil_) + iprofil = profil == iprofil_ # excludes surface, must be within 5 min of it ideep = iprofil & (df_p.depth > depth_threshold) - itime = (df_p.index > (df_p.index[ideep].min() - dt_5mins)) & \ - (df_p.index < (df_p.index[ideep].max() + dt_5mins)) + itime = (df_p.index > (df_p.index[ideep].min() - dt_5mins)) & ( + df_p.index < (df_p.index[ideep].max() + dt_5mins) + ) iprofil = iprofil & itime if not np.any(iprofil): - #print(f'no corrections possible for {iprofil_=}') + # print(f'no corrections possible for {iprofil_=}') continue auv_profil = df_p.loc[iprofil] if auv_profil.shape[0] > correction_threshold: @@ -715,13 +719,15 @@ def correct_biolume_proxies( # noqa: PLR0913 else: # correlation between fluo and bg_biolum computed on high # adino values for each profile - idepth = auv_profil.depth < auv_profil.depth[ - auv_profil.adinos > adinos_threshold - ].max() + idepth = ( + auv_profil.depth + < auv_profil.depth[auv_profil.adinos > adinos_threshold].max() + ) # pandas' corr ignores NaN - auv_profil_idepth = auv_profil[['fluo', 'bg_biolum']].loc[idepth] - fluoBL_corr = auv_profil_idepth.fluo.corr(auv_profil_idepth.bg_biolum, - method=corr_type) + auv_profil_idepth = auv_profil[["fluo", "bg_biolum"]].loc[idepth] + fluoBL_corr = auv_profil_idepth.fluo.corr( + auv_profil_idepth.bg_biolum, method=corr_type + ) # save correlation df_p.fluoBL_corr[iprofil] = fluoBL_corr @@ -730,8 +736,9 @@ def correct_biolume_proxies( # noqa: PLR0913 fluoBL_correctionfactor = (fluoBL_corr + 1.0) / 2.0 # then scale between fluo_bl_threshold and 1 - fluoBL_correctionfactor = \ + fluoBL_correctionfactor = ( fluoBL_correctionfactor * (1.0 - fluo_bl_threshold) + fluo_bl_threshold + ) # can happen if fluo_bl_threshold is negative fluoBL_correctionfactor = max(fluoBL_correctionfactor, 0.0) @@ -739,12 +746,14 @@ def correct_biolume_proxies( # noqa: PLR0913 df_p.adinosN[iprofil] = df_p.adinos[iprofil] * fluoBL_correctionfactor # preserving adinos+diatoms - df_p.diatomsN[iprofil] = \ + df_p.diatomsN[iprofil] = ( df_p.adinos[iprofil] + df_p.diatoms[iprofil] - df_p.adinosN[iprofil] + ) # preserving adinos+hdinos - df_p.hdinosN[iprofil] = \ + df_p.hdinosN[iprofil] = ( df_p.adinos[iprofil] + df_p.hdinos[iprofil] - df_p.adinosN[iprofil] + ) self.df_r.biolum_proxy_adinos[iprofil] = df_p.adinosN[iprofil] self.df_r.biolum_proxy_diatoms[iprofil] = df_p.diatomsN[iprofil] From 71b088c0ff764f907d6cf4710373a03bc975ffa8 Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Tue, 15 Apr 2025 15:19:25 -0700 Subject: [PATCH 4/9] pass fluo from add_biolume_proxies to correction; get depth and profile from resampled_nc Signed-off-by: Michael Anderson --- src/data/resample.py | 67 ++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index 822145d5..cca6df48 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -610,6 +610,7 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 self.df_r["biolume_bg_biolume"].attrs["units"] = "photons/liter" self.df_r["biolume_bg_biolume"].attrs["comment"] = zero_note + fluo = None nighttime_bl_raw, sunsets, sunrises = self.select_nighttime_bl_raw() if nighttime_bl_raw.empty: self.logger.info( @@ -621,7 +622,7 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 self.logger.info( "No hs2_fl700 data. Not computing adinos, diatoms, and hdinos", ) - return + return None fluo = ( self.resampled_nc["hs2_fl700"] .where( @@ -664,8 +665,11 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 f" = {proxy_ratio_adinos:.4e} and proxy_cal_factor = {proxy_cal_factor:.6f}" ) + return fluo + def correct_biolume_proxies( # noqa: PLR0913 self, + biolume_fluo: float, # from add_biolume_proxies adinos_threshold: float = 0.1, correction_threshold: int = 3, fluo_bl_threshold: float = 0.25, # use 0.45 for pearson corr @@ -673,17 +677,23 @@ def correct_biolume_proxies( # noqa: PLR0913 depth_threshold: float = 2.0, minutes_from_surface_threshold: int = 5, ) -> None: - df_p = self.df_r[ - [ - "depth", - "biolum_proxy_diatoms", - "biolum_proxy_adinos", - "biolum_proxy_hdinos", - "fluo", - "bg_biolum", - ] - ].copy(deep=True) - df_p["fluoBL_corr"] = ("time", np.full_like(self.resampled_nc["profile_number"], np.nan)) + variables = [ + "biolume_proxy_diatoms", + "biolume_proxy_adinos", + "biolume_proxy_hdinos", + "biolume_bg_biolume", + ] + try: + df_p = self.df_r[variables].copy(deep=True) + except KeyError: + # We didn't add biolum proxies this round... + return + + df_p["biolume_fluo"] = biolume_fluo + df_p["fluoBL_corr"] = ("time", np.full_like(df_p.biolume_fluo, np.nan)) + + depth_series = self.resampled_nc["depth"].to_series() + df_p["depth"] = depth_series.reindex(df_p.index, method="ffill") profile_series = self.resampled_nc["profile_number"].to_series() df_p["profile_number"] = profile_series.reindex(df_p.index, method="ffill") @@ -713,7 +723,10 @@ def correct_biolume_proxies( # noqa: PLR0913 continue auv_profil = df_p.loc[iprofil] if auv_profil.shape[0] > correction_threshold: - if np.sum(auv_profil.adinos > adinos_threshold) < correction_threshold: + if ( + np.sum(auv_profil.biolume_proxy_adinos > adinos_threshold) + < correction_threshold + ): # no correction for low fluo & biolum values fluoBL_corr = 1.0 else: @@ -721,16 +734,23 @@ def correct_biolume_proxies( # noqa: PLR0913 # adino values for each profile idepth = ( auv_profil.depth - < auv_profil.depth[auv_profil.adinos > adinos_threshold].max() + < auv_profil.depth[auv_profil.biolume_proxy_adinos > adinos_threshold].max() ) + auv_profil_idepth = auv_profil[["biolume_fluo", "biolume_bg_biolume"]].loc[ + idepth + ] # pandas' corr ignores NaN - auv_profil_idepth = auv_profil[["fluo", "bg_biolum"]].loc[idepth] - fluoBL_corr = auv_profil_idepth.fluo.corr( - auv_profil_idepth.bg_biolum, method=corr_type + fluoBL_corr = auv_profil_idepth.biolume_fluo.corr( + auv_profil_idepth.biolume_bg_biolume, method=corr_type ) # save correlation df_p.fluoBL_corr[iprofil] = fluoBL_corr + self.logger.info( + "Correcting proxies for profile=%d using fluoBL_corr=%.4f", + iprofil_, + fluoBL_corr, + ) # scale between 0 and 1 first fluoBL_correctionfactor = (fluoBL_corr + 1.0) / 2.0 @@ -743,16 +763,20 @@ def correct_biolume_proxies( # noqa: PLR0913 # can happen if fluo_bl_threshold is negative fluoBL_correctionfactor = max(fluoBL_correctionfactor, 0.0) - df_p.adinosN[iprofil] = df_p.adinos[iprofil] * fluoBL_correctionfactor + df_p.adinosN[iprofil] = df_p.biolume_proxy_adinos[iprofil] * fluoBL_correctionfactor # preserving adinos+diatoms df_p.diatomsN[iprofil] = ( - df_p.adinos[iprofil] + df_p.diatoms[iprofil] - df_p.adinosN[iprofil] + df_p.biolume_proxy_adinos[iprofil] + + df_p.biolume_proxy_diatoms[iprofil] + - df_p.adinosN[iprofil] ) # preserving adinos+hdinos df_p.hdinosN[iprofil] = ( - df_p.adinos[iprofil] + df_p.hdinos[iprofil] - df_p.adinosN[iprofil] + df_p.biolume_proxy_adinos[iprofil] + + df_p.biolume_proxy_hdinos[iprofil] + - df_p.adinosN[iprofil] ) self.df_r.biolum_proxy_adinos[iprofil] = df_p.adinosN[iprofil] @@ -773,7 +797,8 @@ def resample_variable( # noqa: PLR0913 if instr == "biolume" and variable == "biolume_raw": # Only biolume_avg_biolume and biolume_flow treated like other data # All other biolume variables in self.df_r[] are computed from biolume_raw - self.add_biolume_proxies(freq) + biolume_fluo = self.add_biolume_proxies(freq) + self.correct_biolume_proxies(biolume_fluo) else: self.df_o[variable] = self.ds[variable].to_pandas() self.df_o[f"{variable}_mf"] = ( From 3216e2db9bdc1386d4935b2586cf14287ca5ce3e Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Wed, 7 May 2025 16:22:10 -0700 Subject: [PATCH 5/9] tested proxy corrections with sample mission Signed-off-by: Michael Anderson --- src/data/resample.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index cca6df48..dee68c6f 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -672,7 +672,7 @@ def correct_biolume_proxies( # noqa: PLR0913 biolume_fluo: float, # from add_biolume_proxies adinos_threshold: float = 0.1, correction_threshold: int = 3, - fluo_bl_threshold: float = 0.25, # use 0.45 for pearson corr + fluo_bl_threshold: float = 0.35, # use 0.45 for pearson corr corr_type: str = "spearman", # or pearson depth_threshold: float = 2.0, minutes_from_surface_threshold: int = 5, @@ -690,23 +690,32 @@ def correct_biolume_proxies( # noqa: PLR0913 return df_p["biolume_fluo"] = biolume_fluo - df_p["fluoBL_corr"] = ("time", np.full_like(df_p.biolume_fluo, np.nan)) + df_p["fluoBL_corr"] = np.full_like(df_p.biolume_fluo, np.nan) depth_series = self.resampled_nc["depth"].to_series() df_p["depth"] = depth_series.reindex(df_p.index, method="ffill") profile_series = self.resampled_nc["profile_number"].to_series() df_p["profile_number"] = profile_series.reindex(df_p.index, method="ffill") + df_p.biolume_bg_biolume.dropna(inplace=True) # make unique profiles across all surveys - profil = np.cumsum(np.abs(np.diff(df_p["profile_number"], prepend=0))) + df_p.dropna(inplace=True, subset=[ + "depth", + "profile_number", + "biolume_proxy_diatoms", + "biolume_proxy_adinos", + "biolume_proxy_hdinos", + "biolume_fluo" + ]) + profil = np.cumsum(np.abs(np.diff(df_p.profile_number, prepend=0))) # new proxies are the "N" fields for new, old in zip( ["diatomsN", "adinosN", "hdinosN"], - ["biolum_proxy_diatoms", "biolum_proxy_adinos", "biolum_proxy_hdinos"], + ["biolume_proxy_diatoms", "biolume_proxy_adinos", "biolume_proxy_hdinos"], strict=False, ): - df_p[new] = ("time", np.full_like(df_p[old], np.nan)) + df_p[new] = np.full_like(df_p[old], np.nan) # compute correlation per profil and then correct proxies dt_5mins = np.timedelta64(timedelta(minutes=minutes_from_surface_threshold)) @@ -779,9 +788,10 @@ def correct_biolume_proxies( # noqa: PLR0913 - df_p.adinosN[iprofil] ) - self.df_r.biolum_proxy_adinos[iprofil] = df_p.adinosN[iprofil] - self.df_r.biolum_proxy_diatoms[iprofil] = df_p.diatomsN[iprofil] - self.df_r.biolum_proxy_hdinos[iprofil] = df_p.hdinosN[iprofil] + target_indices = df_p.index[iprofil] + self.df_r.loc[target_indices, 'biolume_proxy_adinos'] = df_p.adinosN.loc[iprofil] + self.df_r.loc[target_indices, 'biolume_proxy_diatoms'] = df_p.diatomsN.loc[iprofil] + self.df_r.loc[target_indices, 'biolume_proxy_hdinos'] = df_p.hdinosN.loc[iprofil] def resample_variable( # noqa: PLR0913 self, From 282132bfa1807f2cbc208ccd7d43920964269011 Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Wed, 21 May 2025 14:36:48 -0700 Subject: [PATCH 6/9] proxies corrections edge cases; make linters happy Signed-off-by: Michael Anderson --- src/data/resample.py | 178 +++++++++++++++++++++++++++++++++---------- 1 file changed, 137 insertions(+), 41 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index dee68c6f..b7c79a98 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -15,7 +15,7 @@ import sys import time from collections import defaultdict -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta from pathlib import Path from socket import gethostname @@ -51,7 +51,7 @@ class Resampler: def __init__(self) -> None: plt.rcParams["figure.figsize"] = (15, 5) self.resampled_nc = xr.Dataset() - iso_now = datetime.now(tz=timezone.utc).isoformat().split(".")[0] + "Z" + iso_now = datetime.now(tz=UTC).isoformat().split(".")[0] + "Z" # Common static attributes for all auv platforms self.metadata = {} self.metadata["netcdf_version"] = "4" @@ -75,7 +75,7 @@ def _build_global_metadata(self) -> None: e, ) gitcommit = "" - iso_now = datetime.now(tz=timezone.utc).isoformat().split(".")[0] + "Z" + iso_now = datetime.now(tz=UTC).isoformat().split(".")[0] + "Z" # Common dynamic attributes for all auv platforms self.metadata["time_coverage_start"] = str(min(self.resampled_nc.time.values)) self.metadata["time_coverage_end"] = str(max(self.resampled_nc.time.values)) @@ -373,7 +373,7 @@ def select_nighttime_bl_raw( get_altitude( lat, lon, - datetime.fromtimestamp(ts.astype(int) / 1.0e9, tz=timezone.utc), + datetime.fromtimestamp(ts.astype(int) / 1.0e9, tz=UTC), ), ) @@ -461,7 +461,7 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 flash_threshold: float = FLASH_THRESHOLD, proxy_ratio_adinos: float = 3.9811e13, # 4-Oct-2010 to 2-Dec-2020 value proxy_cal_factor=0.00470, # Same as used in 5.2-mpm-bg_biolume-PiO-paper.ipynb - ) -> None: + ) -> tuple[pd.Series, list[datetime], list[datetime]]: # Add variables via the calculations according to Appendix B in # "Using fluorescence and bioluminescence sensors to characterize # auto- and heterotrophic plankton communities" by Messie et al." @@ -622,7 +622,7 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 self.logger.info( "No hs2_fl700 data. Not computing adinos, diatoms, and hdinos", ) - return None + return fluo, sunsets, sunrises fluo = ( self.resampled_nc["hs2_fl700"] .where( @@ -665,15 +665,17 @@ def add_biolume_proxies( # noqa: PLR0913, PLR0915 f" = {proxy_ratio_adinos:.4e} and proxy_cal_factor = {proxy_cal_factor:.6f}" ) - return fluo + return fluo, sunsets, sunrises - def correct_biolume_proxies( # noqa: PLR0913 + def correct_biolume_proxies( # noqa: PLR0913, PLR0915 self, - biolume_fluo: float, # from add_biolume_proxies + biolume_fluo: pd.Series, # from add_biolume_proxies + biolume_sunsets: list[datetime], # from add_biolume_proxies + biolume_sunrises: list[datetime], # from add_biolume_proxies adinos_threshold: float = 0.1, correction_threshold: int = 3, fluo_bl_threshold: float = 0.35, # use 0.45 for pearson corr - corr_type: str = "spearman", # or pearson + corr_type: str = "pearson", # "spearman" or "pearson" depth_threshold: float = 2.0, minutes_from_surface_threshold: int = 5, ) -> None: @@ -693,21 +695,32 @@ def correct_biolume_proxies( # noqa: PLR0913 df_p["fluoBL_corr"] = np.full_like(df_p.biolume_fluo, np.nan) depth_series = self.resampled_nc["depth"].to_series() - df_p["depth"] = depth_series.reindex(df_p.index, method="ffill") + # df_p["depth"] = depth_series.reindex(df_p.index, method="ffill") + df_p = pd.merge_asof( + df_p, depth_series.to_frame(), left_index=True, right_index=True, direction="nearest" + ) + + self.logger.info( + "correct proxies: df_p depth count=%d nans=%d, resampled_nc depth count=%d nans=%d", + df_p.depth.count(), + df_p.depth.isna().sum(), + self.resampled_nc.depth.count(), + self.resampled_nc.depth.isnull().sum(), # noqa: PD003 + ) profile_series = self.resampled_nc["profile_number"].to_series() - df_p["profile_number"] = profile_series.reindex(df_p.index, method="ffill") - df_p.biolume_bg_biolume.dropna(inplace=True) - # make unique profiles across all surveys - df_p.dropna(inplace=True, subset=[ - "depth", - "profile_number", - "biolume_proxy_diatoms", - "biolume_proxy_adinos", - "biolume_proxy_hdinos", - "biolume_fluo" - ]) - profil = np.cumsum(np.abs(np.diff(df_p.profile_number, prepend=0))) + # df_p["profile_number"] = profile_series.reindex(df_p.index, method="ffill") + df_p = pd.merge_asof( + df_p, profile_series.to_frame(), left_index=True, right_index=True, direction="nearest" + ) + + self.logger.info( + "correct proxies: df_p max profile=%d (nans=%d), resampled_nc max profile=%d (nans=%d)", + df_p.profile_number.max(), + df_p.profile_number.isna().sum(), + self.resampled_nc.profile_number.max(), + self.resampled_nc.profile_number.isnull().sum(), # noqa: PD003 + ) # new proxies are the "N" fields for new, old in zip( @@ -717,10 +730,52 @@ def correct_biolume_proxies( # noqa: PLR0913 ): df_p[new] = np.full_like(df_p[old], np.nan) + def _interval_contains_sunevent( + start: pd.Timestamp, end: pd.Timestamp, events: pd.DatetimeIndex + ) -> bool: + mask = (events >= start) & (events <= end) + return mask.any() + + biolume_sunsets = pd.DatetimeIndex(biolume_sunsets).sort_values() + biolume_sunrises = pd.DatetimeIndex(biolume_sunrises).sort_values() + profile_intervals = ( + df_p.groupby("profile_number") + .apply(lambda g: (g.index.min(), g.index.max()), include_groups=False) + .rename("interval") + .apply(pd.Series) + .rename(columns={0: "start", 1: "end"}) + ) + profile_intervals["has_sunset"] = profile_intervals.apply( + lambda row: _interval_contains_sunevent(row["start"], row["end"], biolume_sunsets), + axis=1, + ) + profile_intervals["has_sunrise"] = profile_intervals.apply( + lambda row: _interval_contains_sunevent(row["start"], row["end"], biolume_sunrises), + axis=1, + ) + profile_intervals["has_sunevent"] = ( + profile_intervals["has_sunrise"] | profile_intervals["has_sunset"] + ) + df_p["has_sunset"] = df_p["profile_number"].map(profile_intervals["has_sunset"]) + df_p["has_sunrise"] = df_p["profile_number"].map(profile_intervals["has_sunrise"]) + df_p["has_sunevent"] = df_p["profile_number"].map(profile_intervals["has_sunevent"]) + # compute correlation per profil and then correct proxies + profil = df_p.profile_number dt_5mins = np.timedelta64(timedelta(minutes=minutes_from_surface_threshold)) - for iprofil_ in range(1, int(np.max(profil))): + for iprofil_ in range(1, int(np.max(profil)) + 1): iprofil = profil == iprofil_ + has_sunevent = df_p.loc[iprofil, "has_sunevent"].any() + if has_sunevent: # set proxies for this profile to NaN + self.logger.info( + "Processing profile=%d for proxy correction: found sun event -- set NaN", + iprofil_, + ) + target_indices = df_p.index[iprofil] + self.df_r.loc[target_indices, "biolume_proxy_adinos"] = np.nan + self.df_r.loc[target_indices, "biolume_proxy_diatoms"] = np.nan + self.df_r.loc[target_indices, "biolume_proxy_hdinos"] = np.nan + continue # excludes surface, must be within 5 min of it ideep = iprofil & (df_p.depth > depth_threshold) itime = (df_p.index > (df_p.index[ideep].min() - dt_5mins)) & ( @@ -731,13 +786,35 @@ def correct_biolume_proxies( # noqa: PLR0913 # print(f'no corrections possible for {iprofil_=}') continue auv_profil = df_p.loc[iprofil] + self.logger.info( + "Processing profile=%d for proxy correction: total_points=%d > thresh=%d ?", + iprofil_, + auv_profil.shape[0], + correction_threshold, + ) if auv_profil.shape[0] > correction_threshold: if ( np.sum(auv_profil.biolume_proxy_adinos > adinos_threshold) < correction_threshold ): + if auv_profil.biolume_proxy_adinos.count() == 0: # all proxies are NaN so skip + self.logger.info( + "Correcting proxies: valid adinos=%d < thresh=%d -- all NaN so skip", + np.sum(auv_profil.biolume_proxy_adinos > adinos_threshold), + correction_threshold, + ) + continue # no correction for low fluo & biolum values fluoBL_corr = 1.0 + self.logger.info( + "Correcting proxies: valid adinos=%d < thresh=%d" + " -- using fluoBL_corr=%.4f, total_size_adinos=%d, nans=%d", + np.sum(auv_profil.biolume_proxy_adinos > adinos_threshold), + correction_threshold, + fluoBL_corr, + auv_profil.biolume_proxy_adinos.shape[0], + auv_profil.biolume_proxy_adinos.isna().sum(), + ) else: # correlation between fluo and bg_biolum computed on high # adino values for each profile @@ -745,21 +822,33 @@ def correct_biolume_proxies( # noqa: PLR0913 auv_profil.depth < auv_profil.depth[auv_profil.biolume_proxy_adinos > adinos_threshold].max() ) - auv_profil_idepth = auv_profil[["biolume_fluo", "biolume_bg_biolume"]].loc[ - idepth - ] + auv_profil_idepth = auv_profil[ + ["biolume_fluo", "biolume_bg_biolume", "depth"] + ].loc[idepth] # pandas' corr ignores NaN fluoBL_corr = auv_profil_idepth.biolume_fluo.corr( auv_profil_idepth.biolume_bg_biolume, method=corr_type ) + self.logger.info( + "Correcting proxies: valid adinos=%d > thresh=%d" + " -- using fluoBL_corr=%.4f, total_size_idepth=%d, nans=%d," + " min_depth=%.4f, max_depth=%.4f", + np.sum(auv_profil.biolume_proxy_adinos > adinos_threshold), + correction_threshold, + fluoBL_corr, + auv_profil_idepth.shape[0], + auv_profil.biolume_proxy_adinos.isna().sum(), + auv_profil_idepth.depth.min(), + auv_profil_idepth.depth.max(), + ) # save correlation - df_p.fluoBL_corr[iprofil] = fluoBL_corr - self.logger.info( - "Correcting proxies for profile=%d using fluoBL_corr=%.4f", - iprofil_, - fluoBL_corr, - ) + df_p.loc[iprofil, "fluoBL_corr"] = fluoBL_corr + # self.logger.info( + # "Correcting proxies for profile=%d using fluoBL_corr=%.4f", + # iprofil_, + # fluoBL_corr, + # ) # scale between 0 and 1 first fluoBL_correctionfactor = (fluoBL_corr + 1.0) / 2.0 @@ -772,26 +861,33 @@ def correct_biolume_proxies( # noqa: PLR0913 # can happen if fluo_bl_threshold is negative fluoBL_correctionfactor = max(fluoBL_correctionfactor, 0.0) - df_p.adinosN[iprofil] = df_p.biolume_proxy_adinos[iprofil] * fluoBL_correctionfactor + df_p.loc[iprofil, "adinosN"] = ( + df_p.biolume_proxy_adinos[iprofil] * fluoBL_correctionfactor + ) # preserving adinos+diatoms - df_p.diatomsN[iprofil] = ( + df_p.loc[iprofil, "diatomsN"] = ( df_p.biolume_proxy_adinos[iprofil] + df_p.biolume_proxy_diatoms[iprofil] - df_p.adinosN[iprofil] ) # preserving adinos+hdinos - df_p.hdinosN[iprofil] = ( + df_p.loc[iprofil, "hdinosN"] = ( df_p.biolume_proxy_adinos[iprofil] + df_p.biolume_proxy_hdinos[iprofil] - df_p.adinosN[iprofil] ) target_indices = df_p.index[iprofil] - self.df_r.loc[target_indices, 'biolume_proxy_adinos'] = df_p.adinosN.loc[iprofil] - self.df_r.loc[target_indices, 'biolume_proxy_diatoms'] = df_p.diatomsN.loc[iprofil] - self.df_r.loc[target_indices, 'biolume_proxy_hdinos'] = df_p.hdinosN.loc[iprofil] + self.df_r.loc[target_indices, "biolume_proxy_adinos"] = df_p.adinosN.loc[iprofil] + self.df_r.loc[target_indices, "biolume_proxy_diatoms"] = df_p.diatomsN.loc[iprofil] + self.df_r.loc[target_indices, "biolume_proxy_hdinos"] = df_p.hdinosN.loc[iprofil] + else: + self.logger.info( + "profile=%d skipped for proxy correction", + iprofil_, + ) def resample_variable( # noqa: PLR0913 self, @@ -807,8 +903,8 @@ def resample_variable( # noqa: PLR0913 if instr == "biolume" and variable == "biolume_raw": # Only biolume_avg_biolume and biolume_flow treated like other data # All other biolume variables in self.df_r[] are computed from biolume_raw - biolume_fluo = self.add_biolume_proxies(freq) - self.correct_biolume_proxies(biolume_fluo) + biolume_fluo, biolume_sunsets, biolume_sunrises = self.add_biolume_proxies(freq) + self.correct_biolume_proxies(biolume_fluo, biolume_sunsets, biolume_sunrises) else: self.df_o[variable] = self.ds[variable].to_pandas() self.df_o[f"{variable}_mf"] = ( From a8179f27fda99219dc8eb3512938b7c5243aac5f Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Wed, 21 May 2025 14:42:03 -0700 Subject: [PATCH 7/9] put UTC back to timezone.utc for python3.10 Signed-off-by: Michael Anderson --- src/data/resample.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data/resample.py b/src/data/resample.py index b7c79a98..a81131e5 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -15,7 +15,7 @@ import sys import time from collections import defaultdict -from datetime import UTC, datetime, timedelta +from datetime import datetime, timedelta, timezone from pathlib import Path from socket import gethostname @@ -51,7 +51,7 @@ class Resampler: def __init__(self) -> None: plt.rcParams["figure.figsize"] = (15, 5) self.resampled_nc = xr.Dataset() - iso_now = datetime.now(tz=UTC).isoformat().split(".")[0] + "Z" + iso_now = datetime.now(tz=timezone.utc).isoformat().split(".")[0] + "Z" # Common static attributes for all auv platforms self.metadata = {} self.metadata["netcdf_version"] = "4" @@ -75,7 +75,7 @@ def _build_global_metadata(self) -> None: e, ) gitcommit = "" - iso_now = datetime.now(tz=UTC).isoformat().split(".")[0] + "Z" + iso_now = datetime.now(tz=timezone.utc).isoformat().split(".")[0] + "Z" # Common dynamic attributes for all auv platforms self.metadata["time_coverage_start"] = str(min(self.resampled_nc.time.values)) self.metadata["time_coverage_end"] = str(max(self.resampled_nc.time.values)) @@ -373,7 +373,7 @@ def select_nighttime_bl_raw( get_altitude( lat, lon, - datetime.fromtimestamp(ts.astype(int) / 1.0e9, tz=UTC), + datetime.fromtimestamp(ts.astype(int) / 1.0e9, tz=timezone.utc), ), ) From 2d54f722e12174c7094841516dd6aee1c763c0bd Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Fri, 23 May 2025 15:08:52 -0700 Subject: [PATCH 8/9] remove include_groups field in pandas lambda for now Signed-off-by: Michael Anderson --- src/data/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/resample.py b/src/data/resample.py index a81131e5..7c407db4 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -740,7 +740,7 @@ def _interval_contains_sunevent( biolume_sunrises = pd.DatetimeIndex(biolume_sunrises).sort_values() profile_intervals = ( df_p.groupby("profile_number") - .apply(lambda g: (g.index.min(), g.index.max()), include_groups=False) + .apply(lambda g: (g.index.min(), g.index.max())) # pandas2.2.3: include_groups=False .rename("interval") .apply(pd.Series) .rename(columns={0: "start", 1: "end"}) From 8a998c8a14fca45b804b59c78805a3c30f66f08e Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Tue, 27 May 2025 10:56:09 -0700 Subject: [PATCH 9/9] no reason to exclude that last depth point since that's still above adinos_threshold Signed-off-by: Michael Anderson --- src/data/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/resample.py b/src/data/resample.py index 7c407db4..64b7b73e 100755 --- a/src/data/resample.py +++ b/src/data/resample.py @@ -820,7 +820,7 @@ def _interval_contains_sunevent( # adino values for each profile idepth = ( auv_profil.depth - < auv_profil.depth[auv_profil.biolume_proxy_adinos > adinos_threshold].max() + <= auv_profil.depth[auv_profil.biolume_proxy_adinos > adinos_threshold].max() ) auv_profil_idepth = auv_profil[ ["biolume_fluo", "biolume_bg_biolume", "depth"]