
Commit

Merge pull request #70 from holukas/v0.71.2
V0.71.2
holukas authored Mar 18, 2024
2 parents e303170 + 31d177f commit d487c5f
Showing 17 changed files with 8,273 additions and 165 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -3,7 +3,6 @@
# e.g.: /src
/.idea/
/__local_folders

/notebooks/_scratch/
/notebooks/Workbench/FLUXNET_CH4-N2O_Committee_WP2/data/

14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

![DIIVE](images/logo_diive1_256px.png)

## v0.71.2 | 18 Mar 2024

### Notebooks

- Added new notebook for `daily_correlation` function (`notebooks/Analyses/DailyCorrelation.ipynb`)
- Added new notebook for `Histogram` class (`notebooks/Analyses/Histogram.ipynb`)

### Bugfixes & changes

- Daily correlations are now returned with a daily (`1d`) timestamp index (`diive.pkgs.analyses.correlation.daily_correlation`)
- Updated README
- Environment: Added [ruff](https://github.com/astral-sh/ruff) to dev dependencies for linting

## v0.71.1 | 15 Mar 2024

### Bugfixes & changes
20 changes: 14 additions & 6 deletions README.md
@@ -16,10 +16,11 @@ More notebooks are added constantly.

### Analyses

- Daily correlation ([notebook example](notebooks/Analyses/DailyCorrelation.ipynb))
- Decoupling
- Detect time resolution from data ([notebook example](notebooks/TimeStamps/Detect_time_resolution.ipynb))
- Find data gaps ([notebook example](notebooks/Analyses/GapFinder.ipynb))
- Histogram
- Histogram ([notebook example](notebooks/Analyses/Histogram.ipynb))
- Optimum range
- Quantiles

@@ -39,12 +40,13 @@ More notebooks are added constantly.
### Eddy covariance high-resolution

- Flux detection limit from high-resolution data
- Find maximum covariance between turbulent wind and scalar
- Wind rotation to calculate turbulent departures of wind components and scalar (e.g. CO2)

### Formats
### Files

- Convert EddyPro fluxnet output files for upload to FLUXNET
database ([notebook example](notebooks/Formats/FormatEddyProFluxnetFileForUpload.ipynb))
- Load and save parquet files ([notebook example](notebooks/Formats/LoadSaveParquetFile.ipynb))
- Detect expected and unexpected (irregular) files in a list of files
- Split multiple files into smaller parts and export them as (compressed) CSV files (see the sketch below)
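
The file-splitting item above can be sketched with plain pandas; the chunk size, file names, and column are assumptions for illustration, not the diive API:

```python
import pandas as pd

# Assumed example data: a long time series table
df = pd.DataFrame({"TA": range(1000)})

# Split into parts of 250 rows each and export every part as a gzip-compressed CSV file
chunk_size = 250
for i in range(0, len(df), chunk_size):
    part = df.iloc[i:i + chunk_size]
    part.to_csv(f"part_{i // chunk_size}.csv.gz", compression="gzip", index=False)
```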

### Fits

@@ -71,8 +73,9 @@ see [here](https://www.swissfluxnet.ethz.ch/index.php/data/ecosystem-fluxes/flux

Format data to specific formats

- Format EddyPro _fluxnet_ output file for upload to FLUXNET
- Convert EddyPro fluxnet output files for upload to FLUXNET
database ([notebook example](notebooks/Formats/FormatEddyProFluxnetFileForUpload.ipynb))
- Load and save parquet files ([notebook example](notebooks/Formats/LoadSaveParquetFile.ipynb))

### Gap-filling

@@ -111,6 +114,11 @@ Fill gaps in time series with various methods

- Time series stats ([notebook example](notebooks/Stats/TimeSeriesStats.ipynb))

### Timestamps

- Create continuous timestamp based on number of records in the file and the file duration (see the sketch after this list)
- Insert additional timestamps in various formats
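
The continuous-timestamp item above can be illustrated with a minimal pandas sketch; the record count, start time, and file duration are assumed values, not the diive API:

```python
import pandas as pd

# Assumed file properties: 48 records covering a one-day file
n_records = 48
file_start = pd.Timestamp("2024-03-18 00:30")
file_duration = pd.Timedelta(days=1)

# Infer the record interval from the record count and the file duration,
# then build a continuous timestamp for the whole file
freq = file_duration / n_records
timestamp = pd.date_range(start=file_start, periods=n_records, freq=freq, name="TIMESTAMP_END")
print(timestamp[:3])
```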

## Installation

`diive` can be installed from source code, e.g. using [`poetry`](https://python-poetry.org/) for dependencies.
6 changes: 3 additions & 3 deletions diive/core/base/flagbase.py
@@ -30,7 +30,7 @@ def __init__(self, series: Series, flagid: str, idstr: str = None, verbose: bool
def overall_flag(self) -> Series:
"""Overall flag, calculated from individual flags from multiple iterations."""
if not isinstance(self._overall_flag, Series):
raise Exception(f'No overall flag available.')
raise Exception('No overall flag available.')
return self._overall_flag

def get_flag(self):
@@ -112,7 +112,7 @@ def generate_flagname(self, iteration: int = None) -> str:
if iteration:
flagname += f'_ITER{iteration}_TEST'
else:
flagname += f'_TEST'
flagname += '_TEST'
return flagname

def generate_iteration_filtered_variable_name(self, iteration: int):
@@ -167,7 +167,7 @@ def defaultplot(self, n_iterations: int = 1):
label="outlier (rejected)", color="#F44336", alpha=1,
markersize=12, markeredgecolor='none', fmt='X')
ax_ok.plot_date(self.series[ok].index, self.series[ok],
label=f"filtered series", alpha=.5,
label="filtered series", alpha=.5,
markersize=8, markeredgecolor='none')
default_format(ax=ax_series)
default_format(ax=ax_ok)
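
The flag-naming suffix logic touched in the `generate_flagname()` hunk above can be shown as a small standalone sketch; the function below is a hypothetical simplification, not the method in `diive/core/base/flagbase.py` itself:

```python
# Hypothetical standalone sketch of the suffix logic in generate_flagname()
def generate_flagname(base: str, iteration: int = None) -> str:
    flagname = base
    if iteration:
        flagname += f"_ITER{iteration}_TEST"
    else:
        flagname += "_TEST"
    return flagname


print(generate_flagname("FLAG_OUTLIER", iteration=3))  # FLAG_OUTLIER_ITER3_TEST
print(generate_flagname("FLAG_OUTLIER"))               # FLAG_OUTLIER_TEST
```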
6 changes: 4 additions & 2 deletions diive/core/times/times.py
@@ -293,11 +293,13 @@ def validate_timestamp_naming(data: Series or DataFrame, verbose: bool = False)
"""
timestamp_name = data.index.name
allowed_timestamp_names = ['TIMESTAMP_END', 'TIMESTAMP_START', 'TIMESTAMP_MIDDLE']
if verbose: print(f"Validating timestamp naming of timestamp column {timestamp_name} ...", end=" ")
if verbose:
print(f"Validating timestamp naming of timestamp column {timestamp_name} ...", end=" ")

# First check if timestamp already has one of the required names
if any(fnmatch.fnmatch(timestamp_name, allowed_name) for allowed_name in allowed_timestamp_names):
if verbose: print("Timestamp name OK.")
if verbose:
print("Timestamp name OK.")
return timestamp_name

else:
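
A minimal usage sketch of `validate_timestamp_naming()` as changed above; the series content and index name are assumptions, while the function and its signature are taken from the diff:

```python
import pandas as pd

from diive.core.times.times import validate_timestamp_naming

# Assumed half-hourly series whose index already carries one of the allowed names
index = pd.date_range("2024-03-18 00:30", periods=4, freq="30min", name="TIMESTAMP_END")
series = pd.Series([1.0, 2.0, 3.0, 4.0], index=index, name="TA")

# With verbose=True the validation message is printed and the index name is returned
name = validate_timestamp_naming(data=series, verbose=True)
print(name)  # TIMESTAMP_END
```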
12 changes: 9 additions & 3 deletions diive/pkgs/analyses/correlation.py
@@ -9,7 +9,7 @@ def daily_correlation(s1: Series,
s2: Series,
mincorr: float = 0.8,
showplot: bool = False) -> Series:
""" Calculate daily correlation between two time series
"""Calculate daily correlation between two time series.
Args:
s1: any time series, timestamp must overlap with *s2*
@@ -22,6 +22,9 @@ def daily_correlation(s1: Series,
Returns:
series with correlations for each day
- Example notebook available in:
notebooks/Analyses/DailyCorrelation.ipynb
"""
if -1 <= mincorr <= 1:
# Use absolute value for mincorr
@@ -43,11 +46,14 @@ def daily_correlation(s1: Series,
daycorrs_index = groups.count().index
daycorrs = pd.Series(index=daycorrs_index, name='daycorrs')

# Calculate correlation between measured and potential for each day
# Calculate correlation for each day
for day, day_df in groups:
corr = day_df[s1.name].corr(day_df[s2.name])
daycorrs.loc[day] = corr

daycorrs.index = pd.to_datetime(daycorrs.index)
daycorrs = daycorrs.asfreq('1d')

if showplot:
_plot_daily_correlation(daycorrs=daycorrs, mincorr=mincorr,
df=df, s1=s1, s2=s2)
@@ -75,7 +81,7 @@ def _plot_daily_correlation(daycorrs, mincorr, df, s1, s2):
lowestcorrs = lowestcorrs.index.astype(str).to_list()
lowestdays = df['DATE'].isin(lowestcorrs)

fig = plt.figure(facecolor='white', figsize=(9, 12), dpi=150)
fig = plt.figure(facecolor='white', figsize=(8, 12), dpi=100)
gs = gridspec.GridSpec(4, 3) # rows, cols
gs.update(wspace=0.3, hspace=0.4, left=0.05, right=0.97, top=0.9, bottom=0.1)
ax1 = fig.add_subplot(gs[0, 0:])
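
A minimal sketch of calling `daily_correlation()` with the new daily (`1d`) timestamp index; the synthetic series below are assumptions, while the function signature follows the diff:

```python
import numpy as np
import pandas as pd

from diive.pkgs.analyses.correlation import daily_correlation

# Assumed synthetic half-hourly data covering ten days
index = pd.date_range("2024-03-01 00:30", periods=48 * 10, freq="30min", name="TIMESTAMP_MIDDLE")
rng = np.random.default_rng(42)
s1 = pd.Series(np.sin(np.linspace(0, 20, len(index))), index=index, name="measured")
s2 = pd.Series(s1.to_numpy() + rng.normal(0, 0.1, len(index)), index=index, name="modelled")

# One correlation value per day, returned on a continuous daily timestamp index
daycorrs = daily_correlation(s1=s1, s2=s2, mincorr=0.8, showplot=False)
print(daycorrs.head())
```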
134 changes: 24 additions & 110 deletions diive/pkgs/analyses/histogram.py
@@ -12,21 +12,21 @@


class Histogram:
"""
Calculate histogram from Series in DataFrame
"""

def __init__(self,
s: Series,
method: Literal['n_bins', 'uniques'] = 'n_bins',
n_bins: int = 10,
ignore_fringe_bins: list = None):
"""
"""Calculate histogram from Series.
Args:
s: Time series
s: A pandas Series.
method: Method used for binning data
Options:
- 'uniques': Each unique value in the dataset is a separate bin
- 'n_bins': Number of bins
n_bins: Number of bins, needed if *method* is 'n_bins', otherwise ignored.
ignore_fringe_bins: List of integers [i, j] with length 2
If a list is provided, then the first i and last j number of
bins are removed from the results and ignored during
@@ -58,6 +58,9 @@ def __init__(self,
9.60 4
9.65 6
9.70 16
- Example notebook available in:
notebooks/Analyses/Histogram.ipynb
"""
self.method = method
self.n_bins = n_bins
@@ -77,7 +80,7 @@ def results(self) -> DataFrame:

@property
def peakbins(self):
"""Returns the five bins with the most counts"""
"""Returns the five bins with the most counts in decreasing order"""
ix_maxcounts = self.results['COUNTS'].sort_values(ascending=False).head(5).index
peakbins = self.results['BIN_START_INCL'].iloc[ix_maxcounts]
return list(peakbins.values)
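
A minimal sketch of the `'uniques'` binning method described in the class docstring above; the input values are assumed, while the constructor arguments follow the diff:

```python
import pandas as pd

from diive.pkgs.analyses.histogram import Histogram

# Assumed small series: with method='uniques', every unique value becomes its own bin
s = pd.Series([1, 1, 2, 2, 2, 3, 5, 5, 9], name="VALUES")

hist = Histogram(s=s, method='uniques')
print(hist.results)    # counts per bin
print(hist.peakbins)   # up to five bins with the most counts, largest first
```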
@@ -126,110 +129,21 @@ def _binning_method(self):


def example():
# # from diive.core.io.filereader import ReadFileType
# # SOURCE = r"F:\01-NEW\FF202303\FRU\Level-0_OPENLAG_results_2005-2022\OUT_DIIVE-20230410-020904\winddir_Dataset_DIIVE-20230410-020904_Original-30T.diive.csv"
# # loaddatafile = ReadFileType(filetype='DIIVE-CSV-30MIN', filepath=SOURCE, data_nrows=None)
# # data_df, metadata_df = loaddatafile.get_filedata()
#
# # from diive.core.io.files import load_pickle, save_as_pickle
# # pickle_ = save_as_pickle(data=data_df, outpath='F:\_temp', filename='temp')
#
# from diive.core.io.files import load_pickle
# data_df = load_pickle(filepath=r"F:\Sync\luhk_work\_temp\temp.pickle")
#
# col = 'wind_dir'
# s = data_df[col].copy()
#
# s = s.loc[s.index.year <= 2022]
# # s = s.loc[s.index.year <= 2022]
# # s = s.loc[(s.index.month >= 8) & (s.index.month <= 8)]
# s = s.dropna()

# # Wind direction correction for certain years
# _locs = (s.index.year >= 2005) & (s.index.year <= 2019)
# s[_locs] -= 25
# _locs_below_zero = s < 0
# s[_locs_below_zero] += 360

# s = s.astype(int)
# rounded = round(s / 10) * 10

# # Calculate wind direction e.g. per YEAR-MONTH, e.g. 2022-06, 2022-07, ...
# sdf = pd.DataFrame(s)
# # sdf['YEAR-MONTH'] = sdf.index.year.astype(str).str.cat(sdf.index.month.astype(str).str.zfill(2), sep='-')
# # grouped = sdf.groupby(sdf['YEAR-MONTH'])
# sdf['YEAR'] = sdf.index.year.astype(str)
# grouped = sdf.groupby(sdf['YEAR'])
# wdavg = pd.Series()
# wdp25 = pd.Series()
# wdp75 = pd.Series()
# for g, d in grouped:
# wd_avg = direction_avg_kanda(angles=d['wind_dir'], agg='median')
# wdavg.loc[g] = wd_avg
# wd_p25 = direction_avg_kanda(angles=d['wind_dir'], agg='P25')
# wdp25.loc[g] = wd_p25
# wd_p75 = direction_avg_kanda(angles=d['wind_dir'], agg='P75')
# wdp75.loc[g] = wd_p75
# import matplotlib.pyplot as plt
# wdavg.plot()
# wdp25.plot()
# wdp75.plot()
# plt.show()
# plt.show()

# gr = sdf.groupby(sdf['YEAR-MONTH']).apply(direction_avg_kanda)

# s.apply(direction_avg_kanda)
# winddir_avg = direction_avg_kanda(angles=_s.to_numpy())

# from diive.core.plotting.heatmap_datetime import HeatmapDateTime
# HeatmapDateTime(series=s).show()

# # Reference histogram from 2021 and 2022
# ref_s = s.loc[s.index.year >= 2021]
# ref_histo = Histogram(s=ref_s, method='n_bins', n_bins=360)
# ref_results = ref_histo.results

# # Test year
# test_s = s.loc[s.index.year == 2020]
# shiftdf = pd.DataFrame(columns=['SHIFT', 'CORR'])
# for shift in np.arange(-100, 100, 1):
# # print(shift)
# test_s_shifted = test_s.copy()
# test_s_shifted += shift
#
# _locs_above360 = test_s_shifted > 360
# test_s_shifted[_locs_above360] -= 360
# _locs_belowzero = test_s_shifted < 0
# test_s_shifted[_locs_belowzero] += 360
#
# test_histo = Histogram(s=test_s_shifted, method='n_bins', n_bins=360)
# test_results = test_histo.results
# corr = test_results['COUNTS'].corr(ref_results['COUNTS'])
# shiftdf.loc[len(shiftdf)] = [shift, corr]
# # print(f"{shift:.1f}: corr = {corr}")
#
# import matplotlib.pyplot as plt
# shiftdf = shiftdf.set_index(keys='SHIFT', drop=True)
# shiftdf.plot()
# plt.show()
# print("X")

shiftdf = shiftdf.sort_values(by='CORR', ascending=False).copy()
shift_maxcorr = shiftdf.iloc[0].name
print(shift_maxcorr)

# g = s.groupby(s.index.year)
# for year, _s in g:
# h = Histogram(s=_s, method='n_bins', n_bins=36)
# # h = Histogram(s=s, method='uniques', ignore_fringe_bins=[1, 5])
# print(h.results)
# print(f"{year}: {h.peakbins}")

# # bins = 10
# # valuebins = True
# # binsize = 0.05
# # bins = np.arange(data.min(), data.max(), binsize)
from diive.configs.exampledata import load_exampledata_eddypro_fluxnet_CSV_30MIN
data_df, metadata_df = load_exampledata_eddypro_fluxnet_CSV_30MIN()
print(data_df.head())

series = data_df['CO2_TLAG_ACTUAL'].copy()

hist = Histogram(
s=series,
method='n_bins',
n_bins=10,
ignore_fringe_bins=[1, 1]
)

hist.results
hist.peakbins


if __name__ == '__main__':