
Commit

Merge pull request #70 from holukas/v0.71.2
V0.71.2
holukas authored Mar 18, 2024
2 parents e303170 + 31d177f commit d487c5f
Showing 17 changed files with 8,273 additions and 165 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -3,7 +3,6 @@
# e.g.: /src
/.idea/
/__local_folders

/notebooks/_scratch/
/notebooks/Workbench/FLUXNET_CH4-N2O_Committee_WP2/data/

14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

![DIIVE](images/logo_diive1_256px.png)

## v0.71.2 | 18 Mar 2024

### Notebooks

- Added new notebook for `daily_correlation` function (`notebooks/Analyses/DailyCorrelation.ipynb`)
- Added new notebook for `Histogram` class (`notebooks/Analyses/Histogram.ipynb`)

### Bugfixes & changes

- Daily correlations are now returned with a daily (`1d`) timestamp index (`diive.pkgs.analyses.correlation.daily_correlation`)
- Updated README
- Environment: Added [ruff](https://github.com/astral-sh/ruff) to dev dependencies for linting

## v0.71.1 | 15 Mar 2024

### Bugfixes & changes
20 changes: 14 additions & 6 deletions README.md
@@ -16,10 +16,11 @@ More notebooks are added constantly.

### Analyses

- Daily correlation ([notebook example](notebooks/Analyses/DailyCorrelation.ipynb))
- Decoupling
- Detect time resolution from data ([notebook example](notebooks/TimeStamps/Detect_time_resolution.ipynb))
- Find data gaps ([notebook example](notebooks/Analyses/GapFinder.ipynb))
- Histogram
- Histogram ([notebook example](notebooks/Analyses/Histogram.ipynb))
- Optimum range
- Quantiles

@@ -39,12 +40,13 @@ More notebooks are added constantly.
### Eddy covariance high-resolution

- Flux detection limit from high-resolution data
- Find maximum covariance between turbulent wind and scalar
- Wind rotation to calculate turbulent departures of wind components and scalar (e.g. CO2)

### Formats
### Files

- Convert EddyPro fluxnet output files for upload to FLUXNET
database ([notebook example](notebooks/Formats/FormatEddyProFluxnetFileForUpload.ipynb))
- Load and save parquet files ([notebook example](notebooks/Formats/LoadSaveParquetFile.ipynb))
- Detect expected and unexpected (irregular) files in a list of files
- Split multiple files into smaller parts and export them as (compressed) CSV files (see the sketch below)
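
The file-splitting item above can be sketched with plain pandas; the chunk size, file names, and column are assumptions for illustration, not the diive API:

```python
import pandas as pd

# Assumed example data: a long time series table
df = pd.DataFrame({"TA": range(1000)})

# Split into parts of 250 rows each and export every part as a gzip-compressed CSV file
chunk_size = 250
for i in range(0, len(df), chunk_size):
    part = df.iloc[i:i + chunk_size]
    part.to_csv(f"part_{i // chunk_size}.csv.gz", compression="gzip", index=False)
```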

### Fits

@@ -71,8 +73,9 @@ see [here](https://www.swissfluxnet.ethz.ch/index.php/data/ecosystem-fluxes/flux

Format data to specific formats

- Format EddyPro _fluxnet_ output file for upload to FLUXNET
- Convert EddyPro fluxnet output files for upload to FLUXNET
database ([notebook example](notebooks/Formats/FormatEddyProFluxnetFileForUpload.ipynb))
- Load and save parquet files ([notebook example](notebooks/Formats/LoadSaveParquetFile.ipynb))

### Gap-filling

@@ -111,6 +114,11 @@ Fill gaps in time series with various methods

- Time series stats ([notebook example](notebooks/Stats/TimeSeriesStats.ipynb))

### Timestamps

- Create continuous timestamp based on number of records in the file and the file duration (see the sketch after this list)
- Insert additional timestamps in various formats
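
The continuous-timestamp item above can be illustrated with a minimal pandas sketch; the record count, start time, and file duration are assumed values, not the diive API:

```python
import pandas as pd

# Assumed file properties: 48 records covering a one-day file
n_records = 48
file_start = pd.Timestamp("2024-03-18 00:30")
file_duration = pd.Timedelta(days=1)

# Infer the record interval from the record count and the file duration,
# then build a continuous timestamp for the whole file
freq = file_duration / n_records
timestamp = pd.date_range(start=file_start, periods=n_records, freq=freq, name="TIMESTAMP_END")
print(timestamp[:3])
```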

## Installation

`diive` can be installed from source code, e.g. using [`poetry`](https://python-poetry.org/) for dependencies.
6 changes: 3 additions & 3 deletions diive/core/base/flagbase.py
@@ -30,7 +30,7 @@ def __init__(self, series: Series, flagid: str, idstr: str = None, verbose: bool
def overall_flag(self) -> Series:
"""Overall flag, calculated from individual flags from multiple iterations."""
if not isinstance(self._overall_flag, Series):
raise Exception(f'No overall flag available.')
raise Exception('No overall flag available.')
return self._overall_flag

def get_flag(self):
@@ -112,7 +112,7 @@ def generate_flagname(self, iteration: int = None) -> str:
if iteration:
flagname += f'_ITER{iteration}_TEST'
else:
flagname += f'_TEST'
flagname += '_TEST'
return flagname

def generate_iteration_filtered_variable_name(self, iteration: int):
@@ -167,7 +167,7 @@ def defaultplot(self, n_iterations: int = 1):
label="outlier (rejected)", color="#F44336", alpha=1,
markersize=12, markeredgecolor='none', fmt='X')
ax_ok.plot_date(self.series[ok].index, self.series[ok],
label=f"filtered series", alpha=.5,
label="filtered series", alpha=.5,
markersize=8, markeredgecolor='none')
default_format(ax=ax_series)
default_format(ax=ax_ok)
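
The flag-naming suffix logic touched in the `generate_flagname()` hunk above can be shown as a small standalone sketch; the function below is a hypothetical simplification, not the method in `diive/core/base/flagbase.py` itself:

```python
# Hypothetical standalone sketch of the suffix logic in generate_flagname()
def generate_flagname(base: str, iteration: int = None) -> str:
    flagname = base
    if iteration:
        flagname += f"_ITER{iteration}_TEST"
    else:
        flagname += "_TEST"
    return flagname


print(generate_flagname("FLAG_OUTLIER", iteration=3))  # FLAG_OUTLIER_ITER3_TEST
print(generate_flagname("FLAG_OUTLIER"))               # FLAG_OUTLIER_TEST
```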
6 changes: 4 additions & 2 deletions diive/core/times/times.py
@@ -293,11 +293,13 @@ def validate_timestamp_naming(data: Series or DataFrame, verbose: bool = False)
"""
timestamp_name = data.index.name
allowed_timestamp_names = ['TIMESTAMP_END', 'TIMESTAMP_START', 'TIMESTAMP_MIDDLE']
if verbose: print(f"Validating timestamp naming of timestamp column {timestamp_name} ...", end=" ")
if verbose:
print(f"Validating timestamp naming of timestamp column {timestamp_name} ...", end=" ")

# First check if timestamp already has one of the required names
if any(fnmatch.fnmatch(timestamp_name, allowed_name) for allowed_name in allowed_timestamp_names):
if verbose: print("Timestamp name OK.")
if verbose:
print("Timestamp name OK.")
return timestamp_name

else:
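
A minimal usage sketch of `validate_timestamp_naming()` as changed above; the series content and index name are assumptions, while the function and its signature are taken from the diff:

```python
import pandas as pd

from diive.core.times.times import validate_timestamp_naming

# Assumed half-hourly series whose index already carries one of the allowed names
index = pd.date_range("2024-03-18 00:30", periods=4, freq="30min", name="TIMESTAMP_END")
series = pd.Series([1.0, 2.0, 3.0, 4.0], index=index, name="TA")

# With verbose=True the validation message is printed and the index name is returned
name = validate_timestamp_naming(data=series, verbose=True)
print(name)  # TIMESTAMP_END
```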
12 changes: 9 additions & 3 deletions diive/pkgs/analyses/correlation.py
@@ -9,7 +9,7 @@ def daily_correlation(s1: Series,
s2: Series,
mincorr: float = 0.8,
showplot: bool = False) -> Series:
""" Calculate daily correlation between two time series
"""Calculate daily correlation between two time series.
Args:
s1: any time series, timestamp must overlap with *s2*
@@ -22,6 +22,9 @@ def daily_correlation(s1: Series,
Returns:
series with correlations for each day
- Example notebook available in:
notebooks/Analyses/DailyCorrelation.ipynb
"""
if -1 <= mincorr <= 1:
# Use absolute value for mincorr
@@ -43,11 +46,14 @@ def daily_correlation(s1: Series,
daycorrs_index = groups.count().index
daycorrs = pd.Series(index=daycorrs_index, name='daycorrs')

# Calculate correlation between measured and potential for each day
# Calculate correlation for each day
for day, day_df in groups:
corr = day_df[s1.name].corr(day_df[s2.name])
daycorrs.loc[day] = corr

daycorrs.index = pd.to_datetime(daycorrs.index)
daycorrs = daycorrs.asfreq('1d')

if showplot:
_plot_daily_correlation(daycorrs=daycorrs, mincorr=mincorr,
df=df, s1=s1, s2=s2)
@@ -75,7 +81,7 @@ def _plot_daily_correlation(daycorrs, mincorr, df, s1, s2):
lowestcorrs = lowestcorrs.index.astype(str).to_list()
lowestdays = df['DATE'].isin(lowestcorrs)

fig = plt.figure(facecolor='white', figsize=(9, 12), dpi=150)
fig = plt.figure(facecolor='white', figsize=(8, 12), dpi=100)
gs = gridspec.GridSpec(4, 3) # rows, cols
gs.update(wspace=0.3, hspace=0.4, left=0.05, right=0.97, top=0.9, bottom=0.1)
ax1 = fig.add_subplot(gs[0, 0:])
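
A minimal sketch of calling `daily_correlation()` with the new daily (`1d`) timestamp index; the synthetic series below are assumptions, while the function signature follows the diff:

```python
import numpy as np
import pandas as pd

from diive.pkgs.analyses.correlation import daily_correlation

# Assumed synthetic half-hourly data covering ten days
index = pd.date_range("2024-03-01 00:30", periods=48 * 10, freq="30min", name="TIMESTAMP_MIDDLE")
rng = np.random.default_rng(42)
s1 = pd.Series(np.sin(np.linspace(0, 20, len(index))), index=index, name="measured")
s2 = pd.Series(s1.to_numpy() + rng.normal(0, 0.1, len(index)), index=index, name="modelled")

# One correlation value per day, returned on a continuous daily timestamp index
daycorrs = daily_correlation(s1=s1, s2=s2, mincorr=0.8, showplot=False)
print(daycorrs.head())
```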
134 changes: 24 additions & 110 deletions diive/pkgs/analyses/histogram.py
@@ -12,21 +12,21 @@


class Histogram:
"""
Calculate histogram from Series in DataFrame
"""

def __init__(self,
s: Series,
method: Literal['n_bins', 'uniques'] = 'n_bins',
n_bins: int = 10,
ignore_fringe_bins: list = None):
"""
"""Calculate histogram from Series.
Args:
s: Time series
s: A pandas Series.
method: Method used for binning data
Options:
- 'uniques': Each unique value in the dataset is a separate bin
- 'n_bins': Number of bins
n_bins: Number of bins, needed if *method* is 'n_bins', otherwise ignored.
ignore_fringe_bins: List of integers [i, j] with length 2
If a list is provided, then the first i and last j number of
bins are removed from the results and ignored during
@@ -58,6 +58,9 @@ def __init__(self,
9.60 4
9.65 6
9.70 16
- Example notebook available in:
notebooks/Analyses/Histogram.ipynb
"""
self.method = method
self.n_bins = n_bins
@@ -77,7 +80,7 @@ def results(self) -> DataFrame:

@property
def peakbins(self):
"""Returns the five bins with the most counts"""
"""Returns the five bins with the most counts in decreasing order"""
ix_maxcounts = self.results['COUNTS'].sort_values(ascending=False).head(5).index
peakbins = self.results['BIN_START_INCL'].iloc[ix_maxcounts]
return list(peakbins.values)
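
A minimal sketch of the `'uniques'` binning method described in the class docstring above; the input values are assumed, while the constructor arguments follow the diff:

```python
import pandas as pd

from diive.pkgs.analyses.histogram import Histogram

# Assumed small series: with method='uniques', every unique value becomes its own bin
s = pd.Series([1, 1, 2, 2, 2, 3, 5, 5, 9], name="VALUES")

hist = Histogram(s=s, method='uniques')
print(hist.results)    # counts per bin
print(hist.peakbins)   # up to five bins with the most counts, largest first
```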
@@ -126,110 +129,21 @@ def _binning_method(self):


def example():
# # from diive.core.io.filereader import ReadFileType
# # SOURCE = r"F:\01-NEW\FF202303\FRU\Level-0_OPENLAG_results_2005-2022\OUT_DIIVE-20230410-020904\winddir_Dataset_DIIVE-20230410-020904_Original-30T.diive.csv"
# # loaddatafile = ReadFileType(filetype='DIIVE-CSV-30MIN', filepath=SOURCE, data_nrows=None)
# # data_df, metadata_df = loaddatafile.get_filedata()
#
# # from diive.core.io.files import load_pickle, save_as_pickle
# # pickle_ = save_as_pickle(data=data_df, outpath='F:\_temp', filename='temp')
#
# from diive.core.io.files import load_pickle
# data_df = load_pickle(filepath=r"F:\Sync\luhk_work\_temp\temp.pickle")
#
# col = 'wind_dir'
# s = data_df[col].copy()
#
# s = s.loc[s.index.year <= 2022]
# # s = s.loc[s.index.year <= 2022]
# # s = s.loc[(s.index.month >= 8) & (s.index.month <= 8)]
# s = s.dropna()

# # Wind direction correction for certain years
# _locs = (s.index.year >= 2005) & (s.index.year <= 2019)
# s[_locs] -= 25
# _locs_below_zero = s < 0
# s[_locs_below_zero] += 360

# s = s.astype(int)
# rounded = round(s / 10) * 10

# # Calculate wind direction e.g. per YEAR-MONTH, e.g. 2022-06, 2022-07, ...
# sdf = pd.DataFrame(s)
# # sdf['YEAR-MONTH'] = sdf.index.year.astype(str).str.cat(sdf.index.month.astype(str).str.zfill(2), sep='-')
# # grouped = sdf.groupby(sdf['YEAR-MONTH'])
# sdf['YEAR'] = sdf.index.year.astype(str)
# grouped = sdf.groupby(sdf['YEAR'])
# wdavg = pd.Series()
# wdp25 = pd.Series()
# wdp75 = pd.Series()
# for g, d in grouped:
# wd_avg = direction_avg_kanda(angles=d['wind_dir'], agg='median')
# wdavg.loc[g] = wd_avg
# wd_p25 = direction_avg_kanda(angles=d['wind_dir'], agg='P25')
# wdp25.loc[g] = wd_p25
# wd_p75 = direction_avg_kanda(angles=d['wind_dir'], agg='P75')
# wdp75.loc[g] = wd_p75
# import matplotlib.pyplot as plt
# wdavg.plot()
# wdp25.plot()
# wdp75.plot()
# plt.show()
# plt.show()

# gr = sdf.groupby(sdf['YEAR-MONTH']).apply(direction_avg_kanda)

# s.apply(direction_avg_kanda)
# winddir_avg = direction_avg_kanda(angles=_s.to_numpy())

# from diive.core.plotting.heatmap_datetime import HeatmapDateTime
# HeatmapDateTime(series=s).show()

# # Reference histogram from 2021 and 2022
# ref_s = s.loc[s.index.year >= 2021]
# ref_histo = Histogram(s=ref_s, method='n_bins', n_bins=360)
# ref_results = ref_histo.results

# # Test year
# test_s = s.loc[s.index.year == 2020]
# shiftdf = pd.DataFrame(columns=['SHIFT', 'CORR'])
# for shift in np.arange(-100, 100, 1):
# # print(shift)
# test_s_shifted = test_s.copy()
# test_s_shifted += shift
#
# _locs_above360 = test_s_shifted > 360
# test_s_shifted[_locs_above360] -= 360
# _locs_belowzero = test_s_shifted < 0
# test_s_shifted[_locs_belowzero] += 360
#
# test_histo = Histogram(s=test_s_shifted, method='n_bins', n_bins=360)
# test_results = test_histo.results
# corr = test_results['COUNTS'].corr(ref_results['COUNTS'])
# shiftdf.loc[len(shiftdf)] = [shift, corr]
# # print(f"{shift:.1f}: corr = {corr}")
#
# import matplotlib.pyplot as plt
# shiftdf = shiftdf.set_index(keys='SHIFT', drop=True)
# shiftdf.plot()
# plt.show()
# print("X")

shiftdf = shiftdf.sort_values(by='CORR', ascending=False).copy()
shift_maxcorr = shiftdf.iloc[0].name
print(shift_maxcorr)

# g = s.groupby(s.index.year)
# for year, _s in g:
# h = Histogram(s=_s, method='n_bins', n_bins=36)
# # h = Histogram(s=s, method='uniques', ignore_fringe_bins=[1, 5])
# print(h.results)
# print(f"{year}: {h.peakbins}")

# # bins = 10
# # valuebins = True
# # binsize = 0.05
# # bins = np.arange(data.min(), data.max(), binsize)
from diive.configs.exampledata import load_exampledata_eddypro_fluxnet_CSV_30MIN
data_df, metadata_df = load_exampledata_eddypro_fluxnet_CSV_30MIN()
print(data_df.head())

series = data_df['CO2_TLAG_ACTUAL'].copy()

hist = Histogram(
s=series,
method='n_bins',
n_bins=10,
ignore_fringe_bins=[1, 1]
)

hist.results
hist.peakbins


if __name__ == '__main__':