-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrobson.py
163 lines (143 loc) · 6.61 KB
/
robson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 19 13:32:09 2017
@author: ian
"""
import datetime as dt
import numpy as np
import os
import pandas as pd
import pdb
import xlrd
#------------------------------------------------------------------------------
def align_data(df_list):
    """Join dataframes column-wise on their common 30-minute time span.

    The common span runs from the latest first timestamp to the earliest
    last timestamp across all frames in *df_list*.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Time-indexed, non-empty frames with distinct column names. Every
        frame is joined (left join onto the first frame).

    Returns
    -------
    pandas.DataFrame
        The joined frame reindexed onto a regular 30-minute index covering
        the common span.

    Raises
    ------
    ValueError
        If *df_list* is empty.
    IndexError
        If any frame in *df_list* has no rows.
    """
    begin_date = max(df.index[0] for df in df_list)
    end_date = min(df.index[-1] for df in df_list)
    new_index = pd.date_range(begin_date, end_date, freq='30min')
    # Join every frame, not only the first two, so that each frame that
    # constrained the date range also contributes its columns.
    new_df = df_list[0]
    for other_df in df_list[1:]:
        new_df = new_df.join(other_df)
    return new_df.reindex(new_index)
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def drop_data(df, date_list):
    """Blank out (set to NaN) all columns of *df* over the given date ranges.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a sorted, sliceable index.
    date_list : list of [start, end]
        Inclusive label ranges to blank out. Either bound may be the string
        '-' meaning "open-ended" (from the first row / to the last row);
        both bounds may be '-' to blank out the whole frame.
    """
    for date_pair in date_list:
        # A None slice bound is open-ended, which handles '-' at either end
        # (or at both ends) without formatting the boundary timestamps.
        start = None if date_pair[0] == '-' else date_pair[0]
        end = None if date_pair[1] == '-' else date_pair[1]
        df.loc[start: end] = np.nan
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def filter_data(df, var, limits):
    """Replace out-of-range values of column *var* with NaN, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    var : str
        Name of the column to filter.
    limits : sequence
        ``limits[0]`` is the lowest and ``limits[1]`` the highest value
        kept; values exactly equal to a limit are retained.
    """
    lower_bound, upper_bound = limits[0], limits[1]
    values = df[var]
    out_of_range = (values < lower_bound) | (values > upper_bound)
    df.loc[out_of_range, var] = np.nan
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def get_irga_data(path):
    """Read one IRGA profile CSV file and return its processed result.

    The file is read with rows 0, 2 and 3 skipped (so row 1 supplies the
    column names), rows whose TIMESTAMP cannot be parsed are discarded, and
    a 'level' column is derived from the first character of the string form
    of 'Level_&_Sample'. The frame is then passed through prep_data,
    filtered so that 'CO2_Li820' lies within [300, 900], and handed to
    process_irga_data.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    Whatever process_irga_data returns for the cleaned frame.
    """
    print('Processing file: {}'.format(os.path.basename(path)))
    column_types = {'CO2_Li820': 'float', 'Level_&_Sample': 'int'}
    # NOTE(review): error_bad_lines was removed in pandas 2.0 in favour of
    # on_bad_lines='skip'; kept here for the pandas version this script
    # targets -- confirm before upgrading.
    raw_df = pd.read_csv(path, skiprows=[0, 2, 3], na_values='NaN',
                         error_bad_lines=False, dtype=column_types)
    raw_df.index = pd.to_datetime(raw_df.TIMESTAMP, errors='coerce')
    # Unparseable timestamps were coerced to NaT; drop those rows, then the
    # now-redundant TIMESTAMP column.
    has_timestamp = pd.notnull(raw_df.index)
    df = raw_df[has_timestamp].drop('TIMESTAMP', axis=1)
    df['level'] = [str(code)[0] for code in df['Level_&_Sample']]
    prep_data(df)
    filter_data(df, 'CO2_Li820', [300, 900])
    return process_irga_data(df)
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def get_file_list(path, word):
    """Return the sorted full paths of entries in *path* containing *word*.

    Parameters
    ----------
    path : str
        Directory to list.
    word : str
        Substring that an entry name must contain to be selected.

    Returns
    -------
    list of str
        ``os.path.join(path, name)`` for each matching entry, sorted.
    """
    matching_paths = [os.path.join(path, name)
                      for name in os.listdir(path)
                      if word in name]
    matching_paths.sort()
    return matching_paths
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def get_met_data(path):
    """Read air temperature and pressure columns from a met workbook.

    Opens the sheet named 'Data', takes the column names from its first
    row and the data from row 3 onwards (rows 0-2 are treated as header).
    Column 0 is converted from Excel serial dates to datetimes and used as
    the index. Columns whose name contains 'Ta_CS' are sorted by name and
    paired, in that order, with the module-level heights_list to produce
    'Tair_{}m' columns; 'Ps_PTB110_Avg' is stored as 'ps'.

    Parameters
    ----------
    path : str
        Path of the Excel workbook.

    Returns
    -------
    pandas.DataFrame
        The datetime-indexed frame after prep_data has been applied to it.
    """
    first_data_row = 3  # rows 0, 1 and 2 are header rows
    workbook = xlrd.open_workbook(path)
    data_sheet = workbook.sheet_by_name('Data')
    column_names = data_sheet.row_values(0)
    temperature_vars = sorted([name for name in column_names
                               if 'Ta_CS' in name])
    pressure_vars = ['Ps_PTB110_Avg']
    timestamps = [xlrd.xldate_as_datetime(serial,
                                          datemode=workbook.datemode)
                  for serial in data_sheet.col_values(0, first_data_row)]
    df = pd.DataFrame(index=timestamps)
    # NOTE(review): relies on the name-sorted temperature columns lining up
    # with the order of heights_list -- confirm against the logger layout.
    output_names = dict(zip(temperature_vars,
                            ['Tair_{}m'.format(height)
                             for height in heights_list]))
    for var in temperature_vars:
        column_number = column_names.index(var)
        df[output_names[var]] = data_sheet.col_values(column_number,
                                                      first_data_row)
    for var in pressure_vars:
        column_number = column_names.index(var)
        df['ps'] = data_sheet.col_values(column_number, first_data_row)
    prep_data(df)
    return df
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def make_date_iterator(df):
    """Return (start, end) index-label pairs delimiting consecutive runs.

    A new run starts at every row where the difference between consecutive
    'Level_&_Sample' values is neither 1 nor 91. The first row always
    starts a run because its difference is NaN. Each run ends on the row
    before the next start; the final run ends on the last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'Level_&_Sample' column.

    Returns
    -------
    list of tuple
        ``(first_label, last_label)`` for each run, in row order. Empty
        when *df* has no rows.
    """
    marker = df['Level_&_Sample'] - df['Level_&_Sample'].shift()
    is_run_start = ~marker.isin([1, 91])
    # Work with row positions taken straight from the mask; looking labels
    # up with Index.get_loc yields slices or masks (not integers) when the
    # index contains duplicated timestamps.
    start_positions = np.flatnonzero(is_run_start.values)
    if len(start_positions) == 0:
        return []
    end_positions = np.append(start_positions[1:] - 1, len(df) - 1)
    return list(zip(df.index[start_positions], df.index[end_positions]))
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def make_result_dataframe(df):
    """Build an empty float frame on a 1-minute index spanning *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty, datetime-indexed frame; only its first and last index
        values are used (each rounded to the nearest minute).

    Returns
    -------
    pandas.DataFrame
        All-NaN float frame with one 'CO2_{}m' column per entry of the
        module-level heights_list.
    """
    first_minute = df.index[0].round('min')
    last_minute = df.index[-1].round('min')
    # 'min' rather than the deprecated 'T' alias, matching the rounding
    # calls above.
    index = pd.date_range(first_minute, last_minute, freq='min')
    columns = ['CO2_{}m'.format(height) for height in heights_list]
    return pd.DataFrame(index=index, columns=columns, dtype='float')
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def prep_data(df):
    """Sort *df* by index and remove duplicates, modifying it in place.

    Three steps are applied in order: sort by index, drop rows whose
    column values duplicate an earlier row, then drop rows whose index
    label duplicates an earlier row (keeping the first). Nothing is
    returned; callers rely on the in-place modification.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Index names are preserved.
    """
    df.sort_index(inplace=True)
    df.drop_duplicates(inplace=True)
    if df.index.is_unique:
        return
    # Rebinding the local name (df = df[...]) would never reach the caller,
    # so de-duplicate the index in place instead: move it into leading
    # columns, drop repeats on those columns, then restore it as the index.
    index_names = list(df.index.names)
    n_levels = df.index.nlevels
    # Clear the names on a fresh index object so reset_index cannot collide
    # with an existing column of the same name.
    df.index = df.index.set_names([None] * n_levels)
    df.reset_index(inplace=True)
    index_columns = list(df.columns[:n_levels])
    df.drop_duplicates(subset=index_columns, keep='first', inplace=True)
    df.set_index(index_columns, inplace=True)
    df.index = df.index.set_names(index_names)
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def process_irga_data(df):
    """Average 'CO2_Li820' per level over each run found in *df*.

    For every (start, end) pair from make_date_iterator, the rows in that
    label range are grouped by 'level' and the mean 'CO2_Li820' of each
    group is written to the result frame. The row used is the midpoint of
    the range rounded to the nearest minute; the column is the 'CO2_{}m'
    name for that level. Progress is printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Datetime-indexed frame with 'Level_&_Sample', 'level' (string) and
        'CO2_Li820' columns.

    Returns
    -------
    pandas.DataFrame
        The frame created by make_result_dataframe, filled with the
        per-level means.

    Raises
    ------
    KeyError
        If a 'level' value has no corresponding entry in heights_list.
    """
    print('Parsing dates: ')
    result_df = make_result_dataframe(df)
    # Level '1' maps to the first entry of heights_list, '2' to the second,
    # and so on, for however many heights are configured.
    levels_ref_dict = {str(level_number): 'CO2_{}m'.format(height)
                       for level_number, height
                       in enumerate(heights_list, 1)}
    for start_date, end_date in make_date_iterator(df):
        sub_df = df.loc[start_date: end_date]
        half_span = (sub_df.index[-1] - sub_df.index[0]) / 2
        index_name = (sub_df.index[0] + half_span).round('min')
        # Select the column before aggregating so only 'CO2_Li820' is
        # averaged rather than every column of the frame.
        mean_by_level = sub_df.groupby('level')['CO2_Li820'].mean()
        for level in mean_by_level.index:
            col_name = levels_ref_dict[level]
            result_df.loc[index_name, col_name] = mean_by_level[level]
        print(index_name)
    return result_df
#------------------------------------------------------------------------------
# Set some constants
# Directory that is searched for the 'fast_profile' (IRGA) and 'RBS' (met)
# input files
path = '/home/ian/ownCloud_dav/Shared/Monash-OzFlux/Profile_data/RobsonCreek'
#path = '/home/ian/Desktop/Robson'
# Values used to build the 'CO2_{}m' and 'Tair_{}m' column names
# (presumably the measurement heights in metres -- confirm)
heights_list = [1, 2, 3.5, 9, 21, 39]
# [start, end] ranges handed to drop_data; a '-' entry stands for the
# first / last timestamp of the frame
bad_data_list = [['2017-05-10 00:00:00', '-']]
# Construct, process, smooth and downsample IRGA dataset:
# concatenate the per-file results, average into 2-minute bins, then
# forward-fill onto a 30-minute grid
irga_fp_list = get_file_list(path, 'fast_profile')
irga_df = pd.concat([get_irga_data(fp) for fp in irga_fp_list])
prep_data(irga_df)
# 'min' and ffill() replace the deprecated 'T' alias and Resampler.pad()
irga_df = irga_df.resample('2min').mean()
irga_df = irga_df.resample('30min').ffill()
# Get met dataset
met_fp_list = get_file_list(path, 'RBS')
met_df = pd.concat([get_met_data(fp) for fp in met_fp_list])
prep_data(met_df)
# Join the datasets on their common 30-minute span and blank out the
# periods listed in bad_data_list
df = align_data([irga_df, met_df])
drop_data(df, bad_data_list)