-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfractal_calculator.py
142 lines (114 loc) · 5.69 KB
/
fractal_calculator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/python
import numpy as np
import pandas as pd
import Preprocessor
import FractalDimensionsSerializer
import FractalDimensions
import Discretizer
#######################################################################
# Auxiliary functions needed in pre-processing
#######################################################################
def add_minutes_diff_45or15(datetime_value):
    """Return the artificial offset, in minutes, for a timestamp.

    Timestamps whose minute component is exactly 15 or 45 get an offset of 5,
    so that a caller adding it turns xx:15 into xx:20 and xx:45 into xx:50.
    Every other timestamp gets an offset of 0.

    Args:
        datetime_value: Any object exposing a ``minute`` attribute
            (for example ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns:
        int: 5 when ``datetime_value.minute`` is 15 or 45, otherwise 0.
    """
    # Both quarter-hour marks receive the same 5-minute shift, so a single
    # membership test replaces the nested if/else chain.
    return 5 if datetime_value.minute in (15, 45) else 0
################################################
# Main execution loop
################################################
# NOTES:
# - do_smoothening = 0: use the Power Production training data as-is
# - do_smoothening = 1: smooth the Power Production time series first
do_smoothening = 0

############################################
# Step 1: Reading Raw data into memory
############################################
# Load the raw CSV into a pandas DataFrame. 'Time (CET+0100)' becomes the
# index, and both timestamp columns are parsed as dates.
file_name = 'training.csv'
df = pd.read_csv(
    file_name,
    encoding='ISO-8859-1',
    low_memory=True,
    index_col='Time (CET+0100)',
    parse_dates=['Time (CET+0100)', 'Zeitstempel'],
)

# Keep only the columns the training set needs.
# 'Time (CET+0100)' is the index at this point, not a column, so filter()
# finds no such column label and that entry has no effect on the selection.
valuable_df = df.filter(
    items=['Time (CET+0100)',
           'Temperature, C (observation)',
           'Wind speed, km/h (observation)',
           'Wind direction, degrees (observation)',
           'Zeitstempel',
           'Produktion [kW]'],
    axis='columns',
)

# Split into independent variables (weather observations) and the dependent
# variable (power production); the two frames are cleaned separately later
# (NA elimination and timestamp homogenisation) before being merged again.
observations_df = valuable_df.filter(
    items=['Time (CET+0100)',
           'Temperature, C (observation)',
           'Wind speed, km/h (observation)',
           'Wind direction, degrees (observation)'],
    axis='columns',
)
results_df = valuable_df.filter(
    items=['Zeitstempel', 'Produktion [kW]'],
    axis='columns',
)

# Release the references to the intermediate frames
valuable_df = df = None
############################################
# Step 2: Pre-processing
############################################
# Step 2a: Drop observation rows whose timestamp index is missing (NaT).
# NOTE(review): only the index is checked here; rows holding NaN in a
# measurement column are kept - confirm whether a full dropna() was intended.
# TODO: In production mode, this can be replaced with a smart imputation, using one of
#       climate/weather data homogenisation algorithms
print("Dimentions of the training dataframe before NA elimantion: ", observations_df.shape)
# Build the boolean mask straight from the index instead of copying the index
# into a temporary column, filtering on it and dropping it again.
observations_df = observations_df[observations_df.index.notnull()]
print("Dimentions of the training dataframe after NA elimantion: ", observations_df.shape)
# Step 2b: shift the production timestamps at xx:45 to xx:50 and at xx:15 to
# xx:20 (add_minutes_diff_45or15 yields 5 minutes for those marks, 0 otherwise).
# Presumably this aligns 'Zeitstempel' with the observation timestamps it is
# merged against later - confirm against the raw data.
# TODO: In production mode, this shall be revisited based on the imputation/homogenisation algo chosen
results_df['Zeitstempel'] = results_df['Zeitstempel'] + results_df['Zeitstempel'].apply(
    lambda stamp: pd.Timedelta(add_minutes_diff_45or15(stamp), 'm')
)
print("Preprocessing, step 2b, completed")
if do_smoothening == 1:
    # Step 2c: Apply custom pre-processing and smoothing to the power production result variable
    Kt = 0.3  # TODO: make it a configurable parameter in production mode
    preprocessor = Preprocessor.Preprocessor(results_df['Produktion [kW]'].values, Kt)
    print("Preprocessing, step 3: Custom production power preprocessor initialized")
    q = 4  # TODO: make it a configurable parameter in production mode
    smooth_data = preprocessor.smoother(results_df['Produktion [kW]'].values, q)
    # Write the smoothed series back through column assignment: Series.values
    # is a read-only property, so assigning to it raises AttributeError.
    # NOTE(review): assumes smoother() returns one value per input row - confirm.
    results_df['Produktion [kW]'] = smooth_data
    print("Preprocessing, step 2c: Production power preprocessor data smoothened")
else:
    print("Preprocessing, step 2c: Skipped")
# Step 2d: merge observations_df and results_df to create the final training set dataframe
# Rebuild the observations frame so that the time index is exposed as a
# regular 'Timestamp' column that can serve as the merge key.
observations_df2 = observations_df.filter(
    items=['Temperature, C (observation)',
           'Wind speed, km/h (observation)',
           'Wind direction, degrees (observation)'],
    axis='columns',
).assign(Timestamp=observations_df.index.values)

# Pair observation rows with production rows where 'Timestamp' equals 'Zeitstempel'
training_df = pd.merge(
    observations_df2,
    results_df,
    left_on='Timestamp',
    right_on='Zeitstempel',
)

# Keep only the training columns; 'Zeitstempel' is dropped because it
# duplicates 'Timestamp' in the merged frame.
training_df = training_df.filter(
    items=['Timestamp',
           'Temperature, C (observation)',
           'Wind speed, km/h (observation)',
           'Wind direction, degrees (observation)',
           'Produktion [kW]'],
    axis='columns',
)
print("Preprocessing, step 2d, completed")
print("Final dimentions of the training set: ", training_df.shape)

# Release the references to the intermediate frames
observations_df2 = results_df = None
############################################
# Fractal calculations
############################################
# Create the serializer and invoke its calculate_fractal_dimensions() on the
# merged training frame built in step 2d.
fdms = FractalDimensionsSerializer.FractalDimensionsSerializer()
fdms.calculate_fractal_dimensions(training_df)
print("Finished fractal dimensions calculation for the training set ...")