-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathodc.py
774 lines (648 loc) · 31.8 KB
/
odc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
"""
Author: Ashraf Alaghbari
ODC is a Python library for detecting and treating outliers in time-series data.
It can be used to clean various types of variables, including on-stream hours for
production and injection data, production rates, injection rates, average choke size,
reservoir pressure and temperature, and wellhead temperature.
ODC is built on top of the Pandas library and is compatible with NumPy and Matplotlib.
"""
"""Typing module for variable type annotations."""
from typing import Union, Tuple, Iterator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
class DetectOutliers:
    """
    Identify, visualize, and treat outliers in oil-well time series data.

    The class works on a private copy of the input DataFrame (``self.data``).
    The detection methods return the flagged observations, and the treatment
    methods modify ``self.data`` in place and return the cleaned column.

    Four kinds of variables are supported:
    - `on_stream_var`: on-stream hours of a production or injection well.
    - `rate_var`: a production rate (oil, water, gas) or an injection rate.
    - `avg_choke_var`: the average choke size controlling the production rate.
    - roc: any variable that is expected to change gradually (e.g. downhole
      pressure/temperature, wellhead temperature); handled by
      `detect_outliers_with_roc`, which flags points that leave a band of
      ``window mean +/- rate of change`` inside each window.

    Every detection method returns either a pandas Series of outliers or the
    string "No outliers detected." when nothing was flagged.

    Parameters:
    ----------
    df (pd.DataFrame): The input DataFrame that contains the time series data
        to be cleaned. Verbose output of the rate-of-change detection formats
        the index labels as dates when they support ``strftime``.

    Attributes:
    ----------
    data (pd.DataFrame): A copy of the input data.
    _stored_vars (dict): Column names registered by the detect_* methods and
        reused by the parameter-less treat_* methods.
    """

    # Message returned by the detect_* methods when nothing is flagged.
    _NO_OUTLIERS_MSG = "No outliers detected."
    # A well cannot be on stream for more hours than a day has.
    _MAX_ON_STREAM_HOURS = 24

    def __init__(self, df: pd.DataFrame) -> None:
        # Work on a copy so the caller's DataFrame is never modified.
        self.data = df.copy()
        # Column names remembered between a detect_* and a treat_* call.
        self._stored_vars = {}

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    def _get_stored_var(self, key: str) -> str:
        """Return a column name registered by a detect_* method.

        Raises:
        ------
        KeyError: If no detect_* method has registered `key` yet.
        """
        try:
            return self._stored_vars[key]
        except KeyError:
            raise KeyError(
                f"'{key}' has not been registered; call the matching "
                "detect_outliers_* method before treating outliers."
            ) from None

    @classmethod
    def _finalize_outliers(
        cls,
        outliers: pd.Series,
        var_name,
        verbose: bool
    ) -> Union[pd.Series, str]:
        """Apply the common tail of every detect_* method.

        Returns the "no outliers" message for an empty result; otherwise
        optionally prints the variable name and returns the outliers sorted
        by index.
        """
        if outliers.empty:
            return cls._NO_OUTLIERS_MSG
        if verbose:
            print(f'Variable: {var_name}')
        return outliers.sort_index()

    @staticmethod
    def _combine_outliers(pieces: list) -> pd.Series:
        """Concatenate the non-empty Series in `pieces` in a single call.

        Empty pieces are skipped; if every piece is empty, an empty slice of
        the first piece is returned so its name and dtype are kept.
        """
        non_empty = [piece for piece in pieces if not piece.empty]
        if not non_empty:
            return pieces[0].iloc[0:0]
        return pd.concat(non_empty)

    @staticmethod
    def _format_label(label) -> str:
        """Format an index label for verbose output (dates as YYYY-MM-DD)."""
        if hasattr(label, 'strftime'):
            return label.strftime('%Y-%m-%d')
        return str(label)

    @staticmethod
    def _count_windows(length: int, window_size: int) -> int:
        """Return how many windows the windowing loop produces.

        A trailing remainder shorter than `window_size` is merged into the
        last window, so a non-empty series always yields at least one window.
        """
        if length == 0:
            return 0
        return max(1, length // window_size)

    # ------------------------------------------------------------------
    # Visualization
    # ------------------------------------------------------------------
    def plot_outliers(
        self,
        outliers: Union[pd.Series, str],
        color: str = 'red',
        alpha: float = 0.3,
        legend_loc: str = 'lower right'
    ) -> None:
        """
        Visualizes the detected outliers on top of the original series.

        Parameters:
        ----------
        `outliers (pd.Series)`: the outliers to mark; its `name` selects the
            column of `self.data` that is plotted. If the "no outliers"
            string returned by a detect_* method is passed instead, a message
            is printed and nothing is plotted.
        `color (str)`: color of the markers (default: 'red')
        `alpha (float)`: opacity of the markers (default: 0.3)
        `legend_loc (str)`: location of the legend (default: 'lower right')

        Returns:
        -------
        `None`
        """
        if isinstance(outliers, str):
            print("No outliers detected.")
            return

        observed = self.data[outliers.name]
        _, axis = plt.subplots(figsize=(18, 6))
        # Original time series
        axis.plot(observed.index,
                  observed.values,
                  label=outliers.name,
                  color='black')
        # One vertical line per outlier, spanning the full value range
        axis.vlines(outliers.index,
                    ymin=observed.min(),
                    ymax=observed.max(),
                    color=color,
                    alpha=alpha,
                    label='Outliers')
        axis.set_xlabel('Date')
        axis.set_ylabel('Value')
        axis.set_title(f'Outliers Detected: {len(outliers)}')
        axis.legend(loc=legend_loc)
        plt.show()

    # ------------------------------------------------------------------
    # Rule-based detection
    # ------------------------------------------------------------------
    def detect_outliers_in_time(
        self,
        on_stream_var: str,
        rate_var: str,
        verbose: bool = False
    ) -> Union[pd.Series, str]:
        """
        Detects outliers in the on-stream hours of a production or
        injection well.

        An observation is flagged when the on-stream hours exceed 24, or when
        the hours are positive although the corresponding rate is zero. A row
        that violates both rules is reported once.

        Parameters:
        ----------
        - `on_stream_var (str)`: The name of the column that holds the
          on-stream hours.
        - `rate_var (str)`: The name of the corresponding rate column (e.g.
          oil rate for production hours, injected water volume for injection
          hours).
        - `verbose (bool)`: If True, prints the name of the variable being
          analyzed (only when outliers were found).

        Returns:
        -------
        - a pandas Series containing the flagged on-stream hours, sorted by
          index, or
        - the string "No outliers detected."
        """
        # Remember the columns for treat_outliers_in_time
        self._stored_vars['on_stream_var'] = on_stream_var
        self._stored_vars['rate_var'] = rate_var

        hours = self.data[on_stream_var]
        over_cap = hours > self._MAX_ON_STREAM_HOURS
        # A zero rate means the well was off, so the hours should be zero too
        running_without_rate = (self.data[rate_var] == 0) & (hours > 0)
        # OR-ing the masks keeps a row that breaks both rules from appearing twice
        outliers = self.data.loc[over_cap | running_without_rate, on_stream_var]
        return self._finalize_outliers(outliers, on_stream_var, verbose)

    def detect_outliers_in_rate(
        self,
        rate_var: str,
        on_stream_var: str,
        verbose: bool = False
    ) -> Union[pd.Series, str]:
        """
        Detects outliers in production or injection rates.

        A rate is flagged when it is positive although the corresponding
        on-stream hours are zero.

        Parameters:
        ----------
        `rate_var (str)`: The name of the production or injection rate column.
        `on_stream_var (str)`: The name of the corresponding on-stream hours
            column.
        `verbose (bool)`: If True, prints the name of the variable being
            analyzed (only when outliers were found).

        Returns:
        -------
        `pd.Series`: The flagged rates, sorted by index, or
        `str`: the string "No outliers detected."
        """
        # Remember the columns for treat_outliers_in_rate
        self._stored_vars['on_stream_var'] = on_stream_var
        self._stored_vars['rate_var'] = rate_var

        mask = (self.data[on_stream_var] == 0) & (self.data[rate_var] > 0)
        outliers = self.data.loc[mask, rate_var]
        return self._finalize_outliers(outliers, rate_var, verbose)

    def detect_outliers_in_choke(
        self,
        avg_choke_var: str,
        on_stream_var: str,
        verbose: bool = False
    ) -> Union[pd.Series, str]:
        """
        Detects outliers in the average choke size variable.

        A choke size is flagged when it is positive although the
        corresponding on-stream hours are zero.

        Parameters:
        ----------
        `avg_choke_var (str)`: The name of the average choke size column.
        `on_stream_var (str)`: The name of the on-stream hours column.
        `verbose (bool)`: If True, prints the name of the variable being
            analyzed (only when outliers were found).

        Returns:
        -------
        `pd.Series`: The flagged choke sizes, sorted by index, or
        `str`: the string "No outliers detected."
        """
        # Remember the columns for treat_outliers_in_choke
        self._stored_vars['avg_choke_var'] = avg_choke_var
        self._stored_vars['on_stream_var'] = on_stream_var

        mask = (self.data[on_stream_var] == 0) & (self.data[avg_choke_var] > 0)
        outliers = self.data.loc[mask, avg_choke_var]
        return self._finalize_outliers(outliers, avg_choke_var, verbose)

    # ------------------------------------------------------------------
    # Rate-of-change detection (private building blocks)
    # ------------------------------------------------------------------
    def __remove_extreme_outliers(
        self,
        series: pd.Series,
        thd_z_score: Union[int, float] = 2
    ) -> Tuple[pd.Series, pd.Series]:
        """
        Removes extreme outliers from a pandas Series object.

        Extreme outliers are zeros and values whose absolute Z-score exceeds
        `thd_z_score`. For example, for
        Downhole_temperature = [200, 201, 200, 204, 199, 160, 0, 202]
        the values 0 and 160 are the kind of points this step targets.
        Zeros are excluded before the mean and standard deviation are
        computed, so they do not distort the Z-scores of the other points.

        Parameters:
        ----------
        `series (pd.Series)`: The input Series; it is not modified.
        `thd_z_score (Union[int, float])`: Z-score threshold above which a
            point is considered extreme. Defaults to 2.

        Returns:
        -------
        `Tuple[pd.Series, pd.Series]`:
            - The input Series with extreme outliers replaced by NaN.
            - The extreme outliers that were removed (zeros first).
        """
        is_zero = series == 0
        zeros = series[is_zero]
        # mask() returns a new Series and upcasts integer data as needed,
        # unlike assigning NaN into an integer Series in place.
        filtered = series.mask(is_zero)

        z_score = (filtered - filtered.mean()) / filtered.std()
        # Comparisons with NaN are False, so removed points are not re-flagged
        is_extreme = z_score.abs() > thd_z_score

        outliers = self._combine_outliers([zeros, filtered[is_extreme]])
        return filtered.mask(is_extreme), outliers

    def __get_window_mean(
        self,
        series: pd.Series,
        window_size: int,
        i: int,
        verbose: bool = False
    ) -> Tuple[Union[float, int], pd.Series, int]:
        """
        Selects the window starting at position `i` and returns its mean.

        If at least two full windows remain (``i + 2 * window_size`` does not
        exceed the series length), the window holds exactly `window_size`
        points. Otherwise all remaining points form the last window, i.e. a
        short remainder is merged into the final window.

        Parameters:
        ----------
        `series (pd.Series)`: The Series containing the data points.
        `window_size (int)`: The number of data points per window.
        `i (int)`: The position of the first data point of the window.
        `verbose (bool)`: If True, prints the first and last index label of
            the window (without a trailing newline; the caller completes
            the line).

        Returns:
        -------
        `Tuple[Union[float, int], pd.Series, int]`:
            - The mean value of the window.
            - The window itself.
            - The position of the first data point after the window.
        """
        if i + 2 * window_size <= len(series):
            stop = i + window_size
        else:
            stop = len(series)
        window = series.iloc[i:stop]

        if verbose:
            # Labels come from the window itself, so the printed segment end
            # is the window's last point rather than the next window's first.
            start_date = self._format_label(window.index[0])
            end_date = self._format_label(window.index[-1])
            print(f"The rate of change for segment [{start_date}, {end_date}]", end='')

        return window.mean(), window, stop

    def __get_window_outliers(
        self,
        window: pd.Series,
        mean: Union[float, int],
        rate_of_change_window: Union[float, int]
    ) -> pd.Series:
        """
        Returns the points of `window` that lie outside
        ``[mean - rate_of_change_window, mean + rate_of_change_window]``.

        Parameters:
        ----------
        `window (pd.Series)`: The window of data points.
        `mean (Union[float, int])`: The mean value of the window.
        `rate_of_change_window (Union[float, int])`: Half-width of the
            accepted band around the mean.

        Returns:
        -------
        `pd.Series`: The outliers of the input window. NaN points are never
            flagged because comparisons with NaN are False.
        """
        lower_bound = mean - rate_of_change_window
        upper_bound = mean + rate_of_change_window
        return window.loc[(window < lower_bound) | (window > upper_bound)]

    def __define_roc_with_quantile(
        self,
        series: pd.Series,
        outliers: pd.Series,
        window_size: int,
        thd_quantile: Union[int, float] = .98,
        verbose: bool = False
    ) -> pd.Series:
        """
        Detects outliers using a rate of change derived from the data.

        For every window, the rate of change is the `thd_quantile` quantile
        of the absolute differences between consecutive points (NaN
        differences are ignored). Points outside ``window mean +/- rate of
        change`` are flagged. Useful when the user does not know a suitable
        rate of change for each window.

        Parameters:
        ----------
        `series (pd.Series)`: The Series containing the data points.
        `outliers (pd.Series)`: Outliers found earlier by
            __remove_extreme_outliers; they are placed first in the result.
        `window_size (int)`: The number of data points per window.
        `thd_quantile (Union[int, float])`: The quantile used to derive the
            rate of change. Defaults to 0.98.
        `verbose (bool)`: If True, prints the rate of change and mean of
            every window plus a summary of all rates.

        Returns:
        -------
        `pd.Series`: The extreme outliers followed by the outliers identified
            by the quantile rate of change.
        """
        rates = []
        pieces = [outliers]
        i = 0
        while i < len(series):
            mean, window, i = self.__get_window_mean(series, window_size, i, verbose)
            steps = np.abs(np.diff(window))
            steps = steps[~np.isnan(steps)]
            # A window without any usable difference (single point or all
            # NaN) gets a NaN rate, which flags nothing.
            if steps.size:
                rate_of_change_window = np.nanquantile(steps, thd_quantile)
            else:
                rate_of_change_window = np.nan
            rates.append(rate_of_change_window)
            if verbose:
                print(f' is {np.round(rate_of_change_window, 2)}'
                      f' with a mean of {np.round(mean, 2)}')
            pieces.append(self.__get_window_outliers(window,
                                                     mean,
                                                     rate_of_change_window))
        if verbose:
            print(f'Rate of change for each window of size {window_size}'
                  f' is: {[round(x, 2) for x in rates]}')
        # One concat at the end instead of one per window
        return self._combine_outliers(pieces)

    def __validate_roc_input(
        self,
        rate_of_change: Union[list, np.ndarray],
        num_windows: int,
        all_same_rate: bool
    ) -> Iterator:
        """
        Validates the user-defined rate of change and returns an iterator
        with one rate per window.

        Parameters:
        ----------
        `rate_of_change (Union[list, np.ndarray])`: The rate of change values.
        `num_windows (int)`: The number of windows.
        `all_same_rate (bool)`: Whether a single value is used for all
            windows.

        Returns:
        -------
        Iterator: An iterator over `num_windows` rate of change values.

        Raises:
        ------
        ValueError: If rate_of_change is not a list or an array.
        ValueError: If all_same_rate is True and rate_of_change does not
            contain exactly one value.
        ValueError: If all_same_rate is False and the length of
            rate_of_change is not equal to num_windows.
        """
        if not isinstance(rate_of_change, (list, np.ndarray)):
            raise ValueError("rate_of_change must be a list or an array.")
        if all_same_rate:
            if len(rate_of_change) != 1:
                raise ValueError('When all_same_rate is True, rate_of_change must be a'
                                 ' list or an array with a single value.')
            return iter([rate_of_change[0]] * num_windows)
        if len(rate_of_change) != num_windows:
            raise ValueError(f"Length of rate_of_change ({len(rate_of_change)}) "
                             f"does not match the number of windows ({num_windows}).")
        return iter(rate_of_change)

    def __define_roc_manually(
        self,
        series: pd.Series,
        outliers: pd.Series,
        window_size: int,
        num_windows: int,
        rate_of_change: Union[list, np.ndarray],
        all_same_rate: bool = False,
        verbose: bool = False
    ) -> pd.Series:
        """
        Detects outliers using a rate of change defined by the user.

        The input is validated first; then, for every window, the points
        outside ``window mean +/- rate of change`` are flagged.

        Parameters:
        ----------
        `series (pd.Series)`: The Series containing the data points.
        `outliers (pd.Series)`: Outliers found earlier by
            __remove_extreme_outliers; they are placed first in the result.
        `window_size (int)`: The number of data points per window.
        `num_windows (int)`: The number of windows in the series.
        `rate_of_change (Union[list, np.ndarray])`: Rate of change values. A
            single value when all_same_rate is True, otherwise one value per
            window.
        `all_same_rate (bool)`: If True, the single value is used for all
            windows. Default is False.
        `verbose (bool)`: If True, prints the rate of change and mean of
            every window. Default is False.

        Returns:
        -------
        `pd.Series`: The extreme outliers followed by the outliers that fall
            outside the user-defined band.

        Raises:
        ------
        ValueError: If rate_of_change fails validation (see
            __validate_roc_input).
        """
        rate_of_change_iter = self.__validate_roc_input(rate_of_change,
                                                        num_windows,
                                                        all_same_rate)
        pieces = [outliers]
        i = 0
        while i < len(series):
            mean, window, i = self.__get_window_mean(series, window_size, i, verbose)
            rate_of_change_window = next(rate_of_change_iter)
            if verbose:
                print(f' is {np.round(rate_of_change_window, 2)} with a mean of {np.round(mean,2)}')
            pieces.append(self.__get_window_outliers(window, mean, rate_of_change_window))
        return self._combine_outliers(pieces)

    # ------------------------------------------------------------------
    # Rate-of-change detection (public entry point)
    # ------------------------------------------------------------------
    def detect_outliers_with_roc(
        self,
        series: str,
        window_size: int,
        thd_z_score: Union[int, float] = 2,
        thd_quantile: Union[float, int] = 0.98,
        rate_of_change: Union[list, np.ndarray, None] = None,
        all_same_rate: bool = False,
        verbose: bool = False
    ) -> Union[pd.Series, str]:
        """
        Detects outliers in gradually changing variables (e.g. downhole
        pressure, downhole temperature, or wellhead temperature).

        Zeros and points with an extreme Z-score are removed first. The
        remaining data is split into windows, and points outside
        ``window mean +/- rate of change`` are flagged. The rate of change
        can be chosen in three ways:
        - one value per window (`rate_of_change` with one entry per window),
        - one value for all windows (`rate_of_change` with a single entry and
          `all_same_rate=True`),
        - derived from the data with a quantile (`rate_of_change=None`).

        The number of windows is ``len(data) // window_size`` (at least 1 for
        non-empty data); a trailing remainder is merged into the last window.

        Parameters:
        ----------
        `series (str)`: The name of the column to be analyzed.
        `window_size (int)`: The number of data points per window; must be a
            positive integer.
        `thd_z_score (Union[int, float])`: The Z-score threshold used to
            remove extreme outliers. Default is 2.
        `thd_quantile (Union[float, int])`: The quantile used to derive the
            rate of change when `rate_of_change` is None. Default is 0.98.
        `rate_of_change (Optional[Union[list, np.ndarray]])`: User-defined
            rate of change values. If None, the quantile method is used.
        `all_same_rate (bool)`: If True, `rate_of_change` must hold exactly
            one value, which is used for all windows. Default is False.
        `verbose (bool)`: If True, prints the number of windows and the rate
            of change of each window. Default is False.

        Returns:
        ----------
        `pd.Series`: The extreme outliers and the outliers outside the
            rate-of-change band, sorted by index, or
        `str`: the string "No outliers detected."

        Raises:
        ------
        TypeError: If the selected column is not a pandas Series, if
            window_size is not an integer, or if all_same_rate is True while
            rate_of_change is None.
        ValueError: If window_size is not positive or rate_of_change fails
            validation.
        """
        column = self.data[series]
        if not isinstance(column, pd.Series):
            raise TypeError(f"Input 'series' must be a pandas Series object, not {type(column)}")

        # A non-positive window size would make the window loops spin forever
        # (or divide by zero), so reject it up front.
        if isinstance(window_size, bool) or not isinstance(window_size, (int, np.integer)):
            raise TypeError("window_size must be an integer.")
        if window_size < 1:
            raise ValueError("window_size must be a positive integer.")

        if all_same_rate and rate_of_change is None:
            raise TypeError("If all rates of change are the same, "
                            "the rate_of_change argument must be provided")

        # Must agree with the windows __get_window_mean actually produces
        num_windows = self._count_windows(len(column), window_size)
        if verbose:
            print(f'Number of intervals: {num_windows}')

        extreme_filtered_series, outliers = self.__remove_extreme_outliers(column, thd_z_score)

        if rate_of_change is None:
            outliers = self.__define_roc_with_quantile(extreme_filtered_series,
                                                       outliers,
                                                       window_size,
                                                       thd_quantile,
                                                       verbose)
        else:
            outliers = self.__define_roc_manually(extreme_filtered_series,
                                                  outliers,
                                                  window_size,
                                                  num_windows,
                                                  rate_of_change,
                                                  all_same_rate,
                                                  verbose)
        return self._finalize_outliers(outliers, column.name, verbose)

    # ------------------------------------------------------------------
    # Treatment
    # ------------------------------------------------------------------
    def treat_outliers_in_time(
        self,
    ) -> pd.Series:
        """
        Treats outliers in the on-stream hours variable by capping the
        maximum value at 24 hours and setting the hours to zero where the
        rate is zero.

        Uses the columns registered by the last call to
        `detect_outliers_in_time` (or another detect_* method that stores
        both names).

        Returns:
        ----------
        `pd.Series`: The treated on-stream hours column of `self.data`.

        Raises:
        ------
        KeyError: If the required column names were not registered yet.
        """
        on_stream_var = self._get_stored_var('on_stream_var')
        rate_var = self._get_stored_var('rate_var')

        cap = self._MAX_ON_STREAM_HOURS
        self.data.loc[self.data[on_stream_var] > cap, on_stream_var] = cap
        # A zero rate means the well was off, so the hours must be zero too
        mask = (self.data[rate_var] == 0) & (self.data[on_stream_var] > 0)
        self.data.loc[mask, on_stream_var] = 0
        return self.data[on_stream_var]

    def treat_outliers_in_rate(
        self,
    ) -> pd.Series:
        """
        Treats outliers in the rate variable by setting the rate to zero
        where the on-stream hours are zero.

        Uses the columns registered by the last detect_* call that stored
        the rate and on-stream column names.

        Returns:
        ----------
        `pd.Series`: The treated rate column of `self.data`.

        Raises:
        ------
        KeyError: If the required column names were not registered yet.
        """
        rate_var = self._get_stored_var('rate_var')
        on_stream_var = self._get_stored_var('on_stream_var')

        mask = (self.data[on_stream_var] == 0) & (self.data[rate_var] > 0)
        self.data.loc[mask, rate_var] = 0
        return self.data[rate_var]

    def treat_outliers_with_roc(
        self,
        roc_outliers: pd.Series,
        intp_method: str = 'linear'
    ) -> pd.Series:
        """
        Treats outliers in a time series variable by replacing them with
        interpolated values.

        Parameters:
        ----------
        `roc_outliers (pd.Series)`: The outliers returned by
            `detect_outliers_with_roc`. Its `name` selects the column of
            `self.data` and its index selects the rows to replace. The
            "no outliers" string is not accepted here.
        `intp_method (str)`: The interpolation method passed to
            `Series.interpolate`. Default is 'linear'.

        Returns:
        ----------
        `pd.Series`: The column of `self.data` with the outliers
            interpolated (leading/trailing gaps are filled in both
            directions).
        """
        var = roc_outliers.name
        # Integer columns cannot hold NaN; convert explicitly instead of
        # relying on an implicit upcast during assignment.
        if pd.api.types.is_integer_dtype(self.data[var]):
            self.data[var] = self.data[var].astype(float)
        # Single .loc assignment: chained indexing (data[var][idx] = ...)
        # may write to a temporary copy and leave self.data unchanged.
        self.data.loc[roc_outliers.index, var] = np.nan
        self.data[var] = self.data[var].interpolate(method=intp_method,
                                                    limit_direction='both')
        return self.data[var]

    def treat_outliers_in_choke(
        self,
    ) -> pd.Series:
        """
        Cleans outliers in the average choke variable by setting the
        average choke to zero where the on-stream hours are zero.

        Uses the columns registered by the last call to
        `detect_outliers_in_choke`.

        Returns:
        ----------
        `pd.Series`: The cleaned average choke column of `self.data`.

        Raises:
        ------
        KeyError: If the required column names were not registered yet.
        """
        avg_choke_var = self._get_stored_var('avg_choke_var')
        on_stream_var = self._get_stored_var('on_stream_var')

        mask = (self.data[on_stream_var] == 0) & (self.data[avg_choke_var] > 0)
        self.data.loc[mask, avg_choke_var] = 0
        return self.data[avg_choke_var]