@@ -226,8 +226,12 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
226
226
if column_name not in self .categorical_column_names :
227
227
self .categorical_column_names .append (column_name )
228
228
229
- # get timestamp column from chunker incase the calculator is initialized with a chunker without directly
230
- # been provided the timestamp column name
229
+ # Get timestamp column from chunker in case the calculator is initialized with a chunker without directly
230
+ # being provided the timestamp column name.
231
+ #
232
+ # The reference data will be sorted according to the timestamp column (when available) to mimic
233
+ # Chunker behavior. This means the reference data will be "aligned" with chunked reference data.
234
+ # This way we can use chunk indices on the internal reference data copy.
231
235
if self .chunker .timestamp_column_name :
232
236
if self .chunker .timestamp_column_name not in list (reference_data .columns ):
233
237
raise InvalidArgumentsException (
@@ -293,7 +297,6 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
293
297
return self .result
294
298
295
299
def _calculate_chunk (self , chunk : Chunk ):
296
-
297
300
if self ._is_fitted :
298
301
chunk_X = chunk .data [self .feature_column_names ]
299
302
reference_X = self ._reference_X
@@ -302,7 +305,10 @@ def _calculate_chunk(self, chunk: Chunk):
302
305
X = pd .concat ([reference_X , chunk_X ], ignore_index = True )
303
306
y = np .concatenate ([reference_y , chunk_y ])
304
307
else :
305
- # Use information from chunk indices to identify reference chunk's location
308
+ # Use information from chunk indices to identify reference chunk's location. This is possible because
309
+ # both the internal reference data copy and the chunk data were sorted by timestamp, so these
310
+ # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
311
+ # which is a costly operation.
306
312
X = self ._reference_X
307
313
y = np .zeros (len (X ))
308
314
y [chunk .start_index : chunk .end_index + 1 ] = 1
0 commit comments