Skip to content

Commit 3aaed3d

Browse files
committed
Added some comments on the workings
1 parent a2520f9 commit 3aaed3d

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

nannyml/drift/multivariate/domain_classifier/calculator.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,12 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
226226
if column_name not in self.categorical_column_names:
227227
self.categorical_column_names.append(column_name)
228228

229-
# get timestamp column from chunker incase the calculator is initialized with a chunker without directly
230-
# been provided the timestamp column name
229+
# Get timestamp column from chunker in case the calculator is initialized with a chunker without directly
230+
# being provided the timestamp column name.
231+
#
232+
# The reference data will be sorted according to the timestamp column (when available) to mimic
233+
# Chunker behavior. This means the reference data will be "aligned" with chunked reference data.
234+
# This way we can use chunk indices on the internal reference data copy.
231235
if self.chunker.timestamp_column_name:
232236
if self.chunker.timestamp_column_name not in list(reference_data.columns):
233237
raise InvalidArgumentsException(
@@ -293,7 +297,6 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
293297
return self.result
294298

295299
def _calculate_chunk(self, chunk: Chunk):
296-
297300
if self._is_fitted:
298301
chunk_X = chunk.data[self.feature_column_names]
299302
reference_X = self._reference_X
@@ -302,7 +305,10 @@ def _calculate_chunk(self, chunk: Chunk):
302305
X = pd.concat([reference_X, chunk_X], ignore_index=True)
303306
y = np.concatenate([reference_y, chunk_y])
304307
else:
305-
# Use information from chunk indices to identify reference chunk's location
308+
# Use information from chunk indices to identify reference chunk's location. This is possible because
309+
# both the internal reference data copy and the chunk data were sorted by timestamp, so these
310+
# indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
311+
# which is a costly operation.
306312
X = self._reference_X
307313
y = np.zeros(len(X))
308314
y[chunk.start_index : chunk.end_index + 1] = 1

0 commit comments

Comments
 (0)