diff --git a/ats/evaluators.py b/ats/evaluators.py index 6a354e0..3131576 100644 --- a/ats/evaluators.py +++ b/ats/evaluators.py @@ -47,11 +47,6 @@ def evaluate_anomaly_detector(evaluated_timeseries_df, anomaly_labels, details=F def _calculate_model_scores(single_model_evaluation={}): - dataset_anomalies = set() - for sample in single_model_evaluation.keys(): - sample_anomalies = set(single_model_evaluation[sample].keys()) - dataset_anomalies.update(sample_anomalies) - model_scores = {} anomalies_count = 0 false_positives_count = 0 @@ -73,9 +68,37 @@ def _calculate_model_scores(single_model_evaluation={}): model_scores['anomalies_ratio'] = None model_scores['false_positives_count'] = false_positives_count model_scores['false_positives_ratio'] = false_positives_ratio/len(single_model_evaluation) - return model_scores +def _get_breakdown_info(single_model_evaluation={}): + for sample in single_model_evaluation.keys(): + if 'anomalies_count' in single_model_evaluation[sample].keys(): + del single_model_evaluation[sample]['anomalies_count'] + if 'anomalies_ratio' in single_model_evaluation[sample].keys(): + del single_model_evaluation[sample]['anomalies_ratio'] + if 'false_positives_count' in single_model_evaluation[sample].keys(): + del single_model_evaluation[sample]['false_positives_count'] + if 'false_positives_ratio' in single_model_evaluation[sample].keys(): + del single_model_evaluation[sample]['false_positives_ratio'] + + breakdown_info = {} + # how many series in the dataset have that anomaly type + anomaly_series_count_by_type = {} + for sample, sample_evaluation in single_model_evaluation.items(): + for key in sample_evaluation.keys(): + if key in breakdown_info.keys(): + anomaly_series_count_by_type[key] +=1 + breakdown_info[key] += sample_evaluation[key] + else: + anomaly_series_count_by_type[key] =1 + breakdown_info[key] = sample_evaluation[key] + + for key in breakdown_info.keys(): + if '_ratio' in key: + breakdown_info[key] /= anomaly_series_count_by_type[key] + + return breakdown_info + class Evaluator(): def __init__(self,test_data): @@ -88,9 +111,10 @@ def _copy_dataset(self,dataset,models): dataset_copies.append(dataset_copy) return dataset_copies - def evaluate(self,models={},granularity='point',strategy='flags'): + def evaluate(self,models={},granularity='point',strategy='flags',breakdown=False): if strategy != 'flags': raise NotImplementedError(f'Evaluation strategy {strategy} is not implemented') + if not models: raise ValueError('There are no models to evaluate') if not self.test_data: @@ -112,15 +136,20 @@ def evaluate(self,models={},granularity='point',strategy='flags'): flagged_dataset = _get_model_output(dataset_copies[j],model) for i,sample_df in enumerate(flagged_dataset): if granularity == 'point': - single_model_evaluation[f'sample_{i+1}'] = _point_granularity_evaluation(sample_df,anomaly_labels_list[i]) + single_model_evaluation[f'sample_{i+1}'] = _point_granularity_evaluation(sample_df,anomaly_labels_list[i],breakdown=breakdown) elif granularity == 'variable': - single_model_evaluation[f'sample_{i+1}'] = _variable_granularity_evaluation(sample_df,anomaly_labels_list[i]) + single_model_evaluation[f'sample_{i+1}'] = _variable_granularity_evaluation(sample_df,anomaly_labels_list[i], breakdown = breakdown) elif granularity == 'series': - single_model_evaluation[f'sample_{i+1}'] = _series_granularity_evaluation(sample_df,anomaly_labels_list[i]) + single_model_evaluation[f'sample_{i+1}'] = _series_granularity_evaluation(sample_df,anomaly_labels_list[i], breakdown = breakdown) else: raise ValueError(f'Unknown granularity {granularity}') - - models_scores[model_name] = _calculate_model_scores(single_model_evaluation) + + if breakdown: + scores = _calculate_model_scores(single_model_evaluation) + breakdown_info = _get_breakdown_info(single_model_evaluation) + models_scores[model_name] = scores | breakdown_info + else: + models_scores[model_name] = _calculate_model_scores(single_model_evaluation) j+=1 return models_scores @@ -142,7 +171,7 @@ def _get_model_output(dataset,model): return flagged_dataset -def _variable_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df): +def _variable_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df,breakdown=False): one_series_evaluation_result = {} flag_columns_n = len(flagged_timeseries_df.filter(like='anomaly').columns) variables_n = len(flagged_timeseries_df.columns) - flag_columns_n @@ -152,7 +181,8 @@ def _variable_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df): total_inserted_anomalies_n = 0 total_detected_anomalies_n = 0 - detection_counts_by_anomaly_type = {} + breakdown_info = {} + false_positives_count = 0 for anomaly,frequency in anomaly_labels_df.value_counts(dropna=False).items(): if anomaly is not None: total_inserted_anomalies_n += frequency @@ -160,28 +190,36 @@ def _variable_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df): for timestamp in flagged_timeseries_df.index: if anomaly_labels_df[timestamp] == anomaly: for column in flagged_timeseries_df.filter(like='anomaly').columns: - anomaly_count += flagged_timeseries_df.loc[timestamp,column] + if anomaly is not None: + anomaly_count += flagged_timeseries_df.loc[timestamp,column] + else: + false_positives_count += flagged_timeseries_df.loc[timestamp,column] if anomaly is not None: total_detected_anomalies_n += anomaly_count - detection_counts_by_anomaly_type[anomaly] = anomaly_count + breakdown_info[anomaly + '_anomaly' + '_count'] = anomaly_count + breakdown_info[anomaly + '_anomaly' + '_ratio'] = anomaly_count/(frequency * variables_n) total_inserted_anomalies_n *= variables_n - one_series_evaluation_result['false_positives_count'] = detection_counts_by_anomaly_type.pop(None) - one_series_evaluation_result['false_positives_ratio'] = one_series_evaluation_result['false_positives_count']/normalization_factor + one_series_evaluation_result['false_positives_count'] = false_positives_count + one_series_evaluation_result['false_positives_ratio'] = false_positives_count/normalization_factor one_series_evaluation_result['anomalies_count'] = total_detected_anomalies_n if total_inserted_anomalies_n: one_series_evaluation_result['anomalies_ratio'] = total_detected_anomalies_n/total_inserted_anomalies_n else: one_series_evaluation_result['anomalies_ratio'] = None - return one_series_evaluation_result + if breakdown: + return one_series_evaluation_result | breakdown_info + else: + return one_series_evaluation_result -def _point_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df): +def _point_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df,breakdown=False): one_series_evaluation_result = {} normalization_factor = len(flagged_timeseries_df) total_inserted_anomalies_n = 0 total_detected_anomalies_n = 0 - detection_counts_by_anomaly_type = {} + breakdown_info = {} + false_positives_count = 0 for anomaly,frequency in anomaly_labels_df.value_counts(dropna=False).items(): if anomaly is not None: total_inserted_anomalies_n += frequency @@ -190,38 +228,54 @@ def _point_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df): if anomaly_labels_df[timestamp] == anomaly: for column in flagged_timeseries_df.filter(like='anomaly').columns: if flagged_timeseries_df.loc[timestamp,column]: - anomaly_count += 1 + if anomaly is not None: + anomaly_count += 1 + else: + false_positives_count += 1 break if anomaly is not None: total_detected_anomalies_n += anomaly_count - detection_counts_by_anomaly_type[anomaly] = anomaly_count - one_series_evaluation_result[anomaly] = anomaly_count / normalization_factor + breakdown_info[anomaly + '_anomaly_count'] = anomaly_count + breakdown_info[anomaly + '_anomaly_ratio'] = anomaly_count/frequency - one_series_evaluation_result['false_positives_count'] = detection_counts_by_anomaly_type.pop(None) - one_series_evaluation_result['false_positives_ratio'] = one_series_evaluation_result['false_positives_count']/normalization_factor + one_series_evaluation_result['false_positives_count'] = false_positives_count + one_series_evaluation_result['false_positives_ratio'] = false_positives_count/normalization_factor one_series_evaluation_result['anomalies_count'] = total_detected_anomalies_n if total_inserted_anomalies_n: one_series_evaluation_result['anomalies_ratio'] = total_detected_anomalies_n/total_inserted_anomalies_n else: one_series_evaluation_result['anomalies_ratio'] = None - return one_series_evaluation_result + if breakdown: + return one_series_evaluation_result | breakdown_info + else: + return one_series_evaluation_result -def _series_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df): +def _series_granularity_evaluation(flagged_timeseries_df,anomaly_labels_df,breakdown=False): anomalies = [] for anomaly,frequency in anomaly_labels_df.value_counts(dropna=False).items(): if anomaly is not None: anomalies.append(anomaly) + if len(anomalies) != 1 and breakdown: + raise ValueError('Series must have only 1 anomaly type for breakdown in mode granularity = "series"') one_series_evaluation_result = {} + breakdown_info = {} is_series_anomalous = 0 for timestamp in flagged_timeseries_df.index: for column in flagged_timeseries_df.filter(like='anomaly').columns: if flagged_timeseries_df.loc[timestamp,column]: is_series_anomalous = 1 + if anomalies: + inserted_anomaly = anomalies[0] + breakdown_info[inserted_anomaly + '_anomaly_count'] = 1 + breakdown_info[inserted_anomaly + '_anomaly_ratio'] = 1 break one_series_evaluation_result['false_positives_count'] = 1 if is_series_anomalous and not anomalies else 0 one_series_evaluation_result['false_positives_ratio'] = one_series_evaluation_result['false_positives_count'] one_series_evaluation_result['anomalies_count'] = 1 if is_series_anomalous and anomalies else 0 one_series_evaluation_result['anomalies_ratio'] = one_series_evaluation_result['anomalies_count'] if anomalies else None - return one_series_evaluation_result \ No newline at end of file + if breakdown: + return one_series_evaluation_result | breakdown_info + else: + return one_series_evaluation_result diff --git a/ats/tests/test_evaluators.py b/ats/tests/test_evaluators.py index e6a9de5..bf859ac 100644 --- a/ats/tests/test_evaluators.py +++ b/ats/tests/test_evaluators.py @@ -9,6 +9,7 @@ from ..evaluators import _variable_granularity_evaluation from ..evaluators import _point_granularity_evaluation from ..evaluators import _series_granularity_evaluation +from ..evaluators import _get_breakdown_info import unittest import pandas as pd import random as rnd @@ -375,6 +376,36 @@ def test_variable_granularity_evaluation(self): self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_count'],1) self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_ratio'],1/(7*2)) + def test_variable_granularity_evaluation_with_breakdown(self): + formatted_series,anomaly_labels = _format_for_anomaly_detector(self.series1) + minmax1 = MinMaxAnomalyDetector() + flagged_series = _get_model_output([formatted_series],minmax1) + evaluation_results = _variable_granularity_evaluation(flagged_series[0],anomaly_labels,breakdown=True) + + self.assertIn('anomalies_count',evaluation_results.keys()) + self.assertIn('anomalies_ratio',evaluation_results.keys()) + self.assertIn('false_positives_count',evaluation_results.keys()) + self.assertIn('false_positives_ratio',evaluation_results.keys()) + + self.assertIn('anomaly_1_anomaly_count',evaluation_results.keys()) + self.assertIn('anomaly_1_anomaly_ratio',evaluation_results.keys()) + self.assertIn('anomaly_2_anomaly_count',evaluation_results.keys()) + self.assertIn('anomaly_2_anomaly_ratio',evaluation_results.keys()) + + self.assertAlmostEqual(evaluation_results['anomaly_1_anomaly_count'],3) + self.assertAlmostEqual(evaluation_results['anomaly_1_anomaly_ratio'],3/4) + self.assertAlmostEqual(evaluation_results['anomaly_2_anomaly_count'],1) + self.assertAlmostEqual(evaluation_results['anomaly_2_anomaly_ratio'],1/2) + + formatted_series1,anomaly_labels1 = _format_for_anomaly_detector(self.series3) + flagged_series1 = _get_model_output([formatted_series1],minmax1) + evaluation_results1 = _variable_granularity_evaluation(flagged_series1[0],anomaly_labels1,breakdown=True) + + self.assertNotIn('anomaly_1_anomaly_count',evaluation_results1.keys()) + self.assertNotIn('anomaly_1_anomaly_ratio',evaluation_results1.keys()) + self.assertNotIn('anomaly_2_anomaly_count',evaluation_results1.keys()) + self.assertNotIn('anomaly_2_anomaly_ratio',evaluation_results1.keys()) + def test_point_granularity_evaluation(self): dataset = [self.series1] evaluator = Evaluator(test_data=dataset) @@ -401,6 +432,27 @@ def test_point_granularity_evaluation(self): self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_count'],1) self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_ratio'],1/7) + def test_point_granularity_evaluation_with_breakdown(self): + formatted_series,anomaly_labels = _format_for_anomaly_detector(self.series1) + minmax1 = MinMaxAnomalyDetector() + flagged_series = _get_model_output([formatted_series],minmax1) + evaluation_results = _point_granularity_evaluation(flagged_series[0],anomaly_labels,breakdown=True) + + self.assertIn('anomalies_count',evaluation_results.keys()) + self.assertIn('anomalies_ratio',evaluation_results.keys()) + self.assertIn('false_positives_count',evaluation_results.keys()) + self.assertIn('false_positives_ratio',evaluation_results.keys()) + + self.assertIn('anomaly_1_anomaly_count',evaluation_results.keys()) + self.assertIn('anomaly_1_anomaly_ratio',evaluation_results.keys()) + self.assertIn('anomaly_2_anomaly_count',evaluation_results.keys()) + self.assertIn('anomaly_2_anomaly_ratio',evaluation_results.keys()) + + self.assertAlmostEqual(evaluation_results['anomaly_1_anomaly_count'],2) + self.assertAlmostEqual(evaluation_results['anomaly_1_anomaly_ratio'],2/2) + self.assertAlmostEqual(evaluation_results['anomaly_2_anomaly_count'],1) + self.assertAlmostEqual(evaluation_results['anomaly_2_anomaly_ratio'],1/1) + def test_series_granularity_evaluation(self): dataset = [self.series1] evaluator = Evaluator(test_data=dataset) @@ -427,6 +479,144 @@ def test_series_granularity_evaluation(self): self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_count'],1) self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_ratio'],1) + def test_series_granularity_evaluation_with_breakdown(self): + series = generate_timeseries_df(entries=3, variables=2) + series['anomaly_label'] = [None,None,'anomaly_1'] + formatted_series,anomaly_labels = _format_for_anomaly_detector(series) + minmax1 = MinMaxAnomalyDetector() + flagged_series = _get_model_output([formatted_series],minmax1) + evaluation_results = _series_granularity_evaluation(flagged_series[0],anomaly_labels,breakdown=True) + + self.assertIn('anomalies_count',evaluation_results.keys()) + self.assertIn('anomalies_ratio',evaluation_results.keys()) + self.assertIn('false_positives_count',evaluation_results.keys()) + self.assertIn('false_positives_ratio',evaluation_results.keys()) + self.assertIn('anomaly_1_anomaly_count',evaluation_results.keys()) + self.assertIn('anomaly_1_anomaly_ratio',evaluation_results.keys()) + self.assertAlmostEqual(evaluation_results['anomaly_1_anomaly_count'],1) + self.assertAlmostEqual(evaluation_results['anomaly_1_anomaly_ratio'],1) + + formatted_series1,anomaly_labels1 = _format_for_anomaly_detector(self.series1) + flagged_series1 = _get_model_output([formatted_series1],minmax1) + try: + evaluation_results = _point_granularity_evaluation(flagged_series1[0],anomaly_labels1,breakdown=True) + except Exception as e: + self.assertIsInstance(e,ValueError) + + def test_get_breakdown_info(self): + single_model_evaluation = { 'sample_1': {'anomalies_count': 3, 'anomalies_ratio': 1.5, + 'false_positives_count': 1, + 'false_positives_ratio': 0.14, + 'spike_anomaly_count': 1, + 'spike_anomaly_ratio': 0.5}, + 'sample_2': {'anomalies_count': 3, 'anomalies_ratio': 1.5, + 'false_positives_count': 1, + 'false_positives_ratio': 0.14, + 'spike_anomaly_count': 1, + 'spike_anomaly_ratio': 0.5, + 'step_anomaly_count': 2, + 'step_anomaly_ratio': 2/3 + }, + 'sample_3': {'anomalies_count': 3, 'anomalies_ratio': 1.5, + 'false_positives_count': 1, + 'false_positives_ratio': 0.14, + 'step_anomaly_count': 3, + 'step_anomaly_ratio': 1, + 'pattern_anomaly_count': 2, + 'pattern_anomaly_ratio': 0.5 + } + } + breakdown = _get_breakdown_info(single_model_evaluation) + self.assertIn('spike_anomaly_count',breakdown.keys()) + self.assertIn('spike_anomaly_ratio',breakdown.keys()) + self.assertIn('step_anomaly_count',breakdown.keys()) + self.assertIn('step_anomaly_ratio',breakdown.keys()) + self.assertIn('pattern_anomaly_count',breakdown.keys()) + self.assertIn('pattern_anomaly_ratio',breakdown.keys()) + + self.assertAlmostEqual(breakdown['spike_anomaly_count'],2) + self.assertAlmostEqual(breakdown['spike_anomaly_ratio'],1/2) + self.assertAlmostEqual(breakdown['step_anomaly_count'],5) + self.assertAlmostEqual(breakdown['step_anomaly_ratio'],5/6) + self.assertAlmostEqual(breakdown['pattern_anomaly_count'],2) + self.assertAlmostEqual(breakdown['pattern_anomaly_ratio'],0.5) + + def test_variable_granularity_eval_with_breakdown(self): + dataset = [self.series1, self.series2, self.series3] + minmax1 = MinMaxAnomalyDetector() + minmax2 = MinMaxAnomalyDetector() + minmax3 = MinMaxAnomalyDetector() + models={'detector_1': minmax1, + 'detector_2': minmax2, + 'detector_3': minmax3 + } + evaluator = Evaluator(test_data=dataset) + evaluation_results = evaluator.evaluate(models=models,granularity='variable',breakdown=True) + self.assertAlmostEqual(evaluation_results['detector_1']['anomalies_count'],7) + self.assertAlmostEqual(evaluation_results['detector_1']['anomalies_ratio'],25/48) + self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_count'],5) + self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_ratio'],31/126) + + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_1_anomaly_count'],5) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_1_anomaly_ratio'],13/24) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_2_anomaly_count'],2) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_2_anomaly_ratio'],1/2) + + def test_point_granularity_eval_with_breakdown(self): + dataset = [self.series1, self.series2, self.series3] + minmax1 = MinMaxAnomalyDetector() + minmax2 = MinMaxAnomalyDetector() + minmax3 = MinMaxAnomalyDetector() + models={'detector_1': minmax1, + 'detector_2': minmax2, + 'detector_3': minmax3 + } + evaluator = Evaluator(test_data=dataset) + evaluation_results = evaluator.evaluate(models=models,granularity='point',breakdown=True) + self.assertAlmostEqual(evaluation_results['detector_1']['anomalies_count'],6) + self.assertAlmostEqual(evaluation_results['detector_1']['anomalies_ratio'],7/8) + self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_count'],4) + self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_ratio'],8/21) + + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_1_anomaly_count'],4) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_1_anomaly_ratio'],5/6) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_2_anomaly_count'],2) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_2_anomaly_ratio'],1) + + def test_series_granularity_eval_with_breakdown(self): + series_1 = generate_timeseries_df(entries=3, variables=2) + series_1['anomaly_label'] = [None,None,'anomaly_1'] + series_2 = generate_timeseries_df(entries=3, variables=2) + series_2['anomaly_label'] = ['anomaly_1',None,None] + series_3 = generate_timeseries_df(entries=3, variables=2) + series_3['anomaly_label'] = [None,'anomaly_2',None] + dataset = [series_1, series_2, series_3] + minmax1 = MinMaxAnomalyDetector() + minmax2 = MinMaxAnomalyDetector() + minmax3 = MinMaxAnomalyDetector() + models={'detector_1': minmax1, + 'detector_2': minmax2, + 'detector_3': minmax3 + } + evaluator = Evaluator(test_data=dataset) + evaluation_results = evaluator.evaluate(models=models,granularity='series',breakdown=True) + self.assertAlmostEqual(evaluation_results['detector_1']['anomalies_count'],3) + self.assertAlmostEqual(evaluation_results['detector_1']['anomalies_ratio'],1) + self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_count'],0) + self.assertAlmostEqual(evaluation_results['detector_1']['false_positives_ratio'],0) + + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_1_anomaly_count'],2) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_1_anomaly_ratio'],1) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_2_anomaly_count'],1) + self.assertAlmostEqual(evaluation_results['detector_1']['anomaly_2_anomaly_ratio'],1) + + try: + dataset = [self.series1, self.series2, self.series3] + evaluator = Evaluator(test_data=dataset) + evaluation_results = evaluator.evaluate(models=models,granularity='series',breakdown=True) + except Exception as e: + self.assertIsInstance(e,ValueError) + def test_double_evaluator(self): anomalies = ['step_uv'] effects = []