diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml index b88ee0d..0a1d340 100644 --- a/.github/workflows/pylint.yaml +++ b/.github/workflows/pylint.yaml @@ -28,4 +28,4 @@ jobs: - name: Analysing the code with pylint run: | - pylint -d C0103 -d R0912 $(git ls-files '*/*.py' '*.py') \ No newline at end of file + pylint -d C0103 -d R0912 -d R0917 $(git ls-files '*/*.py' '*.py') \ No newline at end of file diff --git a/README.md b/README.md index f6e9d02..22e144a 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,11 @@ Additionally, users can specify a custom path for the output CSV file using the Orion now supports anomaly detection for your data. Use the ```--anomaly-detection``` command to start the anomaly detection process. + +To find significant percent differences between workload runs, use the ```--cmr``` option. It compares the most recent run with any previous matching runs or baseline UUIDs. If more than one previous run is found, their values are averaged together and the average is compared with the most recent run. Use it with *direction: 0* (set in the config) when using the ```-o json``` format to see the percent differences. + +![cmr percent difference](percentdiff.jpg) + You can now constrain your look-back period using the ```--lookback``` option. The format for look-back is ```XdYh```, where X represents the number of days and Y represents the number of hours. To specify how many runs to look back, you can use the ```--lookback-size``` option. By default, this option is set to 10000. @@ -156,7 +161,7 @@ This is similar to how car manufacturers warranty plays out such as 5years or 60 You can open the match requirement by using the ```--node-count``` option to find any matching uuid based on the metadata and not have to have the same jobConfig.jobIterations. This variable is a ```True``` or ```False```, defaulted to False. -**_NOTE:_** The ```--hunter-analyze``` and ```--anomaly-detection``` flags are mutually exclusive. 
They cannot be used together because they represent different algorithms designed for distinct use cases. +**_NOTE:_** The ```--cmr```, ```--hunter-analyze``` and ```--anomaly-detection``` flags are mutually exclusive. They cannot be used together because they represent different algorithms designed for distinct use cases. ### Daemon mode The core purpose of Daemon mode is to operate Orion as a self-contained server, dedicated to handling incoming requests. By sending a POST request accompanied by a test name of predefined tests, users can trigger change point detection on the provided metadata and metrics. Following the processing, the response is formatted in JSON, providing a structured output for seamless integration and analysis. To trigger daemon mode just use the following commands diff --git a/orion.py b/orion.py index 4935372..2a66e66 100644 --- a/orion.py +++ b/orion.py @@ -69,6 +69,14 @@ def cli(max_content_width=120): # pylint: disable=unused-argument # pylint: disable=too-many-locals @cli.command(name="cmd") +@click.option( + "--cmr", + is_flag=True, + help="Generate percent difference in comparison", + cls=MutuallyExclusiveOption, + mutually_exclusive=["anomaly_detection","hunter_analyze"], +) +@click.option("--filter", is_flag=True, help="Generate percent difference in comparison") @click.option("--config", default="config.yaml", help="Path to the configuration file") @click.option( "--save-data-path", default="data.csv", help="Path to save the output file" @@ -79,7 +87,7 @@ def cli(max_content_width=120): # pylint: disable=unused-argument is_flag=True, help="run hunter analyze", cls=MutuallyExclusiveOption, - mutually_exclusive=["anomaly_detection"], + mutually_exclusive=["anomaly_detection","cmr"], ) @click.option("--anomaly-window", type=int, callback=validate_anomaly_options, help="set window size for moving average for anomaly-detection") @click.option("--min-anomaly-percent", type=int, callback=validate_anomaly_options, help="set minimum 
percentage difference from moving average for data point to be detected as anomaly") @@ -88,7 +96,7 @@ def cli(max_content_width=120): # pylint: disable=unused-argument is_flag=True, help="run anomaly detection algorithm powered by isolation forest", cls=MutuallyExclusiveOption, - mutually_exclusive=["hunter_analyze"], + mutually_exclusive=["hunter_analyze","cmr"], ) @click.option( "-o", diff --git a/percentdiff.jpg b/percentdiff.jpg new file mode 100644 index 0000000..c66352b Binary files /dev/null and b/percentdiff.jpg differ diff --git a/pkg/algorithms/algorithmFactory.py b/pkg/algorithms/algorithmFactory.py index a4d47f8..720a35d 100644 --- a/pkg/algorithms/algorithmFactory.py +++ b/pkg/algorithms/algorithmFactory.py @@ -6,6 +6,7 @@ import pkg.constants as cnsts from .edivisive import EDivisive from .isolationforest import IsolationForestWeightedMean +from .cmr import CMR class AlgorithmFactory: # pylint: disable= too-few-public-methods, too-many-arguments, line-too-long @@ -30,4 +31,6 @@ def instantiate_algorithm(self, algorithm: str, matcher: Matcher, dataframe:pd.D return EDivisive(matcher, dataframe, test, options, metrics_config) if algorithm == cnsts.ISOLATION_FOREST: return IsolationForestWeightedMean(matcher, dataframe, test, options, metrics_config) + if algorithm == cnsts.CMR: + return CMR(matcher, dataframe, test, options, metrics_config) raise ValueError("Invalid algorithm called") diff --git a/pkg/algorithms/cmr/__init__.py b/pkg/algorithms/cmr/__init__.py new file mode 100644 index 0000000..f93d771 --- /dev/null +++ b/pkg/algorithms/cmr/__init__.py @@ -0,0 +1,4 @@ +""" +Init for CMR Algorithm +""" +from .cmr import CMR diff --git a/pkg/algorithms/cmr/cmr.py b/pkg/algorithms/cmr/cmr.py new file mode 100644 index 0000000..89f2ffc --- /dev/null +++ b/pkg/algorithms/cmr/cmr.py @@ -0,0 +1,108 @@ +"""CMR - Comparing Mean Responses Algorithm""" + +# pylint: disable = line-too-long +import pandas as pd +import numpy + +from fmatch.logrus import 
def _analyze(self):
    """Average the previous runs and compare them with the most recent run.

    The ``timestamp`` column of ``self.dataframe`` is converted in place to
    integer epoch seconds. With a single row there is nothing to compare
    against, so the series is returned with no change points. Otherwise
    ``self.dataframe`` is replaced by the frame produced by
    ``combine_and_average_runs`` and handed to ``run_cmr``.

    Returns:
        series: object returned by ``setup_series`` with ``data`` set to the
            analysed dataframe
        change_points_by_metric: dict of metric name to list of ChangePoints
            (empty dict when only one run is present)
    """
    logger_instance = SingletonLogger.getLogger("Orion")
    logger_instance.info("Starting analysis using CMR")

    # Use an explicit 64-bit integer: plain ``int`` maps to a 32-bit type on
    # some platforms, where the datetime -> integer cast is rejected.
    # NOTE(review): ``// 10**9`` assumes the datetime values are stored with
    # nanosecond resolution - confirm for the pandas version in use.
    timestamps = pd.to_datetime(self.dataframe["timestamp"])
    self.dataframe["timestamp"] = timestamps.astype("int64") // 10**9

    if len(self.dataframe.index) == 1:
        # Only the current run exists; no previous data to compare with.
        series = self.setup_series()
        series.data = self.dataframe
        return series, {}

    # More than one row: collapse every row except the last into one row.
    self.dataframe = self.combine_and_average_runs(self.dataframe)

    series = self.setup_series()

    df, change_points_by_metric = self.run_cmr(self.dataframe)
    series.data = df
    return series, change_points_by_metric
def combine_and_average_runs(self, dataFrame: pd.DataFrame):
    """Collapse all previous runs into one row and append the current run.

    Every row except the last is treated as a previous run. Columns with a
    numeric numpy dtype (any integer or float width) are replaced by their
    mean; every other column is replaced by the comma-joined string form of
    its values. The last row (the current run) is appended unchanged, so the
    result has the combined row at index 0 and the current run at index 1.

    Args:
        dataFrame (pd.DataFrame): data to combine into 2 rows; the most
            recent run must be the last row

    Returns:
        pd.DataFrame: combined previous runs followed by the most recent run.
            A frame with fewer than two rows has nothing to combine and is
            returned as-is with a fresh 0-based index.
    """
    if len(dataFrame.index) < 2:
        return dataFrame.reset_index(drop=True)

    last_row = dataFrame.tail(1)
    # Positional slicing: must not depend on the index starting at label 0.
    previous_runs = dataFrame.iloc[:-1]

    combined = {}
    for column in dataFrame.columns:
        values = previous_runs[column]
        dtype = values.dtype
        # Decide per column dtype rather than by inspecting the first cell,
        # so int32/float32 columns are averaged too. bool is not a subtype of
        # numpy.number and therefore falls through to the joined branch.
        if isinstance(dtype, numpy.dtype) and numpy.issubdtype(dtype, numpy.number):
            combined[column] = [values.mean()]
        else:
            # str() keeps the join from failing on non-string cells.
            combined[column] = [",".join(str(value) for value in values)]

    return pd.concat([pd.DataFrame(combined), last_row], ignore_index=True)
def shorten_url(shortener: Any, uuids: str) -> str:
    """Shorten one build URL, or each URL of a comma-separated string.

    Args:
        shortener (Any): object exposing ``tinyurl.short(url)``
        uuids (str): a single build URL or several joined with ``,``.
            The name is kept for backward compatibility; the values are
            build URLs, not uuids.

    Returns:
        str: the shortened URLs joined with ``,`` in their original order;
            an empty string when no URL is given
    """
    # Drop surrounding whitespace and empty segments so a blank value or a
    # stray comma never reaches the shortening service.
    build_urls = (url.strip() for url in uuids.split(","))
    return ",".join(shortener.tinyurl.short(url) for url in build_urls if url)