From 5f7316261e77f9f59e9dc8aab40c55d357efdd7b Mon Sep 17 00:00:00 2001 From: Nitin Garg Date: Wed, 21 Aug 2024 16:09:03 +0000 Subject: [PATCH] Use monitoring api for cpu/memory utilization .. only when mash is not available on the VM in question. Add min/max split in cpu/memory values cpu/memory from monotoring api --- .../examples/dlio/parse_logs.py | 59 ++++-- .../testing_on_gke/examples/fio/parse_logs.py | 54 +++-- .../testing_on_gke/examples/run-gke-tests.sh | 15 +- .../examples/utils/parse_logs_common.py | 20 ++ .../testing_on_gke/examples/utils/utils.py | 195 +++++++++++++++++- .../examples/utils/utils_test.py | 54 +++++ 6 files changed, 364 insertions(+), 33 deletions(-) create mode 100644 perfmetrics/scripts/testing_on_gke/examples/utils/utils_test.py diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py index 59a604f1a7..d491e26e00 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py @@ -22,7 +22,7 @@ import dlio_workload sys.path.append("../") -from utils.utils import get_memory, get_cpu, standard_timestamp, is_mash_installed +from utils.utils import get_memory, get_cpu, unix_to_timestamp, standard_timestamp, is_mash_installed, get_memory_from_monitoring_api, get_cpu_from_monitoring_api, timestamp_to_epoch from utils.parse_logs_common import ensureDir, download_gcs_objects, parseLogParserArguments, SUPPORTED_SCENARIOS _LOCAL_LOGS_LOCATION = "../../bin/dlio-logs/logs" @@ -36,6 +36,8 @@ "train_throughput_samples_per_second": 0, "train_throughput_mb_per_second": 0, "throughput_over_local_ssd": 0, + "start_epoch": "", + "end_epoch": "", "start": "", "end": "", "highest_memory": 0, @@ -161,24 +163,51 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict: * int(output[key]["mean_file_size"]) / (1024**2) ) + r["start_epoch"] = timestamp_to_epoch( + per_epoch_stats_data[str(i + 1)]["start"] + ) + r["end_epoch"] = timestamp_to_epoch( + per_epoch_stats_data[str(i + 1)]["end"] + ) r["start"] = standard_timestamp( per_epoch_stats_data[str(i + 1)]["start"] ) r["end"] = standard_timestamp(per_epoch_stats_data[str(i + 1)]["end"]) - if r["scenario"] != "local-ssd" and mash_installed: - r["lowest_memory"], r["highest_memory"] = get_memory( - r["pod_name"], - r["start"], - r["end"], - project_number=args.project_number, - ) - r["lowest_cpu"], r["highest_cpu"] = get_cpu( - r["pod_name"], - r["start"], - r["end"], - project_number=args.project_number, - ) - pass + + if r["scenario"] != "local-ssd": + if mash_installed: + r["lowest_memory"], r["highest_memory"] = get_memory( + r["pod_name"], + r["start"], + r["end"], + project_number=args.project_number, + ) + r["lowest_cpu"], r["highest_cpu"] = get_cpu( + r["pod_name"], + r["start"], + r["end"], + project_number=args.project_number, + ) + else: + r["lowest_memory"], r["highest_memory"] = ( + get_memory_from_monitoring_api( + pod_name=r["pod_name"], + start_epoch=r["start_epoch"], + end_epoch=r["end_epoch"], + project_id=args.project_id, + cluster_name=args.cluster_name, + namespace_name=args.namespace_name, + ) + ) + r["lowest_cpu"], r["highest_cpu"] = get_cpu_from_monitoring_api( + pod_name=r["pod_name"], + start_epoch=r["start_epoch"], + end_epoch=r["end_epoch"], + project_id=args.project_id, + cluster_name=args.cluster_name, + namespace_name=args.namespace_name, + ) + pass r["gcsfuse_mount_options"] = gcsfuse_mount_options diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py index a1aaeafbb3..8073f77de0 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py @@ -22,7 +22,7 @@ import fio_workload sys.path.append("../") -from utils.utils import get_memory, get_cpu, unix_to_timestamp, is_mash_installed +from utils.utils import get_memory, get_cpu, unix_to_timestamp, is_mash_installed, get_memory_from_monitoring_api, get_cpu_from_monitoring_api from utils.parse_logs_common import ensureDir, download_gcs_objects, parseLogParserArguments, SUPPORTED_SCENARIOS _LOCAL_LOGS_LOCATION = "../../bin/fio-logs" @@ -35,6 +35,8 @@ "IOPS": 0, "throughput_mb_per_second": 0, "throughput_over_local_ssd": 0, + "start_epoch": "", + "end_epoch": "", "start": "", "end": "", "highest_memory": 0, @@ -203,24 +205,48 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict: r["throughput_mb_per_second"] = int( per_epoch_output_data["jobs"][0]["read"]["bw_bytes"] / (1024**2) ) + r["start_epoch"] = per_epoch_output_data["jobs"][0]["job_start"] // 1000 + r["end_epoch"] = per_epoch_output_data["timestamp_ms"] // 1000 r["start"] = unix_to_timestamp( per_epoch_output_data["jobs"][0]["job_start"] ) r["end"] = unix_to_timestamp(per_epoch_output_data["timestamp_ms"]) - if r["scenario"] != "local-ssd" and mash_installed: - r["lowest_memory"], r["highest_memory"] = get_memory( - r["pod_name"], - r["start"], - r["end"], - project_number=args.project_number, - ) - r["lowest_cpu"], r["highest_cpu"] = get_cpu( - r["pod_name"], - r["start"], - r["end"], - project_number=args.project_number, - ) + + if r["scenario"] != "local-ssd": + if mash_installed: + r["lowest_memory"], r["highest_memory"] = get_memory( + r["pod_name"], + r["start"], + r["end"], + project_number=args.project_number, + ) + r["lowest_cpu"], r["highest_cpu"] = get_cpu( + r["pod_name"], + r["start"], + r["end"], + project_number=args.project_number, + ) + else: + r["lowest_memory"], r["highest_memory"] = ( + get_memory_from_monitoring_api( + pod_name=r["pod_name"], + start_epoch=r["start_epoch"], + end_epoch=r["end_epoch"], + project_id=args.project_id, + cluster_name=args.cluster_name, + namespace_name=args.namespace_name, + ) + ) + r["lowest_cpu"], r["highest_cpu"] = get_cpu_from_monitoring_api( + pod_name=r["pod_name"], + start_epoch=r["start_epoch"], + end_epoch=r["end_epoch"], + project_id=args.project_id, + cluster_name=args.cluster_name, + namespace_name=args.namespace_name, + ) pass + r["gcsfuse_mount_options"] = gcsfuse_mount_options r["blockSize"] = bs r["filesPerThread"] = nrfiles diff --git a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh index 8fed0c3600..af07c24651 100755 --- a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh +++ b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh @@ -241,7 +241,16 @@ function installDependencies() { sudo apt install docker-ce -y fi # Ensure that mash is installed. - which mash || (sudo apt-get install -y monarch-tools) + if ! which mash ; then + if ! sudo apt-get install -y monarch-tools; then + # Ensure that gcloud monitoring tools are installed. This is alternative to + # mash on gce vm. + # pip install --upgrade google-cloud-storage + # pip install --ignore-installed --upgrade google-api-python-client + # pip install --ignore-installed --upgrade google-cloud + pip install --upgrade google-cloud-monitoring + fi + fi } # Make sure you have access to the necessary GCP resources. The easiest way to enable it is to use @google.com as active auth. @@ -529,14 +538,14 @@ function waitTillAllPodsComplete() { function fetchAndParseFioOutputs() { echo "Fetching and parsing fio outputs ..." cd "${gke_testing_dir}"/examples/fio - python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/fio/output.csv + python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/fio/output.csv --project-id=${project_id} --cluster-name=${cluster_name} --namespace-name=${appnamespace} cd - } function fetchAndParseDlioOutputs() { echo "Fetching and parsing dlio outputs ..." cd "${gke_testing_dir}"/examples/dlio - python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/dlio/output.csv + python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/dlio/output.csv --project-id=${project_id} --cluster-name=${cluster_name} --namespace-name=${appnamespace} cd - } diff --git a/perfmetrics/scripts/testing_on_gke/examples/utils/parse_logs_common.py b/perfmetrics/scripts/testing_on_gke/examples/utils/parse_logs_common.py index 9baf5c4c2e..91f19a745c 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/utils/parse_logs_common.py +++ b/perfmetrics/scripts/testing_on_gke/examples/utils/parse_logs_common.py @@ -74,8 +74,18 @@ def parseLogParserArguments() -> object: ), required=True, ) + parser.add_argument( + "--project-id", + metavar="GCP Project ID/name", + help=( + "project-id (e.g. gcs-fuse-test) is needed to fetch the cpu/memory" + " utilization data from GCP." + ), + required=True, + ) parser.add_argument( "--project-number", + metavar="GCP Project Number", help=( "project-number (e.g. 93817472919) is needed to fetch the cpu/memory" " utilization data from GCP." @@ -87,6 +97,16 @@ def parseLogParserArguments() -> object: help="unique string ID for current test-run", required=True, ) + parser.add_argument( + "--cluster-name", + help="Name of GKE cluster where the current test was run", + required=True, + ) + parser.add_argument( + "--namespace-name", + help="kubernestes namespace used for the current test-run", + required=True, + ) parser.add_argument( "-o", "--output-file", diff --git a/perfmetrics/scripts/testing_on_gke/examples/utils/utils.py b/perfmetrics/scripts/testing_on_gke/examples/utils/utils.py index a7a2a41829..0bb5fccb2b 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/utils/utils.py +++ b/perfmetrics/scripts/testing_on_gke/examples/utils/utils.py @@ -16,7 +16,10 @@ # limitations under the License. import datetime, subprocess +import math +import time from typing import Tuple +from google.cloud import monitoring_v3 def is_mash_installed() -> bool: @@ -138,10 +141,20 @@ def unix_to_timestamp(unix_timestamp: int) -> str: return utc_timestamp_string -def standard_timestamp(timestamp: int) -> str: +def standard_timestamp(timestamp: str) -> str: return timestamp.split(".")[0].replace("T", " ") + " UTC" +def timestamp_to_epoch(timestamp: str) -> int: + return int( + time.mktime( + time.strptime( + timestamp.split(".")[0].replace("T", " "), "%Y-%m-%d %H:%M:%S" + ) + ) + ) + + class UnknownMachineTypeError(Exception): """Defines custom exception for unknown machine-type scenario. @@ -167,3 +180,183 @@ def resource_limits(nodeType: str) -> Tuple[dict, dict]: " resource-limits for it.", nodeType, ) + + +def isRelevantMonitoringResult( + result, + cluster_name: str, + pod_name: str, + # container_name: str, + namespace_name: str, +) -> bool: + return ( + True + if ( + hasattr(result, "resource") + and hasattr(result.resource, "type") + and result.resource.type == "k8s_container" + and hasattr(result.resource, "labels") + # and "cluster_name" in result.resource.labels + # and result.resource.labels["cluster_name"] == cluster_name + # and "pod_name" in result.resource.labels + # and result.resource.labels["pod_name"] == pod_name + # and "container_name" in result.resource.labels + # and result.resource.labels["container_name"] == container_name + # and "namespace_name" in result.resource.labels + # and result.resource.labels["namespace_name"] == namespace_name + and hasattr(result, "points") + ) + else False + ) + + +def get_memory_from_monitoring_api( + project_id: str, + cluster_name: str, + pod_name: str, + # container_name: str, + namespace_name: str, + start_epoch: int, + end_epoch: int, +) -> Tuple[int, int]: + """Returns min,max memory usage of the given gke-cluster/namespace/pod/container/start/end scenario in MiB .""" + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{project_id}" + + interval = monitoring_v3.TimeInterval({ + "start_time": {"seconds": start_epoch, "nanos": 0}, + "end_time": {"seconds": end_epoch, "nanos": 0}, + }) + aggregation = monitoring_v3.Aggregation({ + "alignment_period": {"seconds": 60}, # 1 minute + "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_MAX, + }) + + results = client.list_time_series( + request={ + "name": project_name, + "filter": ( + 'metric.type = "kubernetes.io/container/memory/used_bytes"' + # ' AND metric.memory_type = "non-evictable"' # for some reason, + # this throws error, so commented it out. + f" AND resource.labels.cluster_name = {cluster_name}" + f" AND resource.labels.pod_name = {pod_name}" + # f" AND resource.labels.container_name = {container_name}" + f" AND resource.labels.namespace_name = {namespace_name}" + ), + "interval": interval, + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + "aggregation": aggregation, + } + ) + + relevant_results = [ + result + for result in results + if isRelevantMonitoringResult( + result, + cluster_name, + pod_name, + # container_name, + namespace_name, + ) + ] + return round( + min( + min( + (point.value.int64_value if point.value.int64_value >= 0 else 0) + for point in result.points + ) + for result in relevant_results + ) + / 2**20, # convert to MiB/s + 0, # round to integer. + ), round( + max( + max( + (point.value.int64_value if point.value.int64_value > 0 else 0) + for point in result.points + ) + for result in relevant_results + ) + / 2**20, # convert to MiB/s + 0, # round to integer. + ) + + +def get_cpu_from_monitoring_api( + project_id: str, + cluster_name: str, + pod_name: str, + # container_name: str, + namespace_name: str, + start_epoch: int, + end_epoch: int, +) -> Tuple[float, float]: + """Returns min,max cpu usage of the given gke-cluster/namespace/pod/container/start/end scenario.""" + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{project_id}" + + interval = monitoring_v3.TimeInterval({ + "start_time": {"seconds": start_epoch, "nanos": 0}, + "end_time": {"seconds": end_epoch, "nanos": 0}, + }) + aggregation = monitoring_v3.Aggregation({ + "alignment_period": {"seconds": 60}, # 1 minute + "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_RATE, + }) + + results = client.list_time_series( + request={ + "name": project_name, + "filter": ( + 'metric.type = "kubernetes.io/container/cpu/core_usage_time"' + f" AND resource.labels.cluster_name = {cluster_name}" + f" AND resource.labels.pod_name = {pod_name}" + # f" AND resource.labels.container_name = {container_name}" + f" AND resource.labels.namespace_name = {namespace_name}" + ), + "interval": interval, + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + "aggregation": aggregation, + } + ) + + relevant_results = [ + result + for result in results + if isRelevantMonitoringResult( + result, + cluster_name, + pod_name, + # container_name, + namespace_name, + ) + ] + return round( + min( + min( + ( + point.value.double_value + if point.value.double_value != math.nan + else 0 + ) + for point in result.points + ) + for result in relevant_results + ), + 5, # round up to 5 decimal places. + ), round( + max( + max( + ( + point.value.double_value + if point.value.double_value != math.nan + else 0 + ) + for point in result.points + ) + for result in relevant_results + ), + 5, # round up to 5 decimal places. + ) diff --git a/perfmetrics/scripts/testing_on_gke/examples/utils/utils_test.py b/perfmetrics/scripts/testing_on_gke/examples/utils/utils_test.py new file mode 100644 index 0000000000..df91d0257f --- /dev/null +++ b/perfmetrics/scripts/testing_on_gke/examples/utils/utils_test.py @@ -0,0 +1,54 @@ +"""This file defines unit tests for functionalities in utils.py""" + +import unittest +import utils +from utils import get_cpu_from_monitoring_api, get_memory_from_monitoring_api, timestamp_to_epoch + + +class UtilsTest(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.project_id = "gcs-fuse-test" + self.cluster_name = "gargnitin-dryrun-us-west1-6" + self.pod_name = "fio-tester-gcsfuse-rr-64k-1670041227260535313" + # self.container_name = "fio-tester" + self.namespace_name = "default" + self.start_epoch = 1724233283 + self.end_epoch = 1724233442 + + def test_get_memory_from_monitoring_api(self): + print( + get_memory_from_monitoring_api( + self.project_id, + self.cluster_name, + self.pod_name, + # self.container_name, + self.namespace_name, + self.start_epoch, + self.end_epoch, + ) + ) + + def test_get_cpu_from_monitoring_api(self): + print( + get_cpu_from_monitoring_api( + self.project_id, + self.cluster_name, + self.pod_name, + # self.container_name, + self.namespace_name, + self.start_epoch, + self.end_epoch, + ) + ) + + def test_timestamp_to_epoch(self): + timestamp = "2024-08-21T19:20:25" + expected_epoch = 1724268025 + self.assertEqual(timestamp_to_epoch(timestamp), expected_epoch) + pass + + +if __name__ == "__main__": + unittest.main()