From d8fa5c86cc118bfcd242f1715bd5c616f8eccfa5 Mon Sep 17 00:00:00 2001
From: akenO8
Date: Wed, 24 May 2023 20:47:04 +0800
Subject: [PATCH 1/5] [NEW]init_code

---
 .idea/.gitignore                               |   3 +
 .idea/codeStyles/Project.xml                   |  28 +
 .idea/dbnavigator.xml                          | 410 ++++++++
 .idea/hadoop_jmx_exporter.iml                  |  11 +
 .../inspectionProfiles/profiles_settings.xml   |   6 +
 .idea/misc.xml                                 |   4 +
 .idea/modules.xml                              |   8 +
 .idea/vcs.xml                                  |   6 +
 common.py                                      | 794 +++++++--------
 hadoop_jmx_exporter.py                         |  88 +-
 hdfs_datanode.py                               | 326 +++---
 hdfs_journalnode.py                            | 284 +++---
 hdfs_namenode.py                               | 956 +++++++++---
 metrics/common/JvmMetrics.json                 |  56 +-
 metrics/common/MetricsSystem.json              |  28 +-
 metrics/common/OperatingSystem.json            |  26 +-
 metrics/common/RpcActivity.json                |  32 +-
 metrics/common/RpcDetailedActivity.json        |   6 +-
 metrics/common/Runtime.json                    |   6 +-
 metrics/common/UgiMetrics.json                 |  18 +-
 metrics/datanode/DataNodeActivity.json         | 138 +--
 metrics/datanode/DataNodeInfo.json             |   8 +-
 metrics/datanode/FSDatasetState.json           |  24 +-
 metrics/journalnode/JournalNode.json           |  56 +-
 metrics/namenode/FSNamesystem.json             |  70 +-
 metrics/namenode/FSNamesystemState.json        |  30 +-
 metrics/namenode/NameNodeActivity.json         |  82 +-
 metrics/namenode/RetryCache.json               |  10 +-
 metrics/namenode/StartupProgress.json          |  38 +-
 metrics/nodemanager/NodeManagerMetrics.json    |  38 +-
 metrics/nodemanager/ShuffleMetrics.json        |  12 +-
 metrics/resourcemanager/ClusterMetrics.json    |  20 +-
 metrics/resourcemanager/QueueMetrics.json      |  64 +-
 metrics/resourcemanager/RMNMInfo.json          |  10 +-
 requirements.txt                               |   6 +-
 scraper.py                                     | 104 +-
 utils.py                                       | 140 +--
 yarn_nodemanager.py                            | 176 ++--
 yarn_resourcemanager.py                        | 506 ++++-----
 39 files changed, 2552 insertions(+), 2076 deletions(-)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/codeStyles/Project.xml
 create mode 100644 .idea/dbnavigator.xml
 create mode 100644 .idea/hadoop_jmx_exporter.iml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..0e40fe8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+
+# Default ignored files
+/workspace.xml
\ No newline at end of file
diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml
new file mode 100644
index 0000000..daa337d
--- /dev/null
+++ b/.idea/codeStyles/Project.xml
@@ -0,0 +1,28 @@
[28 lines of IDE code-style XML; markup lost in extraction]
\ No newline at end of file
diff --git a/.idea/dbnavigator.xml b/.idea/dbnavigator.xml
new file mode 100644
index 0000000..16c89ca
--- /dev/null
+++ b/.idea/dbnavigator.xml
@@ -0,0 +1,410 @@
[410 lines of DB Navigator plugin XML; markup lost in extraction]
\ No newline at end of file
diff --git a/.idea/hadoop_jmx_exporter.iml b/.idea/hadoop_jmx_exporter.iml
new file mode 100644
index 0000000..19e0466
--- /dev/null
+++ b/.idea/hadoop_jmx_exporter.iml
@@ -0,0 +1,11 @@
[11 lines of IDE module XML; markup lost in extraction]
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
[6 lines of inspection-profile XML; markup lost in extraction]
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..c05018a
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
[4 lines of IDE project XML; markup lost in extraction]
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..26b086a
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
[8 lines of IDE modules XML; markup lost in extraction]
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..9661ac7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
[6 lines of IDE VCS XML; markup lost in extraction]
\ No newline at end of file
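For orientation before the code diffs: every collector in this patch consumes Hadoop's JMX servlet, which each daemon serves at /jmx as a JSON document with a top-level "beans" array. A minimal sketch of that fetch, as an assumption about the mechanism rather than a copy of scraper.py (the host and port below are invented placeholders):

import json

import requests


def fetch_beans(url):
    # Hadoop daemons expose their JMX MBeans as JSON at /jmx.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return json.loads(resp.text).get("beans", [])


if __name__ == "__main__":
    # "nn-host:50070" is a hypothetical NameNode HTTP address.
    for bean in fetch_beans("http://nn-host:50070/jmx"):
        print(bean.get("name", ""))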
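The metrics/**/*.json files updated in this patch map raw JMX attribute names to Prometheus help strings; utils.get_file_list and utils.read_json_file (see the utils.py diff in the diffstat) feed them to the collectors. A plausible minimal loader, written as an assumption about those helpers rather than a copy of them:

import json
import os


def read_json_file(service, name):
    # Assumed layout: metrics/<service>/<name>.json, e.g.
    # metrics/common/JvmMetrics.json mapping attribute -> description.
    path = os.path.join("metrics", service, "%s.json" % name)
    with open(path) as f:
        return json.load(f)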
elif "AvgTime" in metric: - key = "AvgTime" - method = metric.split("AvgTime")[0] - else: - continue - label = [self.cluster, detail_tag, method, self.target] - self.common_metrics['RpcDetailedActivity'][key].add_metric(label, bean[metric]) - - def get_ugi_metrics(self, bean): - for metric in self.tmp_metrics['UgiMetrics']: - if 'NumOps' in metric: - key = 'NumOps' - if 'Login' in metric: - method = 'Login' - state = metric.split('Login')[1].split('NumOps')[0] - label = [self.cluster, method, state] - else: - method = metric.split('NumOps')[0] - label = [self.cluster, method, "-"] - elif 'AvgTime' in metric: - key = 'AvgTime' - if 'Login' in metric: - method = 'Login' - state = metric.split('Login')[1].split('AvgTime')[0] - label = [self.cluster, method, state] - else: - method = metric.split('AvgTime')[0] - label = [self.cluster, method, "-"] - else: - key = metric - label = [self.cluster] - label.append(self.target) - self.common_metrics['UgiMetrics'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) - - def get_metric_system_metrics(self, bean): - for metric in self.tmp_metrics['MetricsSystem']: - if 'NumOps' in metric: - key = 'NumOps' - oper = metric.split('NumOps')[0] - label = [self.cluster, oper] - elif 'AvgTime' in metric: - key = 'AvgTime' - oper = metric.split('AvgTime')[0] - label = [self.cluster, oper] - else: - key = metric - label = [self.cluster] - label.append(self.target) - self.common_metrics['MetricsSystem'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) - - def get_runtime_metrics(self, bean): - for metric in self.tmp_metrics['Runtime']: - label = [self.cluster, bean['Name'].split("@")[1], self.target] - self.common_metrics['Runtime'][metric].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import re +from prometheus_client.core import GaugeMetricFamily + +import utils + + +logger = utils.get_module_logger(__name__) + + +class MetricCollector(object): + def __init__(self, cluster, component, service): + self.cluster = cluster + self.component = component + self.prefix = 'hadoop_{0}_{1}'.format(component, service) + + self.file_list = utils.get_file_list(service) + self.metrics = {} + for i in range(len(self.file_list)): + self.metrics.setdefault(self.file_list[i], utils.read_json_file(service, self.file_list[i])) + + common_file = utils.get_file_list("common") + self.merge_list = self.file_list + common_file + + def collect(self): + pass + + def _setup_metrics_labels(self): + pass + + def _get_metrics(self, metrics): + pass + + +class CommonMetricCollector(): + def __init__(self, cluster, component, service): + self.cluster = cluster + self.componet = component + self.service = service + self.prefix = 'hadoop_{0}_{1}'.format(component, service) + self.common_metrics = {} + self.tmp_metrics = {} + file_list = utils.get_file_list("common") + for i in range(len(file_list)): + self.common_metrics.setdefault(file_list[i], {}) + self.tmp_metrics.setdefault(file_list[i], utils.read_json_file("common", file_list[i])) + + def setup_labels(self, beans): + for i in range(len(beans)): + if 'name=JvmMetrics' in beans[i]['name']: + self.setup_jvm_labels() + if 'OperatingSystem' in beans[i]['name']: + self.setup_os_labels() + if 'RpcActivity' in beans[i]['name']: + self.setup_rpc_labels() + if 'RpcDetailedActivity' in beans[i]['name']: + self.setup_rpc_detailed_labels() + if 'UgiMetrics' in beans[i]['name']: + self.setup_ugi_labels() + if 
'MetricsSystem' in beans[i]['name'] and "sub=Stats" in beans[i]['name']: + self.setup_metric_system_labels() + if 'Runtime' in beans[i]['name']: + self.setup_runtime_labels() + + def get_metrics(self, beans, target): + self.target = target + for i in range(len(beans)): + if 'name=JvmMetrics' in beans[i]['name']: + self.get_jvm_metrics(beans[i]) + if 'OperatingSystem' in beans[i]['name']: + self.get_os_metrics(beans[i]) + if 'RpcActivity' in beans[i]['name']: + self.get_rpc_metrics(beans[i]) + if 'RpcDetailedActivity' in beans[i]['name']: + self.get_rpc_detailed_metrics(beans[i]) + if 'UgiMetrics' in beans[i]['name']: + self.get_ugi_metrics(beans[i]) + if 'MetricsSystem' in beans[i]['name'] and "sub=Stats" in beans[i]['name']: + self.get_metric_system_metrics(beans[i]) + if 'Runtime' in beans[i]['name']: + self.get_runtime_metrics(beans[i]) + return self.common_metrics + + def setup_jvm_labels(self): + for metric in self.tmp_metrics["JvmMetrics"]: + snake_case = "_".join(["jvm", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) + if 'Mem' in metric: + name = "".join([snake_case, "ebibytes"]) + label = ["cluster", "mode"] + if "Used" in metric: + key = "jvm_mem_used_mebibytes" + descriptions = "Current memory used in mebibytes." + elif "Committed" in metric: + key = "jvm_mem_committed_mebibytes" + descriptions = "Current memory committed in mebibytes." + elif "Max" in metric: + key = "jvm_mem_max_mebibytes" + descriptions = "Current max memory in mebibytes." + else: + key = name + label = ["cluster"] + descriptions = self.tmp_metrics['JvmMetrics'][metric] + elif 'Gc' in metric: + label = ["cluster", "type"] + if "GcCount" in metric: + key = "jvm_gc_count" + descriptions = "GC count of each type GC." + elif "GcTimeMillis" in metric: + key = "jvm_gc_time_milliseconds" + descriptions = "Each type GC time in milliseconds." + elif "ThresholdExceeded" in metric: + key = "jvm_gc_exceeded_threshold_total" + descriptions = "Number of times that the GC threshold is exceeded." + else: + key = snake_case + label = ["cluster"] + descriptions = self.tmp_metrics['JvmMetrics'][metric] + elif 'Threads' in metric: + label = ["cluster", "state"] + key = "jvm_threads_state_total" + descriptions = "Current number of different threads." + elif 'Log' in metric: + label = ["cluster", "level"] + key = "jvm_log_level_total" + descriptions = "Total number of each level logs." 
+ else: + label = ["cluster"] + key = snake_case + descriptions = self.tmp_metrics['JvmMetrics'][metric] + label.append("_target") + self.common_metrics['JvmMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, key]), descriptions, labels=label) + + def setup_os_labels(self): + for metric in self.tmp_metrics['OperatingSystem']: + label = ["cluster", "_target"] + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, snake_case]) + self.common_metrics['OperatingSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['OperatingSystem'][metric], labels=label) + + def setup_rpc_labels(self): + num_rpc_flag, avg_rpc_flag = 1, 1 + for metric in self.tmp_metrics["RpcActivity"]: + snake_case = "_".join(["rpc", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) + if 'Rpc' in metric: + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + label = ["cluster", "tag"] + if "NumOps" in metric: + if num_rpc_flag: + key = "MethodNumOps" + label.extend(["method", "_target"]) + name = "_".join([self.prefix, "rpc_method_called_total"]) + description = "Total number of the times the method is called." + self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, description, labels=label) + num_rpc_flag = 0 + else: + continue + elif "AvgTime" in metric: + if avg_rpc_flag: + key = "MethodAvgTime" + label.extend(["method", "_target"]) + name = "_".join([self.prefix, "rpc_method_avg_time_milliseconds"]) + descrption = "Average turn around time of the method in milliseconds." + self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, descrption, labels=label) + avg_rpc_flag = 0 + else: + continue + else: + key = metric + label.append("_target") + name = "_".join([self.prefix, snake_case]) + self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcActivity'][metric], labels=label) + + def setup_rpc_detailed_labels(self): + for metric in self.tmp_metrics['RpcDetailedActivity']: + label = ["cluster", "tag", "method", "_target"] + if "NumOps" in metric: + key = "NumOps" + name = "_".join([self.prefix, 'rpc_detailed_method_called_total']) + elif "AvgTime" in metric: + key = "AvgTime" + name = "_".join([self.prefix, 'rpc_detailed_method_avg_time_milliseconds']) + else: + continue + self.common_metrics['RpcDetailedActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcDetailedActivity'][metric], labels=label) + return self.common_metrics + + def setup_ugi_labels(self): + ugi_num_flag, ugi_avg_flag = 1, 1 + for metric in self.tmp_metrics['UgiMetrics']: + label = ["cluster"] + if 'NumOps' in metric: + if ugi_num_flag: + key = 'NumOps' + label.extend(["method", "state", "_target"]) + ugi_num_flag = 0 + name = "_".join([self.prefix, 'ugi_method_called_total']) + description = "Total number of the times the method is called." + self.common_metrics['UgiMetrics'][key] = GaugeMetricFamily(name, description, labels=label) + else: + continue + elif 'AvgTime' in metric: + if ugi_avg_flag: + key = 'AvgTime' + label.extend(["method", "state", "_target"]) + ugi_avg_flag = 0 + name = "_".join([self.prefix, 'ugi_method_avg_time_milliseconds']) + description = "Average turn around time of the method in milliseconds." 
+ self.common_metrics['UgiMetrics'][key] = GaugeMetricFamily(name, description, labels=label) + else: + continue + else: + label.append("_target") + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, 'ugi', snake_case]) + self.common_metrics['UgiMetrics'][metric] = GaugeMetricFamily(name, self.tmp_metrics['UgiMetrics'][metric], labels=label) + + def setup_metric_system_labels(self): + metric_num_flag, metric_avg_flag = 1, 1 + for metric in self.tmp_metrics['MetricsSystem']: + label = ["cluster"] + if 'NumOps' in metric: + if metric_num_flag: + key = 'NumOps' + label.extend(["oper", "_target"]) + metric_num_flag = 0 + name = "_".join([self.prefix, 'metricssystem_operations_total']) + self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, "Total number of operations", labels=label) + else: + continue + elif 'AvgTime' in metric: + if metric_avg_flag: + key = 'AvgTime' + label.extend(["oper", "_target"]) + metric_avg_flag = 0 + name = "_".join([self.prefix, 'metricssystem_method_avg_time_milliseconds']) + description = "Average turn around time of the operations in milliseconds." + self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, description, labels=label) + else: + continue + else: + label.append("_target") + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, 'metricssystem', snake_case]) + self.common_metrics['MetricsSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['MetricsSystem'][metric], labels=label) + + def setup_runtime_labels(self): + for metric in self.tmp_metrics['Runtime']: + label = ["cluster", "host", "_target"] + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, snake_case, "milliseconds"]) + self.common_metrics['Runtime'][metric] = GaugeMetricFamily(name, self.tmp_metrics['Runtime'][metric], labels=label) + + def get_jvm_metrics(self, bean): + for metric in self.tmp_metrics['JvmMetrics']: + name = "_".join(["jvm", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) + if 'Mem' in metric: + if "Used" in metric: + key = "jvm_mem_used_mebibytes" + mode = metric.split("Used")[0].split("Mem")[1] + label = [self.cluster, mode] + elif "Committed" in metric: + key = "jvm_mem_committed_mebibytes" + mode = metric.split("Committed")[0].split("Mem")[1] + label = [self.cluster, mode] + elif "Max" in metric: + key = "jvm_mem_max_mebibytes" + if "Heap" in metric: + mode = metric.split("Max")[0].split("Mem")[1] + else: + mode = "max" + label = [self.cluster, mode] + else: + key = "".join([name, 'ebibytes']) + label = [self.cluster] + elif 'Gc' in metric: + if "GcCount" in metric: + key = "jvm_gc_count" + if "GcCount" == metric: + typo = "total" + else: + typo = metric.split("GcCount")[1] + label = [self.cluster, typo] + elif "GcTimeMillis" in metric: + key = "jvm_gc_time_milliseconds" + if "GcTimeMillis" == metric: + typo = "total" + else: + typo = metric.split("GcTimeMillis")[1] + label = [self.cluster, typo] + elif "ThresholdExceeded" in metric: + key = "jvm_gc_exceeded_threshold_total" + typo = metric.split("ThresholdExceeded")[ + 0].split("GcNum")[1] + label = [self.cluster, typo] + else: + key = name + label = [self.cluster] + elif 'Threads' in metric: + key = "jvm_threads_state_total" + state = metric.split("Threads")[1] + label = [self.cluster, state] + elif 'Log' in metric: + key = "jvm_log_level_total" + level = metric.split("Log")[1] + label = [self.cluster, level] + else: + key = name + label = 
[self.cluster] + label.append(self.target) + self.common_metrics['JvmMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) + + def get_os_metrics(self, bean): + for metric in self.tmp_metrics['OperatingSystem']: + label = [self.cluster] + label.append(self.target) + self.common_metrics['OperatingSystem'][metric].add_metric(label, bean[metric] if metric in bean else 0) + + def get_rpc_metrics(self, bean): + rpc_tag = bean['tag.port'] + for metric in self.tmp_metrics['RpcActivity']: + if "NumOps" in metric: + method = metric.split('NumOps')[0] + label = [self.cluster, rpc_tag, method] + key = "MethodNumOps" + elif "AvgTime" in metric: + method = metric.split('AvgTime')[0] + label = [self.cluster, rpc_tag, method] + key = "MethodAvgTime" + else: + label = [self.cluster, rpc_tag] + key = metric + label.append(self.target) + self.common_metrics['RpcActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) + + def get_rpc_detailed_metrics(self, bean): + detail_tag = bean['tag.port'] + for metric in bean: + if metric[0].isupper(): + if "NumOps" in metric: + key = "NumOps" + method = metric.split('NumOps')[0] + elif "AvgTime" in metric: + key = "AvgTime" + method = metric.split("AvgTime")[0] + else: + continue + label = [self.cluster, detail_tag, method, self.target] + self.common_metrics['RpcDetailedActivity'][key].add_metric(label, bean[metric]) + + def get_ugi_metrics(self, bean): + for metric in self.tmp_metrics['UgiMetrics']: + if 'NumOps' in metric: + key = 'NumOps' + if 'Login' in metric: + method = 'Login' + state = metric.split('Login')[1].split('NumOps')[0] + label = [self.cluster, method, state] + else: + method = metric.split('NumOps')[0] + label = [self.cluster, method, "-"] + elif 'AvgTime' in metric: + key = 'AvgTime' + if 'Login' in metric: + method = 'Login' + state = metric.split('Login')[1].split('AvgTime')[0] + label = [self.cluster, method, state] + else: + method = metric.split('AvgTime')[0] + label = [self.cluster, method, "-"] + else: + key = metric + label = [self.cluster] + label.append(self.target) + self.common_metrics['UgiMetrics'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + + def get_metric_system_metrics(self, bean): + for metric in self.tmp_metrics['MetricsSystem']: + if 'NumOps' in metric: + key = 'NumOps' + oper = metric.split('NumOps')[0] + label = [self.cluster, oper] + elif 'AvgTime' in metric: + key = 'AvgTime' + oper = metric.split('AvgTime')[0] + label = [self.cluster, oper] + else: + key = metric + label = [self.cluster] + label.append(self.target) + self.common_metrics['MetricsSystem'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + + def get_runtime_metrics(self, bean): + for metric in self.tmp_metrics['Runtime']: + label = [self.cluster, bean['Name'].split("@")[1], self.target] + self.common_metrics['Runtime'][metric].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) diff --git a/hadoop_jmx_exporter.py b/hadoop_jmx_exporter.py index f85d5fc..ef590dd 100755 --- a/hadoop_jmx_exporter.py +++ b/hadoop_jmx_exporter.py @@ -1,44 +1,44 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -import time -from prometheus_client import start_http_server -from prometheus_client.core import REGISTRY - -import utils -from utils import get_module_logger -from hdfs_namenode import NameNodeMetricCollector -from hdfs_datanode import DataNodeMetricCollector -from hdfs_journalnode import JournalNodeMetricCollector -from yarn_resourcemanager import 
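common.py above builds GaugeMetricFamily objects once per scrape and fills them from beans; the pattern it relies on is prometheus_client's custom-collector protocol, in which any object exposing a collect() generator can be registered. A self-contained illustration (the metric name, label value, and port below are invented, not from the patch):

import time

from prometheus_client import start_http_server
from prometheus_client.core import GaugeMetricFamily, REGISTRY


class DemoCollector(object):
    def collect(self):
        # Build a fresh metric family on every scrape and yield it.
        g = GaugeMetricFamily("hadoop_demo_value", "Illustrative gauge.", labels=["cluster"])
        g.add_metric(["mycluster"], 1.0)
        yield g


if __name__ == "__main__":
    REGISTRY.register(DemoCollector())
    start_http_server(9131)
    while True:
        time.sleep(60)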
diff --git a/hadoop_jmx_exporter.py b/hadoop_jmx_exporter.py
index f85d5fc..ef590dd 100755
--- a/hadoop_jmx_exporter.py
+++ b/hadoop_jmx_exporter.py
@@ -1,44 +1,44 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+
+import time
+from prometheus_client import start_http_server
+from prometheus_client.core import REGISTRY
+
+import utils
+from utils import get_module_logger
+from hdfs_namenode import NameNodeMetricCollector
+from hdfs_datanode import DataNodeMetricCollector
+from hdfs_journalnode import JournalNodeMetricCollector
+from yarn_resourcemanager import ResourceManagerMetricCollector
+from yarn_nodemanager import NodeManagerMetricCollector
+
+logger = get_module_logger(__name__)
+
+
+def register_prometheus(cluster, args):
+    if args.nns is not None and len(args.nns) > 0:
+        nnc = NameNodeMetricCollector(cluster, args.nns)
+        nnc.collect()
+        REGISTRY.register(nnc)
+        REGISTRY.register(DataNodeMetricCollector(cluster, nnc))
+    if args.rms is not None and len(args.rms) > 0:
+        rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue)
+        rmc.collect()
+        REGISTRY.register(rmc)
+        REGISTRY.register(NodeManagerMetricCollector(cluster, rmc))
+    if args.jns is not None and len(args.jns) > 0:
+        REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns))
+
+
+def main():
+    args = utils.parse_args()
+    host = args.host
+    port = int(args.port)
+    start_http_server(port, host)
+    print "Listen at %s:%s" % (host, port)
+    register_prometheus(args.cluster, args)
+    while True:
+        time.sleep(300)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hdfs_datanode.py b/hdfs_datanode.py
index db962c0..8e55a75 100644
--- a/hdfs_datanode.py
+++ b/hdfs_datanode.py
@@ -1,163 +1,163 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+
+import yaml
+import re
+from prometheus_client.core import GaugeMetricFamily
+
+from utils import get_module_logger
+from common import MetricCollector, CommonMetricCollector
+from scraper import ScrapeMetrics
+
+logger = get_module_logger(__name__)
+
+
+class DataNodeMetricCollector(MetricCollector):
+    def __init__(self, cluster, nnc):
+        MetricCollector.__init__(self, cluster, "hdfs", "datanode")
+        self.target = "-"
+        self.nnc = nnc
+
+        self.hadoop_datanode_metrics = {}
+        for i in range(len(self.file_list)):
+            self.hadoop_datanode_metrics.setdefault(self.file_list[i], {})
+
+        self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "datanode")
+
+    def collect(self):
+        isSetup = False
+        if self.nnc.dns == "":
+            return
+        beans_list = ScrapeMetrics(self.nnc.dns).scrape()
+        for beans in beans_list:
+            if not isSetup:
+                self.common_metric_collector.setup_labels(beans)
+                self.setup_metrics_labels(beans)
+                isSetup = True
+            for i in range(len(beans)):
+                if 'tag.Hostname' in beans[i]:
+                    self.target = beans[i]["tag.Hostname"]
+                    break
+            self.hadoop_datanode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target))
+            self.get_metrics(beans)
+
+        for i in range(len(self.merge_list)):
+            service = self.merge_list[i]
+            if service in self.hadoop_datanode_metrics:
+                for metric in self.hadoop_datanode_metrics[service]:
+                    yield self.hadoop_datanode_metrics[service][metric]
+
+    def setup_dninfo_labels(self):
+        for metric in self.metrics['DataNodeInfo']:
+            if 'VolumeInfo' in metric:
+                label = ["cluster", "version", "path", "state"]
+                name = "_".join([self.prefix, 'volume_state'])
+            else:
+                label = ["cluster", "version"]
+                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
+                name = "_".join([self.prefix, snake_case])
+            label.append("_target")
+            self.hadoop_datanode_metrics['DataNodeInfo'][metric] = GaugeMetricFamily(name, self.metrics['DataNodeInfo'][metric], labels=label)
+
+    def setup_dnactivity_labels(self):
+        block_flag, client_flag = 1, 1
+        for metric in self.metrics['DataNodeActivity']:
+            if 'Blocks' in metric:
+                if block_flag:
+                    label = ['cluster', 'host', 'oper']
+                    key = "Blocks"
+                    name = "block_operations_total"
+                    descriptions = "Total number of blocks in different operations"
+                    block_flag = 0
+                else:
+                    continue
+            elif 'Client' in metric:
+                if client_flag:
+                    label = ['cluster', 'host', 'oper', 'client']
+                    key = "Client"
+                    name = "from_client_total"
+                    descriptions = "Total number of each operation from different clients"
+                    client_flag = 0
+                else:
+                    continue
+            else:
+                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
+                label = ['cluster', 'host']
+                key = metric
+                name = snake_case
+                descriptions = self.metrics['DataNodeActivity'][metric]
+            label.append("_target")
+            self.hadoop_datanode_metrics['DataNodeActivity'][key] = GaugeMetricFamily("_".join([self.prefix, name]), descriptions, labels=label)
+
+    def setup_fsdatasetstate_labels(self):
+        for metric in self.metrics['FSDatasetState']:
+            label = ['cluster', 'host', "_target"]
+            if "Num" in metric:
+                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric.split("Num")[1]).lower()
+            else:
+                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
+            name = "_".join([self.prefix, snake_case])
+            self.hadoop_datanode_metrics['FSDatasetState'][metric] = GaugeMetricFamily(name, self.metrics['FSDatasetState'][metric], labels=label)
+
+    def setup_metrics_labels(self, beans):
+        for i in range(len(beans)):
+            if 'DataNodeInfo' in beans[i]['name']:
+                self.setup_dninfo_labels()
+            if 'DataNodeActivity' in beans[i]['name']:
+                self.setup_dnactivity_labels()
+            if 'FSDatasetState' in beans[i]['name']:
+                self.setup_fsdatasetstate_labels()
+
+    def get_dninfo_metrics(self, bean):
+        for metric in self.metrics['DataNodeInfo']:
+            version = bean['Version']
+            if 'VolumeInfo' in metric:
+                if 'VolumeInfo' in bean:
+                    volume_info_dict = yaml.safe_load(bean['VolumeInfo'])
+                    for k, v in volume_info_dict.items():
+                        path = k
+                        for key, val in v.items():
+                            if key != "storageType":
+                                state = key
+                                label = [self.cluster, version, path, state, self.target]
+                                value = val
+                                self.hadoop_datanode_metrics['DataNodeInfo'][metric].add_metric(label, value)
+                else:
+                    continue
+            else:
+                label = [self.cluster, version, self.target]
+                value = bean[metric]
+                self.hadoop_datanode_metrics['DataNodeInfo'][metric].add_metric(label, value)
+
+    def get_dnactivity_metrics(self, bean):
+        for metric in self.metrics['DataNodeActivity']:
+            host = bean['tag.Hostname']
+            label = [self.cluster, host]
+            if 'Blocks' in metric:
+                oper = metric.split("Blocks")[1]
+                label.append(oper)
+                key = "Blocks"
+            elif 'Client' in metric:
+                oper = metric.split("Client")[0].split("From")[0]
+                client = metric.split("Client")[0].split("From")[1]
+                label.extend([oper, client])
+                key = "Client"
+            else:
+                key = metric
+            label.append(self.target)
+            self.hadoop_datanode_metrics['DataNodeActivity'][key].add_metric(label, bean[metric] if metric in bean else 0)
+
+    def get_fsdatasetstate_metrics(self, bean):
+        for metric in self.metrics['FSDatasetState']:
+            label = [self.cluster, self.target, self.target]
+            self.hadoop_datanode_metrics['FSDatasetState'][metric].add_metric(label, bean[metric] if metric in bean else 0)
+
+    def get_metrics(self, beans):
+        for i in range(len(beans)):
+            if 'DataNodeInfo' in beans[i]['name']:
+                self.get_dninfo_metrics(beans[i])
+            if 'DataNodeActivity' in beans[i]['name']:
+                self.get_dnactivity_metrics(beans[i])
+            if 'FSDatasetState' in beans[i]['name']:
+                self.get_fsdatasetstate_metrics(beans[i])
JournalNodeMetricCollector(MetricCollector): - def __init__(self, cluster, urls): - MetricCollector.__init__(self, cluster, "hdfs", "journalnode") - self.target = "-" - self.urls = urls - - self.hadoop_journalnode_metrics = {} - for i in range(len(self.file_list)): - self.hadoop_journalnode_metrics.setdefault(self.file_list[i], {}) - - self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "journalnode") - - self.scrape_metrics = ScrapeMetrics(urls) - - def collect(self): - isSetup = False - beans_list = self.scrape_metrics.scrape() - for beans in beans_list: - if not isSetup: - self.common_metric_collector.setup_labels(beans) - self.setup_metrics_labels(beans) - isSetup = True - for i in range(len(beans)): - if 'tag.Hostname' in beans[i]: - self.target = beans[i]["tag.Hostname"] - break - self.hadoop_journalnode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) - self.get_metrics(beans) - - for i in range(len(self.merge_list)): - service = self.merge_list[i] - if service in self.hadoop_journalnode_metrics: - for metric in self.hadoop_journalnode_metrics[service]: - yield self.hadoop_journalnode_metrics[service][metric] - - def setup_journalnode_labels(self): - a_60_latency_flag, a_300_latency_flag, a_3600_latency_flag = 1, 1, 1 - for metric in self.metrics['JournalNode']: - label = ["cluster", "host", "_target"] - if 'Syncs60s' in metric: - if a_60_latency_flag: - a_60_latency_flag = 0 - key = "Syncs60" - name = "_".join([self.prefix, 'sync60s_latency_microseconds']) - descriptions = "The percentile of sync latency in microseconds in 60s granularity" - self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label) - else: - continue - elif 'Syncs300s' in metric: - if a_300_latency_flag: - a_300_latency_flag = 0 - key = "Syncs300" - name = "_".join([self.prefix, 'sync300s_latency_microseconds']) - descriptions = "The percentile of sync latency in microseconds in 300s granularity" - self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label) - else: - continue - elif 'Syncs3600s' in metric: - if a_3600_latency_flag: - a_3600_latency_flag = 0 - key = "Syncs3600" - name = "_".join([self.prefix, 'sync3600s_latency_microseconds']) - descriptions = "The percentile of sync latency in microseconds in 3600s granularity" - self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label) - else: - continue - else: - snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - name = "_".join([self.prefix, snake_case]) - self.hadoop_journalnode_metrics['JournalNode'][metric] = GaugeMetricFamily(name, self.metrics['JournalNode'][metric], labels=label) - - def setup_metrics_labels(self, beans): - for i in range(len(beans)): - if 'name=Journal-' in beans[i]['name']: - self.setup_journalnode_labels() - - def get_metrics(self, beans): - for i in range(len(beans)): - if 'name=Journal-' in beans[i]['name'] and 'JournalNode' in self.metrics: - host = beans[i]['tag.Hostname'] - label = [self.cluster, host, self.target] - - a_60_sum, a_300_sum, a_3600_sum = 0.0, 0.0, 0.0 - a_60_value, a_300_value, a_3600_value = [], [], [] - a_60_percentile, a_300_percentile, a_3600_percentile = [], [], [] - - for metric in beans[i]: - if not metric[0].isupper(): - continue - if "Syncs60s" in metric: - if 'NumOps' in metric: - a_60_count = beans[i][metric] - else: - tmp = 
metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s") - a_60_percentile.append(str(float(tmp[1]) / 100.0)) - a_60_value.append(beans[i][metric]) - a_60_sum += beans[i][metric] - elif 'Syncs300' in metric: - if 'NumOps' in metric: - a_300_count = beans[i][metric] - else: - tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s") - a_300_percentile.append(str(float(tmp[1]) / 100.0)) - a_300_value.append(beans[i][metric]) - a_300_sum += beans[i][metric] - elif 'Syncs3600' in metric: - if 'NumOps' in metric: - a_3600_count = beans[i][metric] - else: - tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s") - a_3600_percentile.append(str(float(tmp[1]) / 100.0)) - a_3600_value.append(beans[i][metric]) - a_3600_sum += beans[i][metric] - else: - key = metric - if key in self.hadoop_journalnode_metrics['JournalNode']: - self.hadoop_journalnode_metrics['JournalNode'][key].add_metric(label, beans[i][metric]) - a_60_bucket = zip(a_60_percentile, a_60_value) - a_300_bucket = zip(a_300_percentile, a_300_value) - a_3600_bucket = zip(a_3600_percentile, a_3600_value) - a_60_bucket.sort() - a_300_bucket.sort() - a_3600_bucket.sort() - a_60_bucket.append(("+Inf", a_60_count)) - a_300_bucket.append(("+Inf", a_300_count)) - a_3600_bucket.append(("+Inf", a_3600_count)) - self.hadoop_journalnode_metrics['JournalNode']['Syncs60'].add_metric(label, buckets=a_60_bucket, sum_value=a_60_sum) - self.hadoop_journalnode_metrics['JournalNode']['Syncs300'].add_metric(label, buckets=a_300_bucket, sum_value=a_300_sum) - self.hadoop_journalnode_metrics['JournalNode']['Syncs3600'].add_metric(label, buckets=a_3600_bucket, sum_value=a_3600_sum) +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import re +from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily + +from utils import get_module_logger +from common import MetricCollector, CommonMetricCollector +from scraper import ScrapeMetrics + +logger = get_module_logger(__name__) + + +class JournalNodeMetricCollector(MetricCollector): + def __init__(self, cluster, urls): + MetricCollector.__init__(self, cluster, "hdfs", "journalnode") + self.target = "-" + self.urls = urls + + self.hadoop_journalnode_metrics = {} + for i in range(len(self.file_list)): + self.hadoop_journalnode_metrics.setdefault(self.file_list[i], {}) + + self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "journalnode") + + self.scrape_metrics = ScrapeMetrics(urls) + + def collect(self): + isSetup = False + beans_list = self.scrape_metrics.scrape() + for beans in beans_list: + if not isSetup: + self.common_metric_collector.setup_labels(beans) + self.setup_metrics_labels(beans) + isSetup = True + for i in range(len(beans)): + if 'tag.Hostname' in beans[i]: + self.target = beans[i]["tag.Hostname"] + break + self.hadoop_journalnode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) + self.get_metrics(beans) + + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.hadoop_journalnode_metrics: + for metric in self.hadoop_journalnode_metrics[service]: + yield self.hadoop_journalnode_metrics[service][metric] + + def setup_journalnode_labels(self): + a_60_latency_flag, a_300_latency_flag, a_3600_latency_flag = 1, 1, 1 + for metric in self.metrics['JournalNode']: + label = ["cluster", "host", "_target"] + if 'Syncs60s' in metric: + if a_60_latency_flag: + a_60_latency_flag = 0 + key = "Syncs60" + name = "_".join([self.prefix, 
'sync60s_latency_microseconds']) + descriptions = "The percentile of sync latency in microseconds in 60s granularity" + self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label) + else: + continue + elif 'Syncs300s' in metric: + if a_300_latency_flag: + a_300_latency_flag = 0 + key = "Syncs300" + name = "_".join([self.prefix, 'sync300s_latency_microseconds']) + descriptions = "The percentile of sync latency in microseconds in 300s granularity" + self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label) + else: + continue + elif 'Syncs3600s' in metric: + if a_3600_latency_flag: + a_3600_latency_flag = 0 + key = "Syncs3600" + name = "_".join([self.prefix, 'sync3600s_latency_microseconds']) + descriptions = "The percentile of sync latency in microseconds in 3600s granularity" + self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label) + else: + continue + else: + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, snake_case]) + self.hadoop_journalnode_metrics['JournalNode'][metric] = GaugeMetricFamily(name, self.metrics['JournalNode'][metric], labels=label) + + def setup_metrics_labels(self, beans): + for i in range(len(beans)): + if 'name=Journal-' in beans[i]['name']: + self.setup_journalnode_labels() + + def get_metrics(self, beans): + for i in range(len(beans)): + if 'name=Journal-' in beans[i]['name'] and 'JournalNode' in self.metrics: + host = beans[i]['tag.Hostname'] + label = [self.cluster, host, self.target] + + a_60_sum, a_300_sum, a_3600_sum = 0.0, 0.0, 0.0 + a_60_value, a_300_value, a_3600_value = [], [], [] + a_60_percentile, a_300_percentile, a_3600_percentile = [], [], [] + + for metric in beans[i]: + if not metric[0].isupper(): + continue + if "Syncs60s" in metric: + if 'NumOps' in metric: + a_60_count = beans[i][metric] + else: + tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s") + a_60_percentile.append(str(float(tmp[1]) / 100.0)) + a_60_value.append(beans[i][metric]) + a_60_sum += beans[i][metric] + elif 'Syncs300' in metric: + if 'NumOps' in metric: + a_300_count = beans[i][metric] + else: + tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s") + a_300_percentile.append(str(float(tmp[1]) / 100.0)) + a_300_value.append(beans[i][metric]) + a_300_sum += beans[i][metric] + elif 'Syncs3600' in metric: + if 'NumOps' in metric: + a_3600_count = beans[i][metric] + else: + tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s") + a_3600_percentile.append(str(float(tmp[1]) / 100.0)) + a_3600_value.append(beans[i][metric]) + a_3600_sum += beans[i][metric] + else: + key = metric + if key in self.hadoop_journalnode_metrics['JournalNode']: + self.hadoop_journalnode_metrics['JournalNode'][key].add_metric(label, beans[i][metric]) + a_60_bucket = zip(a_60_percentile, a_60_value) + a_300_bucket = zip(a_300_percentile, a_300_value) + a_3600_bucket = zip(a_3600_percentile, a_3600_value) + a_60_bucket.sort() + a_300_bucket.sort() + a_3600_bucket.sort() + a_60_bucket.append(("+Inf", a_60_count)) + a_300_bucket.append(("+Inf", a_300_count)) + a_3600_bucket.append(("+Inf", a_3600_count)) + self.hadoop_journalnode_metrics['JournalNode']['Syncs60'].add_metric(label, buckets=a_60_bucket, sum_value=a_60_sum) + self.hadoop_journalnode_metrics['JournalNode']['Syncs300'].add_metric(label, buckets=a_300_bucket, 
sum_value=a_300_sum) + self.hadoop_journalnode_metrics['JournalNode']['Syncs3600'].add_metric(label, buckets=a_3600_bucket, sum_value=a_3600_sum) diff --git a/hdfs_namenode.py b/hdfs_namenode.py index da06d7f..93a15a9 100644 --- a/hdfs_namenode.py +++ b/hdfs_namenode.py @@ -1,478 +1,478 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -import yaml -import re - -from prometheus_client.core import GaugeMetricFamily - -from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector -from scraper import ScrapeMetrics - -logger = get_module_logger(__name__) - - -class NameNodeMetricCollector(MetricCollector): - def __init__(self, cluster, urls): - MetricCollector.__init__(self, cluster, "hdfs", "namenode") - self.target = "-" - self.urls = urls - self.dns = set() - - self.hadoop_namenode_metrics = {} - for i in range(len(self.file_list)): - self.hadoop_namenode_metrics.setdefault(self.file_list[i], {}) - - self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "namenode") - - self.scrape_metrics = ScrapeMetrics(urls) - - def collect(self): - isSetup = False - beans_list = self.scrape_metrics.scrape() - for beans in beans_list: - if not isSetup: - self.common_metric_collector.setup_labels(beans) - self.setup_metrics_labels(beans) - isSetup = True - for i in range(len(beans)): - if 'tag.Hostname' in beans[i]: - self.target = beans[i]["tag.Hostname"] - break - self.hadoop_namenode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) - self.get_metrics(beans) - - for i in range(len(self.merge_list)): - service = self.merge_list[i] - if service in self.hadoop_namenode_metrics: - for metric in self.hadoop_namenode_metrics[service]: - yield self.hadoop_namenode_metrics[service][metric] - - def setup_nnactivity_labels(self): - num_namenode_flag, avg_namenode_flag, ops_namenode_flag = 1, 1, 1 - for metric in self.metrics['NameNodeActivity']: - label = ["cluster", "method", "_target"] - if "NumOps" in metric: - if num_namenode_flag: - key = "MethodNumOps" - name = "_".join([self.prefix, "nnactivity_method_ops_total"]) - description = "Total number of the times the method is called." - self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, description, labels=label) - num_namenode_flag = 0 - else: - continue - elif "AvgTime" in metric: - if avg_namenode_flag: - key = "MethodAvgTime" - name = "_".join([self.prefix, "nnactivity_method_avg_time_milliseconds"]) - descripton = "Average turn around time of the method in milliseconds." - self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, descripton, labels=label) - avg_namenode_flag = 0 - else: - continue - elif ops_namenode_flag: - key = "Operations" - name = "_".join([self.prefix, "nnactivity_operations_total"]) - description = "Total number of each operation." - self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, description, labels=label) - ops_namenode_flag = 0 - - def setup_startupprogress_labels(self): - sp_count_flag, sp_elapsed_flag, sp_total_flag, sp_complete_flag = 1, 1, 1, 1 - for metric in self.metrics['StartupProgress']: - snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - if "ElapsedTime" == metric: - key = "ElapsedTime" - name = "total_elapsed_time_milliseconds" - descriptions = "Total elapsed time in milliseconds." 
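
The Syncs* handling above is compact, so here is a self-contained sketch of the same idea: JMX publishes one gauge per percentile (Syncs60s50thPercentileLatencyMicros and so on) plus a NumOps counter, and the collector regroups them into a single HistogramMetricFamily per time window. The bean values and the cluster/host labels below are illustrative, not real scraper output.

from prometheus_client.core import HistogramMetricFamily

# Illustrative JMX attributes for the 60s window; real beans carry more quantiles.
bean = {
    "Syncs60sNumOps": 120,
    "Syncs60s50thPercentileLatencyMicros": 300,
    "Syncs60s95thPercentileLatencyMicros": 900,
}
buckets, latency_sum = [], 0.0
for attr in sorted(bean):
    if "NumOps" in attr:
        num_ops = bean[attr]
    else:
        # "Syncs60s50thPercentileLatencyMicros" -> ["60", "50"] -> quantile "0.5"
        quantile = attr.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s")[1]
        buckets.append((str(float(quantile) / 100.0), bean[attr]))
        latency_sum += bean[attr]
buckets.append(("+Inf", num_ops))
hist = HistogramMetricFamily("hadoop_hdfs_journalnode_sync60s_latency_microseconds",
                             "Sync latency percentiles in 60s granularity",
                             labels=["cluster", "host"])
hist.add_metric(["my-cluster", "jn1"], buckets=buckets, sum_value=latency_sum)
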
- elif "PercentComplete" == metric: - key = "PercentComplete" - name = "complete_rate" - descriptions = "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0)." - elif "Count" in metric: - if sp_count_flag: - sp_count_flag = 0 - key = "PhaseCount" - name = "phase_count" - descriptions = "Total number of steps completed in the phase." - else: - continue - elif "ElapsedTime" in metric: - if sp_elapsed_flag: - sp_elapsed_flag = 0 - key = "PhaseElapsedTime" - name = "phase_elapsed_time_milliseconds" - descriptions = "Total elapsed time in the phase in milliseconds." - else: - continue - elif "Total" in metric: - if sp_total_flag: - sp_total_flag = 0 - key = "PhaseTotal" - name = "phase_total" - descriptions = "Total number of steps in the phase." - else: - continue - elif "PercentComplete" in metric: - if sp_complete_flag: - sp_complete_flag = 0 - key = "PhasePercentComplete" - name = "phase_complete_rate" - descriptions = "Current rate completed in the phase (The max value is not 100 but 1.0)." - else: - continue - else: - key = metric - name = snake_case - descriptions = self.metrics['StartupProgress'][metric] - label = ["cluster", "phase", "_target"] - name = "_".join([self.prefix, "startup_process", name]) - self.hadoop_namenode_metrics['StartupProgress'][key] = GaugeMetricFamily(name, descriptions, labels=label) - - def setup_fsnamesystem_labels(self): - cap_flag = 1 - for metric in self.metrics['FSNamesystem']: - if metric.startswith('Capacity'): - if cap_flag: - cap_flag = 0 - key = "capacity" - label = ["cluster", "mode"] - name = "capacity_bytes" - descriptions = "Current DataNodes capacity in each mode in bytes" - else: - continue - else: - key = metric - label = ["cluster"] - name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - descriptions = self.metrics['FSNamesystem'][metric] - label.append("_target") - name = "_".join([self.prefix, "fsname_system", name]) - self.hadoop_namenode_metrics['FSNamesystem'][key] = GaugeMetricFamily(name, descriptions, labels=label) - - def setup_fsnamesystem_state_labels(self): - num_flag = 1 - for metric in self.metrics['FSNamesystemState']: - snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - if 'DataNodes' in metric: - if num_flag: - num_flag = 0 - key = "datanodes_num" - label = ["cluster", "state"] - descriptions = "Number of datanodes in each state" - else: - continue - else: - key = metric - label = ["cluster"] - descriptions = self.metrics['FSNamesystemState'][metric] - label.append("_target") - name = "_".join([self.prefix, "fsname_system_state", snake_case]) - self.hadoop_namenode_metrics['FSNamesystemState'][key] = GaugeMetricFamily(name, descriptions, labels=label) - - def setup_retrycache_labels(self): - cache_flag = 1 - for metric in self.metrics['RetryCache']: - if cache_flag: - cache_flag = 0 - key = "cache" - label = ["cluster", "mode", "_target"] - name = "_".join([self.prefix, "cache_total"]) - description = "Total number of RetryCache in each mode" - self.hadoop_namenode_metrics['RetryCache'][key] = GaugeMetricFamily(name, description, labels=label) - - def setup_nninfo_labels(self): - for metric in self.metrics['NameNodeInfo']: - if "LiveNodes" in metric: - name = "_".join([self.prefix, "nninfo_live_nodes_count"]) - description = "Count of live data node" - self.hadoop_namenode_metrics['NameNodeInfo']["LiveNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) - - label = ["cluster", "datanode", "infoAddr", "infoSecureAddr", "xferaddr", 
"version", "_target"] - items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks", - "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"] - for item in items: - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - name = "_".join([self.prefix, "nninfo_live_nodes", item]) - key = "LiveNodes-" + item - description = "Live node " + item - if item == "admin_state": - description += " 0: In Service, 1: Decommission In Progress, 2: Decommissioned" - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) - continue - elif "DeadNodes" in metric: - name = "_".join([self.prefix, "nninfo_dead_nodes_count"]) - description = "Count of dead data node" - self.hadoop_namenode_metrics['NameNodeInfo']["DeadNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) - - label = ["cluster", "datanode", "decommissioned", "xferaddr"] - name = "_".join([self.prefix, "nninfo_dead_nodes_last_contact"]) - key = "DeadNodes" - description = "Dead node last contact in milions" - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) - continue - elif "DecomNodes" in metric: - name = "_".join([self.prefix, "nninfo_decom_nodes_count"]) - description = "Count of decommissioned data node" - self.hadoop_namenode_metrics['NameNodeInfo']["DecomNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) - - label = ["cluster", "datanode", "xferaddr", "_target"] - items = ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"] - for item in items: - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - name = "_".join([self.prefix, "nninfo_decom_nodes", item]) - key = "DecomNodes-" + item - description = "Decom Node " + item - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) - continue - elif "EnteringMaintenanceNodes" in metric: - name = "_".join([self.prefix, "nninfo_maintenance_nodes_count"]) - description = "Count of maintenance data node" - self.hadoop_namenode_metrics['NameNodeInfo']["MaintenanceNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) - - label = ["cluster", "datanode", "xferaddr", "_target"] - items = ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"] - for item in items: - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - name = "_".join([self.prefix, "nninfo_entering_maintenance_nodes", item]) - key = "EnteringMaintenanceNodes-" + item - description = "Entering maintenance node " + item - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) - continue - elif "CorruptFiles" in metric: - label = ["cluster", "_target"] - name = "_".join([self.prefix, "nninfo_corrupt_file_count"]) - key = "CorruptFiles" - description = "Corrupt file count" - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) - continue - elif "NodeUsage" in metric: - label = ["cluster", "_target"] - items = ["min", "median", "max", "stdDev"] - for item in items: - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - name = "_".join([self.prefix, "nninfo_node_usage", item]) - key = "NodeUsage-" + item - description = "Node usage " + item - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) - continue - elif 
"SoftwareVersion" in metric: - label = ["cluster", "software_version"] - name = "_".join([self.prefix, "nninfo_software_version"]) - key = "SoftwareVersion" - elif "Safemode" in metric: - label = ["cluster"] - name = "_".join([self.prefix, "nninfo_safe_mode"]) - key = "Safemode" - else: - label = ["cluster"] - snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - name = "_".join([self.prefix, "nninfo", snake_case]) - key = metric - label.append("_target") - self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=label) - - def setup_metrics_labels(self, beans): - for i in range(len(beans)): - if 'NameNodeActivity' in beans[i]['name']: - self.setup_nnactivity_labels() - if 'StartupProgress' in beans[i]['name']: - self.setup_startupprogress_labels() - if 'FSNamesystem' in beans[i]['name']: - self.setup_fsnamesystem_labels() - if 'FSNamesystemState' in beans[i]['name']: - self.setup_fsnamesystem_state_labels() - if 'RetryCache' in beans[i]['name']: - self.setup_retrycache_labels() - if "NameNodeInfo" in beans[i]['name']: - self.setup_nninfo_labels() - - def get_nnactivity_metrics(self, bean): - for metric in self.metrics['NameNodeActivity']: - if "NumOps" in metric: - method = metric.split('NumOps')[0] - key = "MethodNumOps" - elif "AvgTime" in metric: - method = metric.split('AvgTime')[0] - key = "MethodAvgTime" - else: - if "Ops" in metric: - method = metric.split('Ops')[0] - else: - method = metric - key = "Operations" - label = [self.cluster, method, self.target] - self.hadoop_namenode_metrics['NameNodeActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) - - def get_startupprogress_metrics(self, bean): - for metric in self.metrics['StartupProgress']: - if "Count" in metric: - key = "PhaseCount" - phase = metric.split("Count")[0] - elif "ElapsedTime" in metric and "ElapsedTime" != metric: - key = "PhaseElapsedTime" - phase = metric.split("ElapsedTime")[0] - elif "Total" in metric: - key = "PhaseTotal" - phase = metric.split("Total")[0] - elif "PercentComplete" in metric and "PercentComplete" != metric: - key = "PhasePercentComplete" - phase = metric.split("PercentComplete")[0] - else: - key = metric - phase = "-" - label = [self.cluster, phase, self.target] - self.hadoop_namenode_metrics['StartupProgress'][key].add_metric(label, bean[metric] if metric in bean else 0) - - def get_fsnamesystem_metrics(self, bean): - for metric in self.metrics['FSNamesystem']: - key = metric - if 'HAState' in metric: - label = [self.cluster] - if 'initializing' == bean['tag.HAState']: - value = 0.0 - elif 'active' == bean['tag.HAState']: - value = 1.0 - elif 'standby' == bean['tag.HAState']: - value = 2.0 - elif 'stopping' == bean['tag.HAState']: - value = 3.0 - else: - value = 9999 - label.append(self.target) - self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(label, value) - elif metric.startswith("Capacity"): - key = 'capacity' - mode = metric.split("Capacity")[1] - label = [self.cluster, mode] - label.append(self.target) - self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(label, bean[metric] if metric in bean else 0) - else: - label = [self.cluster] - label.append(self.target) - self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(label, bean[metric] if metric in bean else 0) - - def get_fsnamesystem_state_metrics(self, bean): - for metric in self.metrics['FSNamesystemState']: - label = [self.cluster] - key = metric - if 'FSState' in metric: - if 'Safemode' == bean['FSState']: 
- value = 0.0 - elif 'Operational' == bean['FSState']: - value = 1.0 - else: - value = 9999 - label.append(self.target) - self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, value) - elif "TotalSyncTimes" in metric: - label.append(self.target) - self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, float( - re.sub(r'\s', '', bean[metric])) if metric in bean and bean[metric] else 0) - elif "DataNodes" in metric: - key = 'datanodes_num' - state = metric.split("DataNodes")[0].split("Num")[1] - label = [self.cluster, state, self.target] - self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) - else: - label.append(self.target) - self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) - - def get_retrycache_metrics(self, bean): - for metric in self.metrics['RetryCache']: - key = "cache" - label = [self.cluster, metric.split('Cache')[1], self.target] - self.hadoop_namenode_metrics['RetryCache'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) - - def get_nninfo_metrics(self, bean): - for metric in self.metrics["NameNodeInfo"]: - if "LiveNodes" in metric and "LiveNodes" in bean: - live_node_dict = yaml.safe_load(bean["LiveNodes"]) - self.hadoop_namenode_metrics["NameNodeInfo"]["LiveNodeCount"].add_metric([self.cluster, self.target], len(live_node_dict)) - dns = set() - for node, info in live_node_dict.items(): - label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target] - items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks", - "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"] - dns.add("http://"+info["infoAddr"]+"/jmx") - for item in items: - value = info[item] if item in info else 0 - if item == "adminState": - if value == "In Service": - value = 0 - elif value == "Decommission In Progress": - value = 1 - else: # Decommissioned - value = 2 - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - key = "LiveNodes-" + item - self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) - self.dns = dns - elif "DeadNodes" in metric and "DeadNodes" in bean: - dead_node_dict = yaml.safe_load(bean["DeadNodes"]) - self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodeCount"].add_metric([self.cluster, self.target], len(dead_node_dict)) - for node, info in dead_node_dict.items(): - label = [self.cluster, node, str(info["decommissioned"]), info["xferaddr"], self.target] - value = info["lastContact"] - self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodes"].add_metric(label, value) - elif "DecomNodes" in metric and "DecomNodes" in bean: - decom_node_dict = yaml.safe_load(bean["DecomNodes"]) - self.hadoop_namenode_metrics["NameNodeInfo"]["DecomNodeCount"].add_metric([self.cluster, self.target], len(decom_node_dict)) - for node, info in decom_node_dict.items(): - label = [self.cluster, node, info["xferaddr"], self.target] - items = ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"] - for item in items: - value = info[item] if item in info else 0 - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - key = "DecomNodes-" + item - self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) - elif "EnteringMaintenanceNodes" in metric and "EnteringMaintenanceNodes" in bean: - 
node_dict = yaml.safe_load(bean["EnteringMaintenanceNodes"]) - self.hadoop_namenode_metrics["NameNodeInfo"]["MaintenanceNodeCount"].add_metric([self.cluster, self.target], len(node_dict)) - for node, info in node_dict.items(): - label = [self.cluster, node, info["xferaddr"], self.target] - items = ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"] - for item in items: - value = info[item] if item in info else 0 - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - key = "EnteringMaintenanceNodes-" + item - self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) - elif "CorruptFiles" in metric and "CorruptFiles" in bean: - file_list = yaml.safe_load(bean["CorruptFiles"]) - label = [self.cluster, self.target] - self.hadoop_namenode_metrics["NameNodeInfo"]["CorruptFiles"].add_metric(label, len(file_list)) - elif "NodeUsage" in metric and "NodeUsage" in bean: - node_usage_dict = yaml.safe_load(bean["NodeUsage"])["nodeUsage"] - label = [self.cluster, self.target] - items = ["min", "median", "max", "stdDev"] - for item in items: - value = node_usage_dict[item] if item in node_usage_dict else 0 - value = float(value.strip("%")) - item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() - key = "NodeUsage-" + item - self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) - elif "SoftwareVersion" in metric and "SoftwareVersion" in bean: - label = [self.cluster, bean["SoftwareVersion"], self.target] - self.hadoop_namenode_metrics["NameNodeInfo"]["SoftwareVersion"].add_metric(label, 0) - elif "Safemode" in metric and "Safemode" in bean: - label = [self.cluster, self.target] - self.hadoop_namenode_metrics["NameNodeInfo"]["Safemode"].add_metric(label, 0 if metric in bean and bean[metric] == "" else 1) - else: - label = [self.cluster, self.target] - self.hadoop_namenode_metrics['NameNodeInfo'][metric].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) - - def get_metrics(self, beans): - for i in range(len(beans)): - if 'NameNodeActivity' in beans[i]['name']: - self.get_nnactivity_metrics(beans[i]) - if 'StartupProgress' in beans[i]['name']: - self.get_startupprogress_metrics(beans[i]) - if 'FSNamesystem' in beans[i]['name'] and 'FSNamesystemState' not in beans[i]['name']: - self.get_fsnamesystem_metrics(beans[i]) - if 'FSNamesystemState' in beans[i]['name']: - self.get_fsnamesystem_state_metrics(beans[i]) - if 'RetryCache' in beans[i]['name']: - self.get_retrycache_metrics(beans[i]) - if 'NameNodeInfo' in beans[i]['name']: - self.get_nninfo_metrics(beans[i]) +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import yaml +import re + +from prometheus_client.core import GaugeMetricFamily + +from utils import get_module_logger +from common import MetricCollector, CommonMetricCollector +from scraper import ScrapeMetrics + +logger = get_module_logger(__name__) + + +class NameNodeMetricCollector(MetricCollector): + def __init__(self, cluster, urls): + MetricCollector.__init__(self, cluster, "hdfs", "namenode") + self.target = "-" + self.urls = urls + self.dns = set() + + self.hadoop_namenode_metrics = {} + for i in range(len(self.file_list)): + self.hadoop_namenode_metrics.setdefault(self.file_list[i], {}) + + self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "namenode") + + self.scrape_metrics = ScrapeMetrics(urls) + + def collect(self): + isSetup = False + beans_list = self.scrape_metrics.scrape() + for beans in beans_list: + if not isSetup: + 
self.common_metric_collector.setup_labels(beans) + self.setup_metrics_labels(beans) + isSetup = True + for i in range(len(beans)): + if 'tag.Hostname' in beans[i]: + self.target = beans[i]["tag.Hostname"] + break + self.hadoop_namenode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) + self.get_metrics(beans) + + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.hadoop_namenode_metrics: + for metric in self.hadoop_namenode_metrics[service]: + yield self.hadoop_namenode_metrics[service][metric] + + def setup_nnactivity_labels(self): + num_namenode_flag, avg_namenode_flag, ops_namenode_flag = 1, 1, 1 + for metric in self.metrics['NameNodeActivity']: + label = ["cluster", "method", "_target"] + if "NumOps" in metric: + if num_namenode_flag: + key = "MethodNumOps" + name = "_".join([self.prefix, "nnactivity_method_ops_total"]) + description = "Total number of the times the method is called." + self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, description, labels=label) + num_namenode_flag = 0 + else: + continue + elif "AvgTime" in metric: + if avg_namenode_flag: + key = "MethodAvgTime" + name = "_".join([self.prefix, "nnactivity_method_avg_time_milliseconds"]) + descripton = "Average turn around time of the method in milliseconds." + self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, descripton, labels=label) + avg_namenode_flag = 0 + else: + continue + elif ops_namenode_flag: + key = "Operations" + name = "_".join([self.prefix, "nnactivity_operations_total"]) + description = "Total number of each operation." + self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, description, labels=label) + ops_namenode_flag = 0 + + def setup_startupprogress_labels(self): + sp_count_flag, sp_elapsed_flag, sp_total_flag, sp_complete_flag = 1, 1, 1, 1 + for metric in self.metrics['StartupProgress']: + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + if "ElapsedTime" == metric: + key = "ElapsedTime" + name = "total_elapsed_time_milliseconds" + descriptions = "Total elapsed time in milliseconds." + elif "PercentComplete" == metric: + key = "PercentComplete" + name = "complete_rate" + descriptions = "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0)." + elif "Count" in metric: + if sp_count_flag: + sp_count_flag = 0 + key = "PhaseCount" + name = "phase_count" + descriptions = "Total number of steps completed in the phase." + else: + continue + elif "ElapsedTime" in metric: + if sp_elapsed_flag: + sp_elapsed_flag = 0 + key = "PhaseElapsedTime" + name = "phase_elapsed_time_milliseconds" + descriptions = "Total elapsed time in the phase in milliseconds." + else: + continue + elif "Total" in metric: + if sp_total_flag: + sp_total_flag = 0 + key = "PhaseTotal" + name = "phase_total" + descriptions = "Total number of steps in the phase." + else: + continue + elif "PercentComplete" in metric: + if sp_complete_flag: + sp_complete_flag = 0 + key = "PhasePercentComplete" + name = "phase_complete_rate" + descriptions = "Current rate completed in the phase (The max value is not 100 but 1.0)." 
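
The re.sub('([a-z0-9])([A-Z])', r'\1_\2', ...) idiom used throughout these collectors converts JMX CamelCase attribute names into Prometheus snake_case metric names. In isolation it behaves as follows; the sample inputs are illustrative:

import re

def to_snake_case(name):
    # Put "_" between a lowercase letter or digit and the uppercase letter that follows.
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()

assert to_snake_case("ElapsedTime") == "elapsed_time"
assert to_snake_case("PercentComplete") == "percent_complete"
assert to_snake_case("MemHeapUsedM") == "mem_heap_used_m"
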
+ else: + continue + else: + key = metric + name = snake_case + descriptions = self.metrics['StartupProgress'][metric] + label = ["cluster", "phase", "_target"] + name = "_".join([self.prefix, "startup_process", name]) + self.hadoop_namenode_metrics['StartupProgress'][key] = GaugeMetricFamily(name, descriptions, labels=label) + + def setup_fsnamesystem_labels(self): + cap_flag = 1 + for metric in self.metrics['FSNamesystem']: + if metric.startswith('Capacity'): + if cap_flag: + cap_flag = 0 + key = "capacity" + label = ["cluster", "mode"] + name = "capacity_bytes" + descriptions = "Current DataNodes capacity in each mode in bytes" + else: + continue + else: + key = metric + label = ["cluster"] + name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + descriptions = self.metrics['FSNamesystem'][metric] + label.append("_target") + name = "_".join([self.prefix, "fsname_system", name]) + self.hadoop_namenode_metrics['FSNamesystem'][key] = GaugeMetricFamily(name, descriptions, labels=label) + + def setup_fsnamesystem_state_labels(self): + num_flag = 1 + for metric in self.metrics['FSNamesystemState']: + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + if 'DataNodes' in metric: + if num_flag: + num_flag = 0 + key = "datanodes_num" + label = ["cluster", "state"] + descriptions = "Number of datanodes in each state" + else: + continue + else: + key = metric + label = ["cluster"] + descriptions = self.metrics['FSNamesystemState'][metric] + label.append("_target") + name = "_".join([self.prefix, "fsname_system_state", snake_case]) + self.hadoop_namenode_metrics['FSNamesystemState'][key] = GaugeMetricFamily(name, descriptions, labels=label) + + def setup_retrycache_labels(self): + cache_flag = 1 + for metric in self.metrics['RetryCache']: + if cache_flag: + cache_flag = 0 + key = "cache" + label = ["cluster", "mode", "_target"] + name = "_".join([self.prefix, "cache_total"]) + description = "Total number of RetryCache in each mode" + self.hadoop_namenode_metrics['RetryCache'][key] = GaugeMetricFamily(name, description, labels=label) + + def setup_nninfo_labels(self): + for metric in self.metrics['NameNodeInfo']: + if "LiveNodes" in metric: + name = "_".join([self.prefix, "nninfo_live_nodes_count"]) + description = "Count of live data node" + self.hadoop_namenode_metrics['NameNodeInfo']["LiveNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) + + label = ["cluster", "datanode", "infoAddr", "infoSecureAddr", "xferaddr", "version", "_target"] + items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks", + "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"] + for item in items: + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + name = "_".join([self.prefix, "nninfo_live_nodes", item]) + key = "LiveNodes-" + item + description = "Live node " + item + if item == "admin_state": + description += " 0: In Service, 1: Decommission In Progress, 2: Decommissioned" + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) + continue + elif "DeadNodes" in metric: + name = "_".join([self.prefix, "nninfo_dead_nodes_count"]) + description = "Count of dead data node" + self.hadoop_namenode_metrics['NameNodeInfo']["DeadNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) + + label = ["cluster", "datanode", "decommissioned", "xferaddr"] + name = "_".join([self.prefix, "nninfo_dead_nodes_last_contact"]) + 
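
Note that NameNodeInfo publishes LiveNodes, DeadNodes and similar attributes as JSON-encoded strings, which get_nninfo_metrics further down parses with yaml.safe_load (JSON is a subset of YAML). A minimal illustration with a made-up bean:

import yaml

bean = {"DeadNodes": '{"dn1.example.com": {"lastContact": 42, "decommissioned": false, "xferaddr": "10.0.0.1:50010"}}'}
for node, info in yaml.safe_load(bean["DeadNodes"]).items():
    # One gauge sample per dead datanode, labelled the way the collector does it.
    label = ["my-cluster", node, str(info["decommissioned"]), info["xferaddr"]]
    print("%s -> %s" % (label, info["lastContact"]))
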
key = "DeadNodes" + description = "Dead node last contact" + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) + continue + elif "DecomNodes" in metric: + name = "_".join([self.prefix, "nninfo_decom_nodes_count"]) + description = "Count of decommissioned data node" + self.hadoop_namenode_metrics['NameNodeInfo']["DecomNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) + + label = ["cluster", "datanode", "xferaddr", "_target"] + items = ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"] + for item in items: + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + name = "_".join([self.prefix, "nninfo_decom_nodes", item]) + key = "DecomNodes-" + item + description = "Decom Node " + item + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) + continue + elif "EnteringMaintenanceNodes" in metric: + name = "_".join([self.prefix, "nninfo_maintenance_nodes_count"]) + description = "Count of maintenance data node" + self.hadoop_namenode_metrics['NameNodeInfo']["MaintenanceNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"]) + + label = ["cluster", "datanode", "xferaddr", "_target"] + items = ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"] + for item in items: + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + name = "_".join([self.prefix, "nninfo_entering_maintenance_nodes", item]) + key = "EnteringMaintenanceNodes-" + item + description = "Entering maintenance node " + item + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) + continue + elif "CorruptFiles" in metric: + label = ["cluster", "_target"] + name = "_".join([self.prefix, "nninfo_corrupt_file_count"]) + key = "CorruptFiles" + description = "Corrupt file count" + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) + continue + elif "NodeUsage" in metric: + label = ["cluster", "_target"] + items = ["min", "median", "max", "stdDev"] + for item in items: + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + name = "_".join([self.prefix, "nninfo_node_usage", item]) + key = "NodeUsage-" + item + description = "Node usage " + item + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, description, labels=label) + continue + elif "SoftwareVersion" in metric: + label = ["cluster", "software_version"] + name = "_".join([self.prefix, "nninfo_software_version"]) + key = "SoftwareVersion" + elif "Safemode" in metric: + label = ["cluster"] + name = "_".join([self.prefix, "nninfo_safe_mode"]) + key = "Safemode" + else: + label = ["cluster"] + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, "nninfo", snake_case]) + key = metric + label.append("_target") + self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=label) + + def setup_metrics_labels(self, beans): + for i in range(len(beans)): + if 'NameNodeActivity' in beans[i]['name']: + self.setup_nnactivity_labels() + if 'StartupProgress' in beans[i]['name']: + self.setup_startupprogress_labels() + if 'FSNamesystem' in beans[i]['name']: + self.setup_fsnamesystem_labels() + if 'FSNamesystemState' in beans[i]['name']: + self.setup_fsnamesystem_state_labels() + if 'RetryCache' in beans[i]['name']: +
self.setup_retrycache_labels() + if "NameNodeInfo" in beans[i]['name']: + self.setup_nninfo_labels() + + def get_nnactivity_metrics(self, bean): + for metric in self.metrics['NameNodeActivity']: + if "NumOps" in metric: + method = metric.split('NumOps')[0] + key = "MethodNumOps" + elif "AvgTime" in metric: + method = metric.split('AvgTime')[0] + key = "MethodAvgTime" + else: + if "Ops" in metric: + method = metric.split('Ops')[0] + else: + method = metric + key = "Operations" + label = [self.cluster, method, self.target] + self.hadoop_namenode_metrics['NameNodeActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) + + def get_startupprogress_metrics(self, bean): + for metric in self.metrics['StartupProgress']: + if "Count" in metric: + key = "PhaseCount" + phase = metric.split("Count")[0] + elif "ElapsedTime" in metric and "ElapsedTime" != metric: + key = "PhaseElapsedTime" + phase = metric.split("ElapsedTime")[0] + elif "Total" in metric: + key = "PhaseTotal" + phase = metric.split("Total")[0] + elif "PercentComplete" in metric and "PercentComplete" != metric: + key = "PhasePercentComplete" + phase = metric.split("PercentComplete")[0] + else: + key = metric + phase = "-" + label = [self.cluster, phase, self.target] + self.hadoop_namenode_metrics['StartupProgress'][key].add_metric(label, bean[metric] if metric in bean else 0) + + def get_fsnamesystem_metrics(self, bean): + for metric in self.metrics['FSNamesystem']: + key = metric + if 'HAState' in metric: + label = [self.cluster] + if 'initializing' == bean['tag.HAState']: + value = 0.0 + elif 'active' == bean['tag.HAState']: + value = 1.0 + elif 'standby' == bean['tag.HAState']: + value = 2.0 + elif 'stopping' == bean['tag.HAState']: + value = 3.0 + else: + value = 9999 + label.append(self.target) + self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(label, value) + elif metric.startswith("Capacity"): + key = 'capacity' + mode = metric.split("Capacity")[1] + label = [self.cluster, mode] + label.append(self.target) + self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(label, bean[metric] if metric in bean else 0) + else: + label = [self.cluster] + label.append(self.target) + self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(label, bean[metric] if metric in bean else 0) + + def get_fsnamesystem_state_metrics(self, bean): + for metric in self.metrics['FSNamesystemState']: + label = [self.cluster] + key = metric + if 'FSState' in metric: + if 'Safemode' == bean['FSState']: + value = 0.0 + elif 'Operational' == bean['FSState']: + value = 1.0 + else: + value = 9999 + label.append(self.target) + self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, value) + elif "TotalSyncTimes" in metric: + label.append(self.target) + self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, float( + re.sub(r'\s', '', bean[metric])) if metric in bean and bean[metric] else 0) + elif "DataNodes" in metric: + key = 'datanodes_num' + state = metric.split("DataNodes")[0].split("Num")[1] + label = [self.cluster, state, self.target] + self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + else: + label.append(self.target) + self.hadoop_namenode_metrics['FSNamesystemState'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + + def get_retrycache_metrics(self, bean): + for metric in self.metrics['RetryCache']: + key = "cache" + label = [self.cluster, metric.split('Cache')[1], 
self.target] + self.hadoop_namenode_metrics['RetryCache'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + + def get_nninfo_metrics(self, bean): + for metric in self.metrics["NameNodeInfo"]: + if "LiveNodes" in metric and "LiveNodes" in bean: + live_node_dict = yaml.safe_load(bean["LiveNodes"]) + self.hadoop_namenode_metrics["NameNodeInfo"]["LiveNodeCount"].add_metric([self.cluster, self.target], len(live_node_dict)) + dns = set() + for node, info in live_node_dict.items(): + label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target] + items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks", + "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"] + dns.add("http://"+info["infoAddr"]+"/jmx") + for item in items: + value = info[item] if item in info else 0 + if item == "adminState": + if value == "In Service": + value = 0 + elif value == "Decommission In Progress": + value = 1 + else: # Decommissioned + value = 2 + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + key = "LiveNodes-" + item + self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) + self.dns = dns + elif "DeadNodes" in metric and "DeadNodes" in bean: + dead_node_dict = yaml.safe_load(bean["DeadNodes"]) + self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodeCount"].add_metric([self.cluster, self.target], len(dead_node_dict)) + for node, info in dead_node_dict.items(): + label = [self.cluster, node, str(info["decommissioned"]), info["xferaddr"], self.target] + value = info["lastContact"] + self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodes"].add_metric(label, value) + elif "DecomNodes" in metric and "DecomNodes" in bean: + decom_node_dict = yaml.safe_load(bean["DecomNodes"]) + self.hadoop_namenode_metrics["NameNodeInfo"]["DecomNodeCount"].add_metric([self.cluster, self.target], len(decom_node_dict)) + for node, info in decom_node_dict.items(): + label = [self.cluster, node, info["xferaddr"], self.target] + items = ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"] + for item in items: + value = info[item] if item in info else 0 + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + key = "DecomNodes-" + item + self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) + elif "EnteringMaintenanceNodes" in metric and "EnteringMaintenanceNodes" in bean: + node_dict = yaml.safe_load(bean["EnteringMaintenanceNodes"]) + self.hadoop_namenode_metrics["NameNodeInfo"]["MaintenanceNodeCount"].add_metric([self.cluster, self.target], len(node_dict)) + for node, info in node_dict.items(): + label = [self.cluster, node, info["xferaddr"], self.target] + items = ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"] + for item in items: + value = info[item] if item in info else 0 + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + key = "EnteringMaintenanceNodes-" + item + self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) + elif "CorruptFiles" in metric and "CorruptFiles" in bean: + file_list = yaml.safe_load(bean["CorruptFiles"]) + label = [self.cluster, self.target] + self.hadoop_namenode_metrics["NameNodeInfo"]["CorruptFiles"].add_metric(label, len(file_list)) + elif "NodeUsage" in metric and "NodeUsage" in bean: + node_usage_dict = yaml.safe_load(bean["NodeUsage"])["nodeUsage"] + label = [self.cluster, self.target] + items 
= ["min", "median", "max", "stdDev"] + for item in items: + value = node_usage_dict[item] if item in node_usage_dict else 0 + value = float(value.strip("%")) + item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() + key = "NodeUsage-" + item + self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) + elif "SoftwareVersion" in metric and "SoftwareVersion" in bean: + label = [self.cluster, bean["SoftwareVersion"], self.target] + self.hadoop_namenode_metrics["NameNodeInfo"]["SoftwareVersion"].add_metric(label, 0) + elif "Safemode" in metric and "Safemode" in bean: + label = [self.cluster, self.target] + self.hadoop_namenode_metrics["NameNodeInfo"]["Safemode"].add_metric(label, 0 if metric in bean and bean[metric] == "" else 1) + else: + label = [self.cluster, self.target] + self.hadoop_namenode_metrics['NameNodeInfo'][metric].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + + def get_metrics(self, beans): + for i in range(len(beans)): + if 'NameNodeActivity' in beans[i]['name']: + self.get_nnactivity_metrics(beans[i]) + if 'StartupProgress' in beans[i]['name']: + self.get_startupprogress_metrics(beans[i]) + if 'FSNamesystem' in beans[i]['name'] and 'FSNamesystemState' not in beans[i]['name']: + self.get_fsnamesystem_metrics(beans[i]) + if 'FSNamesystemState' in beans[i]['name']: + self.get_fsnamesystem_state_metrics(beans[i]) + if 'RetryCache' in beans[i]['name']: + self.get_retrycache_metrics(beans[i]) + if 'NameNodeInfo' in beans[i]['name']: + self.get_nninfo_metrics(beans[i]) diff --git a/metrics/common/JvmMetrics.json b/metrics/common/JvmMetrics.json index 57f961c..c60adce 100644 --- a/metrics/common/JvmMetrics.json +++ b/metrics/common/JvmMetrics.json @@ -1,28 +1,28 @@ -{ - "MemNonHeapUsedM": "Current non-heap memory used in MB.", - "MemNonHeapCommittedM": "Current non-heap memory committed in MB.", - "MemNonHeapMaxM": "Max non-heap memory size in MB.", - "MemHeapUsedM": "Current heap memory used in MB.", - "MemHeapCommittedM": "Current heap memory committed in MB.", - "MemHeapMaxM": "Max heap memory size in MB.", - "MemMaxM": "Max memory size in MB.", - "ThreadsNew": "Current number of NEW threads.", - "ThreadsRunnable": "Current number of RUNNABLE threads.", - "ThreadsBlocked": "Current number of BLOCKED threads.", - "ThreadsWaiting": "Current number of WAITING threads.", - "ThreadsTimedWaiting": "Current number of TIMED_WAITING threads.", - "ThreadsTerminated": "Current number of TERMINATED threads.", - "GcCount": "Total number of Gc count", - "GcTimeMillis": "Total GC time in msec.", - "GcCountParNew": "ParNew GC count.", - "GcTimeMillisParNew": "ParNew GC time in msec.", - "GcCountConcurrentMarkSweep": "ConcurrentMarkSweep GC count.", - "GcTimeMillisConcurrentMarkSweep": "ConcurrentMarkSweep GC time in msec.", - "GcNumWarnThresholdExceeded": "Number of times that the GC warn threshold is exceeded.", - "GcNumInfoThresholdExceeded": "Number of times that the GC info threshold is exceeded.", - "GcTotalExtraSleepTime": "Total GC extra sleep time in msec.", - "LogFatal": "Total number of FATAL logs.", - "LogError": "Total number of ERROR logs.", - "LogWarn": "Total number of WARN logs.", - "LogInfo": "Total number of INFO logs." 
-} +{ + "MemNonHeapUsedM": "Current non-heap memory used in MB.", + "MemNonHeapCommittedM": "Current non-heap memory committed in MB.", + "MemNonHeapMaxM": "Max non-heap memory size in MB.", + "MemHeapUsedM": "Current heap memory used in MB.", + "MemHeapCommittedM": "Current heap memory committed in MB.", + "MemHeapMaxM": "Max heap memory size in MB.", + "MemMaxM": "Max memory size in MB.", + "ThreadsNew": "Current number of NEW threads.", + "ThreadsRunnable": "Current number of RUNNABLE threads.", + "ThreadsBlocked": "Current number of BLOCKED threads.", + "ThreadsWaiting": "Current number of WAITING threads.", + "ThreadsTimedWaiting": "Current number of TIMED_WAITING threads.", + "ThreadsTerminated": "Current number of TERMINATED threads.", + "GcCount": "Total number of Gc count", + "GcTimeMillis": "Total GC time in msec.", + "GcCountParNew": "ParNew GC count.", + "GcTimeMillisParNew": "ParNew GC time in msec.", + "GcCountConcurrentMarkSweep": "ConcurrentMarkSweep GC count.", + "GcTimeMillisConcurrentMarkSweep": "ConcurrentMarkSweep GC time in msec.", + "GcNumWarnThresholdExceeded": "Number of times that the GC warn threshold is exceeded.", + "GcNumInfoThresholdExceeded": "Number of times that the GC info threshold is exceeded.", + "GcTotalExtraSleepTime": "Total GC extra sleep time in msec.", + "LogFatal": "Total number of FATAL logs.", + "LogError": "Total number of ERROR logs.", + "LogWarn": "Total number of WARN logs.", + "LogInfo": "Total number of INFO logs." +} diff --git a/metrics/common/MetricsSystem.json b/metrics/common/MetricsSystem.json index 8c2e6ae..9b09717 100644 --- a/metrics/common/MetricsSystem.json +++ b/metrics/common/MetricsSystem.json @@ -1,15 +1,15 @@ -{ - "NumActiveSources": "Current number of active metrics sources.", - "NumAllSources": "Total number of metrics sources.", - "NumActiveSinks": "Current number of active sinks.", - "NumAllSinks": "Total number of sinks (BUT usually less than NumActiveSinks, see HADOOP-9946).", - "SnapshotNumOps": "Total number of operations to snapshot statistics from a metrics source.", - "SnapshotAvgTime": "Average time in milliseconds to snapshot statistics from a metrics source.", - "PublishNumOps": "Total number of operations to publish statistics to a sink.", - "PublishAvgTime": "Average time in milliseconds to publish statistics to a sink.", - "DroppedPubAll": "Total number of dropped publishes.", - "Sink_instanceNumOps": "Total number of sink operations for the instance.", - "Sink_instanceAvgTime": "Average time in milliseconds of sink operations for the instance.", - "Sink_instanceDropped": "Total number of dropped sink operations for the instance.", - "Sink_instanceQsize": "Current queue length of sink operations (BUT always set to 0 because nothing to increment this metrics, see HADOOP-9941)." 
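
Each file under metrics/ is a flat map from a JMX attribute name to its Prometheus help text; the collectors read these maps and create one metric family per entry. A simplified sketch of that flow follows; the direct json.load call and the hadoop_common_ prefix are stand-ins for the exporter's own helpers (utils.read_json_file and the per-service prefix):

import json
import re
from prometheus_client.core import GaugeMetricFamily

with open("metrics/common/JvmMetrics.json") as fh:
    definitions = json.load(fh)

families = {}
for attr, help_text in definitions.items():
    # Same CamelCase -> snake_case conversion the collectors use.
    snake = re.sub('([a-z0-9])([A-Z])', r'\1_\2', attr).lower()
    families[attr] = GaugeMetricFamily("hadoop_common_" + snake, help_text, labels=["cluster", "_target"])
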
+{ + "NumActiveSources": "Current number of active metrics sources.", + "NumAllSources": "Total number of metrics sources.", + "NumActiveSinks": "Current number of active sinks.", + "NumAllSinks": "Total number of sinks (BUT usually less than NumActiveSinks, see HADOOP-9946).", + "SnapshotNumOps": "Total number of operations to snapshot statistics from a metrics source.", + "SnapshotAvgTime": "Average time in milliseconds to snapshot statistics from a metrics source.", + "PublishNumOps": "Total number of operations to publish statistics to a sink.", + "PublishAvgTime": "Average time in milliseconds to publish statistics to a sink.", + "DroppedPubAll": "Total number of dropped publishes.", + "Sink_instanceNumOps": "Total number of sink operations for the instance.", + "Sink_instanceAvgTime": "Average time in milliseconds of sink operations for the instance.", + "Sink_instanceDropped": "Total number of dropped sink operations for the instance.", + "Sink_instanceQsize": "Current queue length of sink operations (BUT always set to 0 because nothing to increment this metrics, see HADOOP-9941)." } \ No newline at end of file diff --git a/metrics/common/OperatingSystem.json b/metrics/common/OperatingSystem.json index e86a103..364419b 100644 --- a/metrics/common/OperatingSystem.json +++ b/metrics/common/OperatingSystem.json @@ -1,14 +1,14 @@ -{ - "OpenFileDescriptorCount": "Total number of open file descriptor", - "MaxFileDescriptorCount": "Total number of max file descriptor", - "CommittedVirtualMemorySize": "The size of committed virtual memory in bytes", - "TotalSwapSpaceSize": "The size of total swap space in bytes", - "FreeSwapSpaceSize": "The size of free swap space in bytes", - "ProcessCpuTime": "Total process cpu time in microseconds", - "FreePhysicalMemorySize": "The size of free physical memory in bytes", - "TotalPhysicalMemorySize": "The size of total physical memory in bytes", - "SystemCpuLoad": "Average of system CPU load", - "ProcessCpuLoad": "Average of process CPU load", - "SystemLoadAverage": "Average of system load", - "AvailableProcessors": "Total number of available processors", +{ + "OpenFileDescriptorCount": "Total number of open file descriptor", + "MaxFileDescriptorCount": "Total number of max file descriptor", + "CommittedVirtualMemorySize": "The size of committed virtual memory in bytes", + "TotalSwapSpaceSize": "The size of total swap space in bytes", + "FreeSwapSpaceSize": "The size of free swap space in bytes", + "ProcessCpuTime": "Total process cpu time in microseconds", + "FreePhysicalMemorySize": "The size of free physical memory in bytes", + "TotalPhysicalMemorySize": "The size of total physical memory in bytes", + "SystemCpuLoad": "Average of system CPU load", + "ProcessCpuLoad": "Average of process CPU load", + "SystemLoadAverage": "Average of system load", + "AvailableProcessors": "Total number of available processors", } \ No newline at end of file diff --git a/metrics/common/RpcActivity.json b/metrics/common/RpcActivity.json index 1ce5ea1..432d58d 100644 --- a/metrics/common/RpcActivity.json +++ b/metrics/common/RpcActivity.json @@ -1,16 +1,16 @@ -{ - "ReceivedBytes": "Total number of received bytes", - "SentBytes": "Total number of sent bytes", - "RpcQueueTimeNumOps": "Total number of RPC calls", - "RpcQueueTimeAvgTime": "Average queue time in milliseconds", - "RpcProcessingTimeNumOps": "Total number of RPC calls (same to RpcQueueTimeNumOps)", - "RpcProcessingTimeAvgTime": "Average Processing time in milliseconds", - "RpcAuthenticationFailures": "Total number 
of authentication failures", - "RpcAuthenticationSuccesses": "Total number of authentication successes", - "RpcAuthorizationFailures": "Total number of authorization failures", - "RpcAuthorizationSuccesses": "Total number of authorization successes", - "RpcClientBackoff": "Total number of RPC client back off", - "RpcSlowCalls": "Total number of RPC slow calls", - "NumOpenConnections": "Current number of open connections", - "CallQueueLength": "Current length of the call queue" -} +{ + "ReceivedBytes": "Total number of received bytes", + "SentBytes": "Total number of sent bytes", + "RpcQueueTimeNumOps": "Total number of RPC calls", + "RpcQueueTimeAvgTime": "Average queue time in milliseconds", + "RpcProcessingTimeNumOps": "Total number of RPC calls (same to RpcQueueTimeNumOps)", + "RpcProcessingTimeAvgTime": "Average Processing time in milliseconds", + "RpcAuthenticationFailures": "Total number of authentication failures", + "RpcAuthenticationSuccesses": "Total number of authentication successes", + "RpcAuthorizationFailures": "Total number of authorization failures", + "RpcAuthorizationSuccesses": "Total number of authorization successes", + "RpcClientBackoff": "Total number of RPC client back off", + "RpcSlowCalls": "Total number of RPC slow calls", + "NumOpenConnections": "Current number of open connections", + "CallQueueLength": "Current length of the call queue" +} diff --git a/metrics/common/RpcDetailedActivity.json b/metrics/common/RpcDetailedActivity.json index c325e6f..85f1d97 100644 --- a/metrics/common/RpcDetailedActivity.json +++ b/metrics/common/RpcDetailedActivity.json @@ -1,4 +1,4 @@ -{ - "methodNumOps": "Total number of the times the method is called", - "methodAvgTime": "Average turn around time of the method in milliseconds" +{ + "methodNumOps": "Total number of the times the method is called", + "methodAvgTime": "Average turn around time of the method in milliseconds" } \ No newline at end of file diff --git a/metrics/common/Runtime.json b/metrics/common/Runtime.json index 05bdd53..267db55 100644 --- a/metrics/common/Runtime.json +++ b/metrics/common/Runtime.json @@ -1,4 +1,4 @@ -{ - "Uptime": "components uptime in milliseconds", - "StartTime": "components start time in milliseconds" +{ + "Uptime": "components uptime in milliseconds", + "StartTime": "components start time in milliseconds" } \ No newline at end of file diff --git a/metrics/common/UgiMetrics.json b/metrics/common/UgiMetrics.json index 951a270..920bb05 100644 --- a/metrics/common/UgiMetrics.json +++ b/metrics/common/UgiMetrics.json @@ -1,10 +1,10 @@ -{ - "LoginSuccessNumOps": "Total number of successful kerberos logins.", - "LoginSuccessAvgTime": "Average time for successful kerberos logins in milliseconds.", - "LoginFailureNumOps": "Total number of failed kerberos logins.", - "LoginFailureAvgTime": "Average time for failed kerberos logins in milliseconds.", - "GetGroupsNumOps": "Total number of group resolutions.", - "GetGroupsAvgTime": "Average time for group resolution in milliseconds.", - "RenewalFailuresTotal": "Total number of renewal failures.", - "RenewalFailures": "Current number of renewal failures." 
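
The beans consumed by all of these collectors come from each Hadoop daemon's /jmx servlet, which returns a JSON document of the form {"beans": [...]}. A minimal stand-in for what ScrapeMetrics.scrape() must produce might look like this; the URL and timeout are illustrative, and the real class fans out over many URLs and handles failures:

import requests

def scrape_jmx(url):
    # e.g. "http://nn1.example.com:50070/jmx"
    resp = requests.get(url, timeout=5)
    resp.raise_for_status()
    return resp.json().get("beans", [])
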
+{ + "LoginSuccessNumOps": "Total number of successful kerberos logins.", + "LoginSuccessAvgTime": "Average time for successful kerberos logins in milliseconds.", + "LoginFailureNumOps": "Total number of failed kerberos logins.", + "LoginFailureAvgTime": "Average time for failed kerberos logins in milliseconds.", + "GetGroupsNumOps": "Total number of group resolutions.", + "GetGroupsAvgTime": "Average time for group resolution in milliseconds.", + "RenewalFailuresTotal": "Total number of renewal failures.", + "RenewalFailures": "Current number of renewal failures." } \ No newline at end of file diff --git a/metrics/datanode/DataNodeActivity.json b/metrics/datanode/DataNodeActivity.json index 404f8c3..55ff752 100644 --- a/metrics/datanode/DataNodeActivity.json +++ b/metrics/datanode/DataNodeActivity.json @@ -1,70 +1,70 @@ -{ - "BytesWritten": "Total number of bytes written to DataNode", - "BytesRead": "Total number of bytes read from DataNode", - "TotalWriteTime": "Total number of milliseconds spent on write operation", - "TotalReadTime": "Total number of milliseconds spent on read operation", - "BlocksWritten": "Total number of blocks written to DataNode", - "BlocksRead": "Total number of blocks read from DataNode", - "BlocksReplicated": "Total number of blocks replicated", - "BlocksRemoved": "Total number of blocks removed", - "BlocksVerified": "Total number of blocks verified", - "BlockVerificationFailures": "Total number of verifications failures", - "BlocksCached": "Total number of blocks cached", - "BlocksUncached": "Total number of blocks uncached", - "ReadsFromLocalClient": "Total number of read operations from local client", - "ReadsFromRemoteClient": "Total number of read operations from remote client", - "WritesFromLocalClient": "Total number of write operations from local client", - "WritesFromRemoteClient": "Total number of write operations from remote client", - "BlocksGetLocalPathInfo": "Total number of operations to get local path names of blocks", - "RemoteBytesRead": "Number of bytes read by remote clients", - "RemoteBytesWritten": "Number of bytes written by remote clients", - "RamDiskBlocksWrite": "Total number of blocks written to memory", - "RamDiskBlocksWriteFallback": "Total number of blocks written to memory but not satisfied (failed-over to disk)", - "RamDiskBytesWrite": "Total number of bytes written to memory", - "RamDiskBlocksReadHits": "Total number of times a block in memory was read", - "RamDiskBlocksEvicted": "Total number of blocks evicted in memory", - "RamDiskBlocksEvictedWithoutRead": "Total number of blocks evicted in memory without ever being read from memory", - "RamDiskBlocksEvictionWindowMsNumOps": "Number of blocks evicted in memory", - "RamDiskBlocksEvictionWindowMsAvgTime": "Average time of blocks in memory before being evicted in milliseconds", - "RamDiskBlocksLazyPersisted": "Total number of blocks written to disk by lazy writer", - "RamDiskBlocksDeletedBeforeLazyPersisted": "Total number of blocks deleted by application before being persisted to disk", - "RamDiskBytesLazyPersisted": "Total number of bytes written to disk by lazy writer", - "RamDiskBlocksLazyPersistWindowMsNumOps": "Number of blocks written to disk by lazy writer", - "RamDiskBlocksLazyPersistWindowMsAvgTime": "Average time of blocks written to disk by lazy writer in milliseconds", - "FsyncCount": "Total number of fsync", - "VolumeFailures": "Total number of volume failures occurred", - "DatanodeNetworkErrors" : "Total number of datanode network error", - 
"DataNodeActiveXceiversCount" : "Total number of datanode active Xceivers", - "ReadBlockOpNumOps": "Total number of read operations", - "ReadBlockOpAvgTime": "Average time of read operations in milliseconds", - "WriteBlockOpNumOps": "Total number of write operations", - "WriteBlockOpAvgTime": "Average time of write operations in milliseconds", - "BlockChecksumOpNumOps": "Total number of blockChecksum operations", - "BlockChecksumOpAvgTime": "Average time of blockChecksum operations in milliseconds", - "CopyBlockOpNumOps": "Total number of block copy operations", - "CopyBlockOpAvgTime": "Average time of block copy operations in milliseconds", - "ReplaceBlockOpNumOps": "Total number of block replace operations", - "ReplaceBlockOpAvgTime": "Average time of block replace operations in milliseconds", - "HeartbeatsNumOps": "Total number of heartbeats", - "HeartbeatsAvgTime": "Average heartbeat time in milliseconds", - "HeartbeatsTotalNumOps": "Total number of heartbeats which is a duplicate of HeartbeatsNumOps", - "HeartbeatsTotalAvgTime": "Average total heartbeat time in milliseconds", - "LifelinesNumOps": "Total number of lifeline messages", - "LifelinesAvgTime": "Average lifeline message processing time in milliseconds", - "BlockReportsNumOps": "Total number of block report operations", - "BlockReportsAvgTime": "Average time of block report operations in milliseconds", - "IncrementalBlockReportsNumOps": "Total number of incremental block report operations", - "IncrementalBlockReportsAvgTime": "Average time of incremental block report operations in milliseconds", - "CacheReportsNumOps": "Total number of cache report operations", - "CacheReportsAvgTime": "Average time of cache report operations in milliseconds", - "PacketAckRoundTripTimeNanosNumOps": "Total number of ack round trip", - "PacketAckRoundTripTimeNanosAvgTime": "Average time from ack send to receive minus the downstream ack time in nanoseconds", - "FlushNanosNumOps": "Total number of flushes", - "FlushNanosAvgTime": "Average flush time in nanoseconds", - "FsyncNanosNumOps": "Total number of fsync", - "FsyncNanosAvgTime": "Average fsync time in nanoseconds", - "SendDataPacketBlockedOnNetworkNanosNumOps": "Total number of sending packets", - "SendDataPacketBlockedOnNetworkNanosAvgTime": "Average waiting time of sending packets in nanoseconds", - "SendDataPacketTransferNanosNumOps": "Total number of sending packets", - "SendDataPacketTransferNanosAvgTime": "Average transfer time of sending packets in nanoseconds" +{ + "BytesWritten": "Total number of bytes written to DataNode", + "BytesRead": "Total number of bytes read from DataNode", + "TotalWriteTime": "Total number of milliseconds spent on write operation", + "TotalReadTime": "Total number of milliseconds spent on read operation", + "BlocksWritten": "Total number of blocks written to DataNode", + "BlocksRead": "Total number of blocks read from DataNode", + "BlocksReplicated": "Total number of blocks replicated", + "BlocksRemoved": "Total number of blocks removed", + "BlocksVerified": "Total number of blocks verified", + "BlockVerificationFailures": "Total number of verifications failures", + "BlocksCached": "Total number of blocks cached", + "BlocksUncached": "Total number of blocks uncached", + "ReadsFromLocalClient": "Total number of read operations from local client", + "ReadsFromRemoteClient": "Total number of read operations from remote client", + "WritesFromLocalClient": "Total number of write operations from local client", + "WritesFromRemoteClient": "Total number of write 
operations from remote client", + "BlocksGetLocalPathInfo": "Total number of operations to get local path names of blocks", + "RemoteBytesRead": "Number of bytes read by remote clients", + "RemoteBytesWritten": "Number of bytes written by remote clients", + "RamDiskBlocksWrite": "Total number of blocks written to memory", + "RamDiskBlocksWriteFallback": "Total number of blocks written to memory but not satisfied (failed-over to disk)", + "RamDiskBytesWrite": "Total number of bytes written to memory", + "RamDiskBlocksReadHits": "Total number of times a block in memory was read", + "RamDiskBlocksEvicted": "Total number of blocks evicted in memory", + "RamDiskBlocksEvictedWithoutRead": "Total number of blocks evicted in memory without ever being read from memory", + "RamDiskBlocksEvictionWindowMsNumOps": "Number of blocks evicted in memory", + "RamDiskBlocksEvictionWindowMsAvgTime": "Average time of blocks in memory before being evicted in milliseconds", + "RamDiskBlocksLazyPersisted": "Total number of blocks written to disk by lazy writer", + "RamDiskBlocksDeletedBeforeLazyPersisted": "Total number of blocks deleted by application before being persisted to disk", + "RamDiskBytesLazyPersisted": "Total number of bytes written to disk by lazy writer", + "RamDiskBlocksLazyPersistWindowMsNumOps": "Number of blocks written to disk by lazy writer", + "RamDiskBlocksLazyPersistWindowMsAvgTime": "Average time of blocks written to disk by lazy writer in milliseconds", + "FsyncCount": "Total number of fsync", + "VolumeFailures": "Total number of volume failures occurred", + "DatanodeNetworkErrors" : "Total number of datanode network error", + "DataNodeActiveXceiversCount" : "Total number of datanode active Xceivers", + "ReadBlockOpNumOps": "Total number of read operations", + "ReadBlockOpAvgTime": "Average time of read operations in milliseconds", + "WriteBlockOpNumOps": "Total number of write operations", + "WriteBlockOpAvgTime": "Average time of write operations in milliseconds", + "BlockChecksumOpNumOps": "Total number of blockChecksum operations", + "BlockChecksumOpAvgTime": "Average time of blockChecksum operations in milliseconds", + "CopyBlockOpNumOps": "Total number of block copy operations", + "CopyBlockOpAvgTime": "Average time of block copy operations in milliseconds", + "ReplaceBlockOpNumOps": "Total number of block replace operations", + "ReplaceBlockOpAvgTime": "Average time of block replace operations in milliseconds", + "HeartbeatsNumOps": "Total number of heartbeats", + "HeartbeatsAvgTime": "Average heartbeat time in milliseconds", + "HeartbeatsTotalNumOps": "Total number of heartbeats which is a duplicate of HeartbeatsNumOps", + "HeartbeatsTotalAvgTime": "Average total heartbeat time in milliseconds", + "LifelinesNumOps": "Total number of lifeline messages", + "LifelinesAvgTime": "Average lifeline message processing time in milliseconds", + "BlockReportsNumOps": "Total number of block report operations", + "BlockReportsAvgTime": "Average time of block report operations in milliseconds", + "IncrementalBlockReportsNumOps": "Total number of incremental block report operations", + "IncrementalBlockReportsAvgTime": "Average time of incremental block report operations in milliseconds", + "CacheReportsNumOps": "Total number of cache report operations", + "CacheReportsAvgTime": "Average time of cache report operations in milliseconds", + "PacketAckRoundTripTimeNanosNumOps": "Total number of ack round trip", + "PacketAckRoundTripTimeNanosAvgTime": "Average time from ack send to receive minus 
the downstream ack time in nanoseconds",
+ "FlushNanosNumOps": "Total number of flushes",
+ "FlushNanosAvgTime": "Average flush time in nanoseconds",
+ "FsyncNanosNumOps": "Total number of fsync",
+ "FsyncNanosAvgTime": "Average fsync time in nanoseconds",
+ "SendDataPacketBlockedOnNetworkNanosNumOps": "Total number of sending packets",
+ "SendDataPacketBlockedOnNetworkNanosAvgTime": "Average waiting time of sending packets in nanoseconds",
+ "SendDataPacketTransferNanosNumOps": "Total number of sending packets",
+ "SendDataPacketTransferNanosAvgTime": "Average transfer time of sending packets in nanoseconds"
 }
\ No newline at end of file
diff --git a/metrics/datanode/DataNodeInfo.json b/metrics/datanode/DataNodeInfo.json
index 4aa27c6..d837254 100644
--- a/metrics/datanode/DataNodeInfo.json
+++ b/metrics/datanode/DataNodeInfo.json
@@ -1,4 +1,4 @@
-{
- "VolumeInfo": "Volume infomation in each path and in each mode",
- "XceiverCount": "Total number of datanode Xceivers"
-}
+{
+ "VolumeInfo": "Volume information in each path and in each mode",
+ "XceiverCount": "Total number of datanode Xceivers"
+}
diff --git a/metrics/datanode/FSDatasetState.json b/metrics/datanode/FSDatasetState.json
index 5939208..aeb14af 100644
--- a/metrics/datanode/FSDatasetState.json
+++ b/metrics/datanode/FSDatasetState.json
@@ -1,13 +1,13 @@
-{
- "Capacity" : "Current raw capacity of DataNode in bytes",
- "DfsUsed" : "Current space used by DataNodes for DFS purposes in bytes",
- "Remaining" : "Current remaining capacity in bytes",
- "NumFailedVolumes" : "Total number of failed volumes",
- "LastVolumeFailureDate" : "Last time of volume failures",
- "EstimatedCapacityLostTotal" : "An estimate of the total capacity lost due to volume failures",
- "CacheUsed" : "Total number of cache used",
- "CacheCapacity" : "Current raw capacity of cache in bytes",
- "NumBlocksCached" : "Total number of blocks cached",
- "NumBlocksFailedToCache" : "Total number of blocks failed to cache",
- "NumBlocksFailedToUnCache" : "Total number of blocks failed to uncached"
+{
+ "Capacity" : "Current raw capacity of DataNode in bytes",
+ "DfsUsed" : "Current space used by DataNodes for DFS purposes in bytes",
+ "Remaining" : "Current remaining capacity in bytes",
+ "NumFailedVolumes" : "Total number of failed volumes",
+ "LastVolumeFailureDate" : "Last time of volume failures",
+ "EstimatedCapacityLostTotal" : "An estimate of the total capacity lost due to volume failures",
+ "CacheUsed" : "Total amount of cache used in bytes",
+ "CacheCapacity" : "Current raw capacity of cache in bytes",
+ "NumBlocksCached" : "Total number of blocks cached",
+ "NumBlocksFailedToCache" : "Total number of blocks that failed to cache",
+ "NumBlocksFailedToUnCache" : "Total number of blocks that failed to uncache"
 }
\ No newline at end of file
diff --git a/metrics/journalnode/JournalNode.json b/metrics/journalnode/JournalNode.json
index 6a90ba6..adc2693 100644
--- a/metrics/journalnode/JournalNode.json
+++ b/metrics/journalnode/JournalNode.json
@@ -1,29 +1,29 @@
-{
- "Syncs60sNumOps": "Number of sync operations (1 minute granularity)",
- "Syncs60s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 minute granularity)",
- "Syncs60s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 minute granularity)",
- "Syncs60s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 minute granularity)",
- "Syncs60s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1
minute granularity)", - "Syncs60s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 minute granularity)", - "Syncs300sNumOps": "Number of sync operations (5 minutes granularity)", - "Syncs300s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (5 minutes granularity)", - "Syncs300s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (5 minutes granularity)", - "Syncs300s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (5 minutes granularity)", - "Syncs300s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (5 minutes granularity)", - "Syncs300s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (5 minutes granularity)", - "Syncs3600sNumOps": "Number of sync operations (1 hour granularity)", - "Syncs3600s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 hour granularity)", - "Syncs3600s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 hour granularity)", - "Syncs3600s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 hour granularity)", - "Syncs3600s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 hour granularity)", - "Syncs3600s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 hour granularity)", - "BatchesWritten": "Total number of batches written since startup", - "TxnsWritten": "Total number of transactions written since startup", - "BytesWritten": "Total number of bytes written since startup", - "BatchesWrittenWhileLagging": "Total number of batches written where this node was lagging", - "LastWriterEpoch": "Current writer’s epoch number", - "CurrentLagTxns": "The number of transactions that this JournalNode is lagging", - "LastWrittenTxId": "The highest transaction id stored on this JournalNode", - "LastPromisedEpoch": "The last epoch number which this node has promised not to accept any lower epoch, or 0 if no promises have been made", - "LastJournalTimestamp": "The timestamp of last successfully written transaction" +{ + "Syncs60sNumOps": "Number of sync operations (1 minute granularity)", + "Syncs60s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs300sNumOps": "Number of sync operations (5 minutes granularity)", + "Syncs300s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (5 
minutes granularity)", + "Syncs3600sNumOps": "Number of sync operations (1 hour granularity)", + "Syncs3600s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 hour granularity)", + "BatchesWritten": "Total number of batches written since startup", + "TxnsWritten": "Total number of transactions written since startup", + "BytesWritten": "Total number of bytes written since startup", + "BatchesWrittenWhileLagging": "Total number of batches written where this node was lagging", + "LastWriterEpoch": "Current writer’s epoch number", + "CurrentLagTxns": "The number of transactions that this JournalNode is lagging", + "LastWrittenTxId": "The highest transaction id stored on this JournalNode", + "LastPromisedEpoch": "The last epoch number which this node has promised not to accept any lower epoch, or 0 if no promises have been made", + "LastJournalTimestamp": "The timestamp of last successfully written transaction" } \ No newline at end of file diff --git a/metrics/namenode/FSNamesystem.json b/metrics/namenode/FSNamesystem.json index 2d7f5a5..6ee64c9 100644 --- a/metrics/namenode/FSNamesystem.json +++ b/metrics/namenode/FSNamesystem.json @@ -1,36 +1,36 @@ -{ - "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", - "MissingBlocks": "Current number of missing blocks", - "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", - "ExpiredHeartbeats": "Total number of expired heartbeats", - "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", - "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", - "LastWrittenTransactionId": "Last transaction ID written to the edit log", - "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", - "CapacityTotal": "Current raw capacity of DataNodes in bytes", - "CapacityUsed": "Current used capacity across all DataNodes in bytes", - "CapacityRemaining": "Current remaining capacity in bytes", - "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", - "TotalLoad": "Current number of connections", - "SnapshottableDirectories": "Current number of snapshottable directories", - "Snapshots": "Current number of snapshots", - "NumEncryptionZones": "Current number of encryption zones", - "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", - "BlocksTotal": "Current number of allocated blocks in the system", - "NumFilesUnderConstruction": "Current number of files under construction", - "NumActiveClients": "Current number of active clients holding lease", - "FilesTotal": "Current number of files and directories", - "PendingReplicationBlocks": "Current number of blocks pending to be replicated", - "UnderReplicatedBlocks": "Current number of blocks under replicated", - "CorruptBlocks": "Current number of blocks with corrupt replicas.", - "ScheduledReplicationBlocks": "Current number of blocks scheduled for 
replications", - "PendingDeletionBlocks": "Current number of blocks pending deletion", - "ExcessBlocks": "Current number of excess blocks", - "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", - "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", - "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", - "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0", - "BlockCapacity": "Current number of block capacity", - "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", - "TotalSyncCount": "Total number of sync operations performed by edit log" +{ + "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", + "MissingBlocks": "Current number of missing blocks", + "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", + "ExpiredHeartbeats": "Total number of expired heartbeats", + "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", + "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", + "LastWrittenTransactionId": "Last transaction ID written to the edit log", + "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", + "CapacityTotal": "Current raw capacity of DataNodes in bytes", + "CapacityUsed": "Current used capacity across all DataNodes in bytes", + "CapacityRemaining": "Current remaining capacity in bytes", + "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", + "TotalLoad": "Current number of connections", + "SnapshottableDirectories": "Current number of snapshottable directories", + "Snapshots": "Current number of snapshots", + "NumEncryptionZones": "Current number of encryption zones", + "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", + "BlocksTotal": "Current number of allocated blocks in the system", + "NumFilesUnderConstruction": "Current number of files under construction", + "NumActiveClients": "Current number of active clients holding lease", + "FilesTotal": "Current number of files and directories", + "PendingReplicationBlocks": "Current number of blocks pending to be replicated", + "UnderReplicatedBlocks": "Current number of blocks under replicated", + "CorruptBlocks": "Current number of blocks with corrupt replicas.", + "ScheduledReplicationBlocks": "Current number of blocks scheduled for replications", + "PendingDeletionBlocks": "Current number of blocks pending deletion", + "ExcessBlocks": "Current number of excess blocks", + "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", + "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", + "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", + "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. 
In active NameNode, set to 0", + "BlockCapacity": "Current number of block capacity", + "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", + "TotalSyncCount": "Total number of sync operations performed by edit log" } \ No newline at end of file diff --git a/metrics/namenode/FSNamesystemState.json b/metrics/namenode/FSNamesystemState.json index 4464b56..6b241c9 100644 --- a/metrics/namenode/FSNamesystemState.json +++ b/metrics/namenode/FSNamesystemState.json @@ -1,16 +1,16 @@ -{ - "FsLockQueueLength": "Filesystem lock queue length", - "MaxObjects": "Max objects", - "BlockDeletionStartTime": "Start time of block deletion", - "NumLiveDataNodes": "Number of datanodes which are currently live", - "NumDeadDataNodes": "Number of datanodes which are currently dead", - "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", - "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", - "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", - "NumStaleDataNodes": "Number of datanodes marked as content stale", - "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", - "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", - "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", - "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", - "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation" +{ + "FsLockQueueLength": "Filesystem lock queue length", + "MaxObjects": "Max objects", + "BlockDeletionStartTime": "Start time of block deletion", + "NumLiveDataNodes": "Number of datanodes which are currently live", + "NumDeadDataNodes": "Number of datanodes which are currently dead", + "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", + "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", + "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", + "NumStaleDataNodes": "Number of datanodes marked as content stale", + "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", + "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", + "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", + "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", + "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation" } \ No newline at end of file diff --git a/metrics/namenode/NameNodeActivity.json b/metrics/namenode/NameNodeActivity.json index d92896d..b5850fe 100644 --- a/metrics/namenode/NameNodeActivity.json +++ b/metrics/namenode/NameNodeActivity.json @@ -1,41 +1,41 @@ -{ - "CreateFileOps": "Total number of files created.", - "FilesCreated": "Total number of files and directories created by create or mkdir operations.", - "FilesAppended": "Total number of files appended.", - "GetBlockLocations": "Total number of getBlockLocations operations.", - "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", - "GetListingOps": "Total number of directory listing operations.", - "DeleteFileOps": "Total number of delete operations.", - 
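The HAState entry above maps a string state onto the numeric gauge values 0.0 through 3.0. A minimal sketch of one way to do that encoding, mirroring the NODE_STATE lookup table this patch uses later for NodeManager states; the "tag.HAState" key and the bean below are assumptions for illustration, not part of the patch:

HA_STATE = {"initializing": 0.0, "active": 1.0, "standby": 2.0, "stopping": 3.0}

def ha_state_value(bean):
    # bean: one dict from the "beans" array returned by a NameNode /jmx endpoint.
    return HA_STATE.get(bean.get("tag.HAState", ""), 0.0)

print(ha_state_value({"tag.HAState": "active"}))  # 1.0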
"FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", - "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", - "AddBlockOps": "Total number of addBlock operations succeeded.", - "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", - "CreateSymlinkOps": "Total number of createSymlink operations.", - "GetLinkTargetOps": "Total number of getLinkTarget operations.", - "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", - "AllowSnapshotOps": "Total number of allowSnapshot operations.", - "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", - "CreateSnapshotOps": "Total number of createSnapshot operations.", - "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", - "RenameSnapshotOps": "Total number of renameSnapshot operations.", - "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", - "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", - "TransactionsNumOps": "Total number of Journal transactions.", - "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", - "SyncsNumOps": "Total number of Journal syncs.", - "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", - "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", - "BlockReportNumOps": "Total number of processing block reports from DataNode.", - "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", - "CacheReportNumOps": "Total number of processing cache reports from DataNode.", - "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", - "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. (sometimes not equal to the time in SafeMode, see HDFS-5156).", - "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", - "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", - "GetEditAvgTime": "Average edits download time in milliseconds.", - "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", - "GetImageAvgTime": "Average fsimage download time in milliseconds.", - "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", - "PutImageAvgTime": "Average fsimage upload time in milliseconds.", - "TotalFileOps": "Total number of all file operations." 
-} +{ + "CreateFileOps": "Total number of files created.", + "FilesCreated": "Total number of files and directories created by create or mkdir operations.", + "FilesAppended": "Total number of files appended.", + "GetBlockLocations": "Total number of getBlockLocations operations.", + "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", + "GetListingOps": "Total number of directory listing operations.", + "DeleteFileOps": "Total number of delete operations.", + "FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", + "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", + "AddBlockOps": "Total number of addBlock operations succeeded.", + "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", + "CreateSymlinkOps": "Total number of createSymlink operations.", + "GetLinkTargetOps": "Total number of getLinkTarget operations.", + "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", + "AllowSnapshotOps": "Total number of allowSnapshot operations.", + "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", + "CreateSnapshotOps": "Total number of createSnapshot operations.", + "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", + "RenameSnapshotOps": "Total number of renameSnapshot operations.", + "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", + "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", + "TransactionsNumOps": "Total number of Journal transactions.", + "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", + "SyncsNumOps": "Total number of Journal syncs.", + "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", + "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", + "BlockReportNumOps": "Total number of processing block reports from DataNode.", + "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", + "CacheReportNumOps": "Total number of processing cache reports from DataNode.", + "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", + "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. (sometimes not equal to the time in SafeMode, see HDFS-5156).", + "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", + "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", + "GetEditAvgTime": "Average edits download time in milliseconds.", + "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", + "GetImageAvgTime": "Average fsimage download time in milliseconds.", + "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", + "PutImageAvgTime": "Average fsimage upload time in milliseconds.", + "TotalFileOps": "Total number of all file operations." +} diff --git a/metrics/namenode/RetryCache.json b/metrics/namenode/RetryCache.json index ff92fbd..caa37b3 100644 --- a/metrics/namenode/RetryCache.json +++ b/metrics/namenode/RetryCache.json @@ -1,5 +1,5 @@ -{ - "CacheHit": "Total number of RetryCache hit.", - "CacheCleared": "Total number of RetryCache cleared.", - "CacheUpdated": "Total number of RetryCache updated." 
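The three RetryCache counters above are enough to derive a cache hit rate, assuming CacheUpdated roughly counts the misses that were inserted into the cache. A small sketch under that assumption, with made-up sample values:

# Made-up sample of values scraped from a NameNode /jmx bean.
bean = {"CacheHit": 120, "CacheCleared": 30, "CacheUpdated": 280}

lookups = bean["CacheHit"] + bean["CacheUpdated"]  # hits + inserted misses
hit_rate = float(bean["CacheHit"]) / lookups if lookups else 0.0
print("RetryCache hit rate: %.2f" % hit_rate)  # 0.30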
-} +{ + "CacheHit": "Total number of RetryCache hit.", + "CacheCleared": "Total number of RetryCache cleared.", + "CacheUpdated": "Total number of RetryCache updated." +} diff --git a/metrics/namenode/StartupProgress.json b/metrics/namenode/StartupProgress.json index ad089f4..d25e8a4 100644 --- a/metrics/namenode/StartupProgress.json +++ b/metrics/namenode/StartupProgress.json @@ -1,20 +1,20 @@ -{ - "ElapsedTime": "Total elapsed time in milliseconds.", - "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", - "LoadingFsImageCount": "", - "LoadingFsImageElapsedTime": "", - "LoadingFsImageTotal": "", - "LoadingFsImagePercentComplete": "", - "LoadingEditsCount": "", - "LoadingEditsElapsedTime": "", - "LoadingEditsTotal": "", - "LoadingEditsPercentComplete": "", - "SavingCheckpointCount": "", - "SavingCheckpointElapsedTime": "", - "SavingCheckpointTotal": "", - "SavingCheckpointPercentComplete": "", - "SafeModeCount": "", - "SafeModeElapsedTime": "", - "SafeModeTotal": "", - "SafeModePercentComplete": "" +{ + "ElapsedTime": "Total elapsed time in milliseconds.", + "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", + "LoadingFsImageCount": "", + "LoadingFsImageElapsedTime": "", + "LoadingFsImageTotal": "", + "LoadingFsImagePercentComplete": "", + "LoadingEditsCount": "", + "LoadingEditsElapsedTime": "", + "LoadingEditsTotal": "", + "LoadingEditsPercentComplete": "", + "SavingCheckpointCount": "", + "SavingCheckpointElapsedTime": "", + "SavingCheckpointTotal": "", + "SavingCheckpointPercentComplete": "", + "SafeModeCount": "", + "SafeModeElapsedTime": "", + "SafeModeTotal": "", + "SafeModePercentComplete": "" } \ No newline at end of file diff --git a/metrics/nodemanager/NodeManagerMetrics.json b/metrics/nodemanager/NodeManagerMetrics.json index 9346ffd..daa5ba2 100644 --- a/metrics/nodemanager/NodeManagerMetrics.json +++ b/metrics/nodemanager/NodeManagerMetrics.json @@ -1,19 +1,19 @@ -{ - "ContainersLaunched": "Count of launched container", - "ContainersCompleted": "Count of completed container", - "ContainersFailed": "Count of failed container", - "ContainersKilled": "Count of killed container", - "ContainersIniting": "Count of initing container", - "ContainersRunning": "Count of running container", - "AllocatedGB": "Memory size of allocated (in GB)", - "AllocatedContainers": "Count of allocated container", - "AvailableGB": "Memory size of available (in GB)", - "AllocatedVCores": "Count of allocated VCores", - "AvailableVCores": "Count of available VCores", - "ContainerLaunchDurationNumOps": "Count of launched container", - "ContainerLaunchDurationAvgTime": "Average time of launching container (in ms)", - "BadLocalDirs": "Count of bad local directory", - "BadLogDirs": "Count of bad log directory", - "GoodLocalDirsDiskUtilizationPerc": "Percent of good local directory disk utilization", - "GoodLogDirsDiskUtilizationPerc": "Percent of good local log directory disk utilization" -} +{ + "ContainersLaunched": "Count of launched container", + "ContainersCompleted": "Count of completed container", + "ContainersFailed": "Count of failed container", + "ContainersKilled": "Count of killed container", + "ContainersIniting": "Count of initing container", + "ContainersRunning": "Count of running container", + "AllocatedGB": "Memory size of allocated (in GB)", + "AllocatedContainers": "Count of allocated container", + "AvailableGB": "Memory size of available (in GB)", + "AllocatedVCores": 
"Count of allocated VCores", + "AvailableVCores": "Count of available VCores", + "ContainerLaunchDurationNumOps": "Count of launched container", + "ContainerLaunchDurationAvgTime": "Average time of launching container (in ms)", + "BadLocalDirs": "Count of bad local directory", + "BadLogDirs": "Count of bad log directory", + "GoodLocalDirsDiskUtilizationPerc": "Percent of good local directory disk utilization", + "GoodLogDirsDiskUtilizationPerc": "Percent of good local log directory disk utilization" +} diff --git a/metrics/nodemanager/ShuffleMetrics.json b/metrics/nodemanager/ShuffleMetrics.json index f1dcd8e..7d8d041 100644 --- a/metrics/nodemanager/ShuffleMetrics.json +++ b/metrics/nodemanager/ShuffleMetrics.json @@ -1,6 +1,6 @@ -{ - "ShuffleOutputBytes": "Output byte of shuffle", - "ShuffleOutputsFailed": "Output failed of shuffle", - "ShuffleOutputsOK": "Output ok of shuffle", - "ShuffleConnections": "Connection count of shuffle" -} +{ + "ShuffleOutputBytes": "Output byte of shuffle", + "ShuffleOutputsFailed": "Output failed of shuffle", + "ShuffleOutputsOK": "Output ok of shuffle", + "ShuffleConnections": "Connection count of shuffle" +} diff --git a/metrics/resourcemanager/ClusterMetrics.json b/metrics/resourcemanager/ClusterMetrics.json index 4a3ae36..71af2cc 100644 --- a/metrics/resourcemanager/ClusterMetrics.json +++ b/metrics/resourcemanager/ClusterMetrics.json @@ -1,11 +1,11 @@ -{ - "NumActiveNMs": "Current number of active NodeManagers", - "NumDecommissionedNMs": "Current number of decommissioned NodeManagers", - "NumLostNMs": "Current number of lost NodeManagers for not sending heartbeats", - "NumUnhealthyNMs": "Current number of unhealthy NodeManagers", - "NumRebootedNMs": "Current number of rebooted NodeManagers", - "AMLaunchDelayNumOps": "Total number of AMs launched", - "AMLaunchDelayAvgTime": "Average time in milliseconds RM spends to launch AM containers after the AM container is allocated", - "AMRegisterDelayNumOps": "Total number of AMs registered", - "AMRegisterDelayAvgTime": "Average time in milliseconds AM spends to register with RM after the AM container gets launched" +{ + "NumActiveNMs": "Current number of active NodeManagers", + "NumDecommissionedNMs": "Current number of decommissioned NodeManagers", + "NumLostNMs": "Current number of lost NodeManagers for not sending heartbeats", + "NumUnhealthyNMs": "Current number of unhealthy NodeManagers", + "NumRebootedNMs": "Current number of rebooted NodeManagers", + "AMLaunchDelayNumOps": "Total number of AMs launched", + "AMLaunchDelayAvgTime": "Average time in milliseconds RM spends to launch AM containers after the AM container is allocated", + "AMRegisterDelayNumOps": "Total number of AMs registered", + "AMRegisterDelayAvgTime": "Average time in milliseconds AM spends to register with RM after the AM container gets launched" } \ No newline at end of file diff --git a/metrics/resourcemanager/QueueMetrics.json b/metrics/resourcemanager/QueueMetrics.json index 41c8789..497f42d 100644 --- a/metrics/resourcemanager/QueueMetrics.json +++ b/metrics/resourcemanager/QueueMetrics.json @@ -1,33 +1,33 @@ -{ - "running_0": "Current number of running applications whose elapsed time are less than 60 minutes.", - "running_60": "Current number of running applications whose elapsed time are between 60 and 300 minutes.", - "running_300": "Current number of running applications whose elapsed time are between 300 and 1440 minutes.", - "running_1440": "Current number of running applications elapsed time are more than 1440 minutes.", - 
"AppsSubmitted": "Total number of submitted applications.", - "AppsRunning": "Current number of running applications.", - "AppsPending": "Current number of applications that have not yet been assigned by any containers.", - "AppsCompleted": "Total number of completed applications.", - "AppsKilled": "Total number of killed applications.", - "AppsFailed": "Total number of failed applications.", - "AllocatedMB": "Current allocated memory in MB.", - "AllocatedVCores": "Current allocated CPU in virtual cores.", - "AllocatedContainers": "Current number of allocated containers.", - "AggregateContainersAllocated": "Total number of allocated containers.", - "AggregateContainersReleased": "Total number of released containers.", - "AvailableMB": "Current available memory in MB.", - "AvailableVCores": "Current available CPU in virtual cores.", - "PendingMB": "Current pending memory resource requests in MB that are not yet fulfilled by the scheduler.", - "PendingVCores": "Current pending CPU allocation requests in virtual cores that are not yet fulfilled by the scheduler.", - "PendingContainers": "Current pending resource requests that are not yet fulfilled by the scheduler.", - "ReservedMB": "Current reserved memory in MB.", - "ReservedVCores": "Current reserved CPU in virtual cores.", - "ReservedContainers": "Current number of reserved containers.", - "ActiveUsers": "Current number of active users.", - "ActiveApplications": "Current number of active applications.", - "FairShareMB": "(FairScheduler only) Current fair share of memory in MB.", - "FairShareVCores": "(FairScheduler only) Current fair share of CPU in virtual cores.", - "MinShareMB": "(FairScheduler only) Minimum share of memory in MB.", - "MinShareVCores": "(FairScheduler only) Minimum share of CPU in virtual cores.", - "MaxShareMB": "(FairScheduler only) Maximum share of memory in MB.", - "MaxShareVCores": "(FairScheduler only) Maximum share of CPU in virtual cores." 
+{ + "running_0": "Current number of running applications whose elapsed time are less than 60 minutes.", + "running_60": "Current number of running applications whose elapsed time are between 60 and 300 minutes.", + "running_300": "Current number of running applications whose elapsed time are between 300 and 1440 minutes.", + "running_1440": "Current number of running applications elapsed time are more than 1440 minutes.", + "AppsSubmitted": "Total number of submitted applications.", + "AppsRunning": "Current number of running applications.", + "AppsPending": "Current number of applications that have not yet been assigned by any containers.", + "AppsCompleted": "Total number of completed applications.", + "AppsKilled": "Total number of killed applications.", + "AppsFailed": "Total number of failed applications.", + "AllocatedMB": "Current allocated memory in MB.", + "AllocatedVCores": "Current allocated CPU in virtual cores.", + "AllocatedContainers": "Current number of allocated containers.", + "AggregateContainersAllocated": "Total number of allocated containers.", + "AggregateContainersReleased": "Total number of released containers.", + "AvailableMB": "Current available memory in MB.", + "AvailableVCores": "Current available CPU in virtual cores.", + "PendingMB": "Current pending memory resource requests in MB that are not yet fulfilled by the scheduler.", + "PendingVCores": "Current pending CPU allocation requests in virtual cores that are not yet fulfilled by the scheduler.", + "PendingContainers": "Current pending resource requests that are not yet fulfilled by the scheduler.", + "ReservedMB": "Current reserved memory in MB.", + "ReservedVCores": "Current reserved CPU in virtual cores.", + "ReservedContainers": "Current number of reserved containers.", + "ActiveUsers": "Current number of active users.", + "ActiveApplications": "Current number of active applications.", + "FairShareMB": "(FairScheduler only) Current fair share of memory in MB.", + "FairShareVCores": "(FairScheduler only) Current fair share of CPU in virtual cores.", + "MinShareMB": "(FairScheduler only) Minimum share of memory in MB.", + "MinShareVCores": "(FairScheduler only) Minimum share of CPU in virtual cores.", + "MaxShareMB": "(FairScheduler only) Maximum share of memory in MB.", + "MaxShareVCores": "(FairScheduler only) Maximum share of CPU in virtual cores." 
} \ No newline at end of file diff --git a/metrics/resourcemanager/RMNMInfo.json b/metrics/resourcemanager/RMNMInfo.json index e62122f..8d70b64 100644 --- a/metrics/resourcemanager/RMNMInfo.json +++ b/metrics/resourcemanager/RMNMInfo.json @@ -1,6 +1,6 @@ -{ - "NumContainers": "Total number of containers currently running on the host", - "State": "State of the host - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONED, LOST, REBOOTED", - "UsedMemoryMB": "The total amount of memory currently used on the host (in MB)", - "AvailableMemoryMB": "The total amount of memory currently available on the host (in MB)" +{ + "NumContainers": "Total number of containers currently running on the host", + "State": "State of the host - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONED, LOST, REBOOTED", + "UsedMemoryMB": "The total amount of memory currently used on the host (in MB)", + "AvailableMemoryMB": "The total amount of memory currently available on the host (in MB)" } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6b77016..3b2d237 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -requests -prometheus_client -pyyaml +requests +prometheus_client +pyyaml diff --git a/scraper.py b/scraper.py index fd4f49e..78ef44f 100644 --- a/scraper.py +++ b/scraper.py @@ -1,52 +1,52 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -import threading -import requests - -from utils import get_module_logger - - -logger = get_module_logger(__name__) - - -class Scraper(threading.Thread): - def __init__(self, url, result): - super(Scraper, self).__init__() - self.name = "thread-%s" % url - self.url = url - self.result = result - - def run(self): - result = [] - try: - s = requests.session() - response = s.get(self.url, timeout=5) - except Exception as e: - logger.warning("Get {0} failed, error: {1}.".format(self.url, str(e))) - else: - if response.status_code != requests.codes.ok: - logger.warning("Get {0} failed, response code is: {1}.".format(self.url, response.status_code)) - else: - rlt = response.json() - if rlt and "beans" in rlt: - result = rlt['beans'] - else: - logger.warning("No metrics get in the {0}.".format(self.url)) - s.close() - if len(result) > 0: - self.result.append(result) - - -class ScrapeMetrics(object): - def __init__(self, urls): - self.urls = urls - - def scrape(self): - result = [] - tasks = [Scraper(url, result) for url in self.urls] - for task in tasks: - task.start() - for task in tasks: - task.join() - return result +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import threading +import requests + +from utils import get_module_logger + + +logger = get_module_logger(__name__) + + +class Scraper(threading.Thread): + def __init__(self, url, result): + super(Scraper, self).__init__() + self.name = "thread-%s" % url + self.url = url + self.result = result + + def run(self): + result = [] + try: + s = requests.session() + response = s.get(self.url, timeout=5) + except Exception as e: + logger.warning("Get {0} failed, error: {1}.".format(self.url, str(e))) + else: + if response.status_code != requests.codes.ok: + logger.warning("Get {0} failed, response code is: {1}.".format(self.url, response.status_code)) + else: + rlt = response.json() + if rlt and "beans" in rlt: + result = rlt['beans'] + else: + logger.warning("No metrics get in the {0}.".format(self.url)) + s.close() + if len(result) > 0: + self.result.append(result) + + +class ScrapeMetrics(object): + def __init__(self, urls): + self.urls = urls + + def 
scrape(self): + result = [] + tasks = [Scraper(url, result) for url in self.urls] + for task in tasks: + task.start() + for task in tasks: + task.join() + return result diff --git a/utils.py b/utils.py index 7f3989e..b2e55e6 100644 --- a/utils.py +++ b/utils.py @@ -1,70 +1,70 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -import os -import argparse -import logging -import yaml - - -def get_module_logger(mod_name): - logger = logging.getLogger(mod_name) - logger.setLevel(logging.DEBUG) - - path = os.path.dirname(os.path.abspath(__file__)) - par_path = os.path.dirname(path) - fh = logging.FileHandler(os.path.join(par_path, "hadoop_jmx_exporter.log")) - fh.setLevel(logging.INFO) - - sh = logging.StreamHandler() - sh.setLevel(logging.INFO) - - fmt = logging.Formatter(fmt='%(asctime)s %(filename)s[line:%(lineno)d]-[%(levelname)s]: %(message)s') - fh.setFormatter(fmt) - sh.setFormatter(fmt) - - logger.addHandler(fh) - logger.addHandler(sh) - return logger - - -logger = get_module_logger(__name__) - -def read_json_file(path_name, file_name): - path = os.path.dirname(os.path.realpath(__file__)) - metric_path = os.path.join(path, "metrics", path_name) - metric_name = "{0}.json".format(file_name) - try: - with open(os.path.join(metric_path, metric_name), 'r') as f: - metrics = yaml.safe_load(f) - return metrics - except Exception as e: - logger.info("read metrics json file failed, error msg is: %s" % e) - return {} - - -def get_file_list(file_path_name): - path = os.path.dirname(os.path.abspath(__file__)) - json_path = os.path.join(path, "metrics", file_path_name) - try: - files = os.listdir(json_path) - except OSError: - logger.info("No such file or directory: '%s'" % json_path) - return [] - else: - rlt = [] - for i in range(len(files)): - rlt.append(files[i].split(".json")[0]) - return rlt - - -def parse_args(): - parser = argparse.ArgumentParser(description='hadoop jmx metric prometheus exporter') - parser.add_argument('-cluster', required=True, metavar='cluster_name', help='Hadoop cluster name (maybe HA name)') - parser.add_argument('-queue', required=False, metavar='yarn_queue_regexp', help='Regular expression of queue name. default: root.*', default='root.*') - parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*") - parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*") - parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*") - parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0') - parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. 
default: 6688', default=6688) - return parser.parse_args() +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import os +import argparse +import logging +import yaml + + +def get_module_logger(mod_name): + logger = logging.getLogger(mod_name) + logger.setLevel(logging.DEBUG) + + path = os.path.dirname(os.path.abspath(__file__)) + par_path = os.path.dirname(path) + fh = logging.FileHandler(os.path.join(par_path, "hadoop_jmx_exporter.log")) + fh.setLevel(logging.INFO) + + sh = logging.StreamHandler() + sh.setLevel(logging.INFO) + + fmt = logging.Formatter(fmt='%(asctime)s %(filename)s[line:%(lineno)d]-[%(levelname)s]: %(message)s') + fh.setFormatter(fmt) + sh.setFormatter(fmt) + + logger.addHandler(fh) + logger.addHandler(sh) + return logger + + +logger = get_module_logger(__name__) + +def read_json_file(path_name, file_name): + path = os.path.dirname(os.path.realpath(__file__)) + metric_path = os.path.join(path, "metrics", path_name) + metric_name = "{0}.json".format(file_name) + try: + with open(os.path.join(metric_path, metric_name), 'r') as f: + metrics = yaml.safe_load(f) + return metrics + except Exception as e: + logger.info("read metrics json file failed, error msg is: %s" % e) + return {} + + +def get_file_list(file_path_name): + path = os.path.dirname(os.path.abspath(__file__)) + json_path = os.path.join(path, "metrics", file_path_name) + try: + files = os.listdir(json_path) + except OSError: + logger.info("No such file or directory: '%s'" % json_path) + return [] + else: + rlt = [] + for i in range(len(files)): + rlt.append(files[i].split(".json")[0]) + return rlt + + +def parse_args(): + parser = argparse.ArgumentParser(description='hadoop jmx metric prometheus exporter') + parser.add_argument('-cluster', required=True, metavar='cluster_name', help='Hadoop cluster name (maybe HA name)') + parser.add_argument('-queue', required=False, metavar='yarn_queue_regexp', help='Regular expression of queue name. default: root.*', default='root.*') + parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*") + parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*") + parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*") + parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0') + parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. 
default: 6688', default=6688) + return parser.parse_args() diff --git a/yarn_nodemanager.py b/yarn_nodemanager.py index 87abf18..4b53b07 100644 --- a/yarn_nodemanager.py +++ b/yarn_nodemanager.py @@ -1,88 +1,88 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -import re -from prometheus_client.core import GaugeMetricFamily - -from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector -from scraper import ScrapeMetrics - -logger = get_module_logger(__name__) - - -class NodeManagerMetricCollector(MetricCollector): - - def __init__(self, cluster, rmc): - MetricCollector.__init__(self, cluster, "yarn", "nodemanager") - self.target = "-" - self.rmc = rmc - - self.hadoop_nodemanager_metrics = {} - for i in range(len(self.file_list)): - self.hadoop_nodemanager_metrics.setdefault(self.file_list[i], {}) - - self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "nodemanager") - - def collect(self): - isSetup = False - beans_list = ScrapeMetrics(self.rmc.nms).scrape() - for beans in beans_list: - if not isSetup: - self.common_metric_collector.setup_labels(beans) - self.setup_metrics_labels(beans) - isSetup = True - for i in range(len(beans)): - if 'tag.Hostname' in beans[i]: - self.target = beans[i]["tag.Hostname"] - break - self.hadoop_nodemanager_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) - self.get_metrics(beans) - - for i in range(len(self.merge_list)): - service = self.merge_list[i] - if service in self.hadoop_nodemanager_metrics: - for metric in self.hadoop_nodemanager_metrics[service]: - yield self.hadoop_nodemanager_metrics[service][metric] - - def setup_metrics_labels(self, beans): - for i in range(len(beans)): - for service in self.metrics: - if service in beans[i]['name']: - container_flag = 1 - for metric in self.metrics[service]: - label = ["cluster", "host"] - if metric.startswith("Containers"): - if container_flag: - container_flag = 0 - label.append("status") - key = "containers" - name = "_".join([self.prefix, "container_count"]) - description = "Count of container" - else: - continue - else: - snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - name = "_".join([self.prefix, snake_case]) - key = metric - description = self.metrics[service][metric] - label.append("target") - self.hadoop_nodemanager_metrics[service][key] = GaugeMetricFamily(name, description, labels=label) - - def get_metrics(self, beans): - for i in range(len(beans)): - for service in self.metrics: - if service not in beans[i]['name']: - continue - for metric in beans[i]: - if metric not in self.metrics[service]: - continue - label = [self.cluster, self.target] - if metric.startswith("Containers"): - key = "containers" - label.append(metric.split("Containers")[1]) - else: - key = metric - label.append(self.target) - value = beans[i][metric] if beans[i][metric] > 0 else 0 # incase vcore or memory < 0 - self.hadoop_nodemanager_metrics[service][key].add_metric(label, value) +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import re +from prometheus_client.core import GaugeMetricFamily + +from utils import get_module_logger +from common import MetricCollector, CommonMetricCollector +from scraper import ScrapeMetrics + +logger = get_module_logger(__name__) + + +class NodeManagerMetricCollector(MetricCollector): + + def __init__(self, cluster, rmc): + MetricCollector.__init__(self, cluster, "yarn", "nodemanager") + self.target = "-" + self.rmc = rmc + + self.hadoop_nodemanager_metrics = {} + for i in 
range(len(self.file_list)): + self.hadoop_nodemanager_metrics.setdefault(self.file_list[i], {}) + + self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "nodemanager") + + def collect(self): + isSetup = False + beans_list = ScrapeMetrics(self.rmc.nms).scrape() + for beans in beans_list: + if not isSetup: + self.common_metric_collector.setup_labels(beans) + self.setup_metrics_labels(beans) + isSetup = True + for i in range(len(beans)): + if 'tag.Hostname' in beans[i]: + self.target = beans[i]["tag.Hostname"] + break + self.hadoop_nodemanager_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) + self.get_metrics(beans) + + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.hadoop_nodemanager_metrics: + for metric in self.hadoop_nodemanager_metrics[service]: + yield self.hadoop_nodemanager_metrics[service][metric] + + def setup_metrics_labels(self, beans): + for i in range(len(beans)): + for service in self.metrics: + if service in beans[i]['name']: + container_flag = 1 + for metric in self.metrics[service]: + label = ["cluster", "host"] + if metric.startswith("Containers"): + if container_flag: + container_flag = 0 + label.append("status") + key = "containers" + name = "_".join([self.prefix, "container_count"]) + description = "Count of container" + else: + continue + else: + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, snake_case]) + key = metric + description = self.metrics[service][metric] + label.append("target") + self.hadoop_nodemanager_metrics[service][key] = GaugeMetricFamily(name, description, labels=label) + + def get_metrics(self, beans): + for i in range(len(beans)): + for service in self.metrics: + if service not in beans[i]['name']: + continue + for metric in beans[i]: + if metric not in self.metrics[service]: + continue + label = [self.cluster, self.target] + if metric.startswith("Containers"): + key = "containers" + label.append(metric.split("Containers")[1]) + else: + key = metric + label.append(self.target) + value = beans[i][metric] if beans[i][metric] > 0 else 0 # incase vcore or memory < 0 + self.hadoop_nodemanager_metrics[service][key].add_metric(label, value) diff --git a/yarn_resourcemanager.py b/yarn_resourcemanager.py index 6b87b3d..4fc7f1c 100644 --- a/yarn_resourcemanager.py +++ b/yarn_resourcemanager.py @@ -1,253 +1,253 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -import yaml -import re -from prometheus_client.core import GaugeMetricFamily - -from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector -from scraper import ScrapeMetrics - -logger = get_module_logger(__name__) - - -class ResourceManagerMetricCollector(MetricCollector): - - NODE_STATE = { - 'NEW': 1, - 'RUNNING': 2, - 'UNHEALTHY': 3, - 'DECOMMISSIONED': 4, - 'LOST': 5, - 'REBOOTED': 6, - } - - def __init__(self, cluster, urls, queue_regexp): - MetricCollector.__init__(self, cluster, "yarn", "resourcemanager") - self.target = "-" - self.queue_regexp = queue_regexp - self.nms = set() - - self.hadoop_resourcemanager_metrics = {} - for i in range(len(self.file_list)): - self.hadoop_resourcemanager_metrics.setdefault(self.file_list[i], {}) - - self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "resourcemanager") - - self.scrape_metrics = ScrapeMetrics(urls) - - def collect(self): - isSetup = False - beans_list = self.scrape_metrics.scrape() - for beans in beans_list: - if not isSetup: - 
self.common_metric_collector.setup_labels(beans) - self.setup_metrics_labels(beans) - isSetup = True - for i in range(len(beans)): - if 'tag.Hostname' in beans[i]: - self.target = beans[i]["tag.Hostname"] - break - self.hadoop_resourcemanager_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) - self.get_metrics(beans) - - for i in range(len(self.merge_list)): - service = self.merge_list[i] - if service in self.hadoop_resourcemanager_metrics: - for metric in self.hadoop_resourcemanager_metrics[service]: - yield self.hadoop_resourcemanager_metrics[service][metric] - - def setup_rmnminfo_labels(self): - for metric in self.metrics['RMNMInfo']: - label = ["cluster", "host", "version", "rack", "_target"] - if 'NumContainers' in metric: - name = "_".join([self.prefix, 'node_containers_total']) - elif 'State' in metric: - name = "_".join([self.prefix, 'node_state']) - elif 'UsedMemoryMB' in metric: - name = "_".join([self.prefix, 'node_memory_used_mb']) - elif 'AvailableMemoryMB' in metric: - name = "_".join([self.prefix, 'node_memory_available_mb']) - else: - continue - self.hadoop_resourcemanager_metrics['RMNMInfo'][metric] = GaugeMetricFamily(name, self.metrics['RMNMInfo'][metric], labels=label) - - def setup_queue_labels(self): - running_flag, mb_flag, vcore_flag, container_flag, apps_flag = 1, 1, 1, 1, 1 - for metric in self.metrics['QueueMetrics']: - label = ["cluster", "modeler_type", "queue", "user"] - if "running_" in metric: - if running_flag: - running_flag = 0 - label.append("elapsed_time") - key = "running_app" - name = "_".join([self.prefix, "running_app_total"]) - description = "Current number of running applications in each elapsed time ( < 60min, 60min < x < 300min, 300min < x < 1440min and x > 1440min )" - else: - continue - elif metric.endswith("VCores"): - if vcore_flag: - vcore_flag = 0 - label.append("status") - key = "vcore" - name = "_".join([self.prefix, "vcore_count"]) - description = "Count of vcore" - else: - continue - elif metric.endswith("Containers"): - if container_flag: - container_flag = 0 - label.append("status") - key = "containers" - name = "_".join([self.prefix, "container_count"]) - description = "Count of container" - else: - continue - elif metric.endswith("MB"): - if mb_flag: - mb_flag = 0 - label.append("status") - key = "memory" - name = "_".join([self.prefix, "memory_in_mb"]) - description = "Memory in MB" - else: - continue - elif metric.startswith("Apps"): - if apps_flag: - apps_flag = 0 - label.append("status") - key = "apps" - name = "_".join([self.prefix, "application_count"]) - description = "Count of application" - else: - continue - else: - key = metric - snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() - name = "_".join([self.prefix, snake_case]) - description = self.metrics['QueueMetrics'][metric] - label.append("_target") - self.hadoop_resourcemanager_metrics['QueueMetrics'][key] = GaugeMetricFamily(name, description, labels=label) - - def setup_cluster_labels(self): - nm_flag, cm_num_flag, cm_avg_flag = 1, 1, 1 - for metric in self.metrics['ClusterMetrics']: - if "NMs" in metric: - if nm_flag: - nm_flag = 0 - label = ["cluster", "status"] - key = "NMs" - name = "nodemanager_total" - description = "Current number of NodeManagers in each status" - else: - continue - elif "NumOps" in metric: - if cm_num_flag: - cm_num_flag = 0 - label = ["cluster", "oper"] - key = "NumOps" - name = "ams_total" - description = "Total number of Applications Masters in each operation" - else: - continue - elif "AvgTime" 
in metric: - if cm_avg_flag: - cm_avg_flag = 0 - label = ["cluster", "oper"] - key = "AvgTime" - name = "average_time_milliseconds" - description = "Average time in milliseconds AM spends in each operation" - else: - continue - else: - key = metric - name = metric - description = self.metrics['ClusterMetrics'][metric] - label = ["cluster"] - label.append("_target") - self.hadoop_resourcemanager_metrics['ClusterMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, name]), description, labels=label) - - def setup_metrics_labels(self, beans): - for i in range(len(beans)): - if 'RMNMInfo' in beans[i]['name']: - self.setup_rmnminfo_labels() - if 'QueueMetrics' in self.metrics: - self.setup_queue_labels() - if 'ClusterMetrics' in self.metrics: - self.setup_cluster_labels() - - def get_rmnminfo_metrics(self, bean): - for metric in self.metrics['RMNMInfo']: - nms = set() - live_nm_list = yaml.safe_load(bean['LiveNodeManagers']) - for j in range(len(live_nm_list)): - nms.add("http://"+live_nm_list[j]["NodeHTTPAddress"]+"/jmx") - host = live_nm_list[j]['HostName'] - version = live_nm_list[j]['NodeManagerVersion'] - rack = live_nm_list[j]['Rack'] - label = [self.cluster, host, version, rack, self.target] - if 'State' == metric: - value = self.NODE_STATE[live_nm_list[j]['State']] - else: - value = live_nm_list[j][metric] if metric in live_nm_list[j] else 0.0 - self.hadoop_resourcemanager_metrics['RMNMInfo'][metric].add_metric(label, value) - self.nms = nms - - def get_queue_metrics(self, bean): - for metric in self.metrics['QueueMetrics']: - label = [self.cluster, bean.get("modelerType", "-"), bean.get("tag.Queue", "-"), bean.get("tag.User", "-")] - if "running_0" in metric: - key = "running_app" - label.append("0to60") - elif "running_60" in metric: - key = "running_app" - label.append("60to300") - elif "running_300" in metric: - key = "running_app" - label.append("300to1440") - elif "running_1440" in metric: - key = "running_app" - label.append("1440up") - elif metric.endswith("VCores"): - label.append(metric.split("VCores")[0]) - key = "vcore" - elif metric.endswith("Containers"): - label.append(metric.split("Containers")[0]) - key = "containers" - elif metric.endswith("MB"): - label.append(metric.split("MB")[0]) - key = "memory" - elif metric.startswith("Apps"): - label.append(metric.split("Apps")[1]) - key = "apps" - else: - key = metric - label.append(self.target) - self.hadoop_resourcemanager_metrics['QueueMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) - - def get_cluster_metrics(self, bean): - for metric in self.metrics['ClusterMetrics']: - label = [self.cluster] - if "NMs" in metric: - label.append(metric.split('NMs')[0].split('Num')[1]) - key = "NMs" - elif "NumOps" in metric: - key = "NumOps" - label.append(metric.split("DelayNumOps")[0].split('AM')[1]) - elif "AvgTime" in metric: - key = "AvgTime" - label.append(metric.split("DelayAvgTime")[0].split('AM')[1]) - else: - continue - label.append(self.target) - self.hadoop_resourcemanager_metrics['ClusterMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) - - def get_metrics(self, beans): - for i in range(len(beans)): - if 'RMNMInfo' in beans[i]['name']: - self.get_rmnminfo_metrics(beans[i]) - if 'QueueMetrics' in beans[i]['name'] and re.match(self.queue_regexp, beans[i]['tag.Queue']): - self.get_queue_metrics(beans[i]) - if 'ClusterMetrics' in beans[i]['name']: - self.get_cluster_metrics(beans[i]) +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +import yaml +import re +from 
prometheus_client.core import GaugeMetricFamily + +from utils import get_module_logger +from common import MetricCollector, CommonMetricCollector +from scraper import ScrapeMetrics + +logger = get_module_logger(__name__) + + +class ResourceManagerMetricCollector(MetricCollector): + + NODE_STATE = { + 'NEW': 1, + 'RUNNING': 2, + 'UNHEALTHY': 3, + 'DECOMMISSIONED': 4, + 'LOST': 5, + 'REBOOTED': 6, + } + + def __init__(self, cluster, urls, queue_regexp): + MetricCollector.__init__(self, cluster, "yarn", "resourcemanager") + self.target = "-" + self.queue_regexp = queue_regexp + self.nms = set() + + self.hadoop_resourcemanager_metrics = {} + for i in range(len(self.file_list)): + self.hadoop_resourcemanager_metrics.setdefault(self.file_list[i], {}) + + self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "resourcemanager") + + self.scrape_metrics = ScrapeMetrics(urls) + + def collect(self): + isSetup = False + beans_list = self.scrape_metrics.scrape() + for beans in beans_list: + if not isSetup: + self.common_metric_collector.setup_labels(beans) + self.setup_metrics_labels(beans) + isSetup = True + for i in range(len(beans)): + if 'tag.Hostname' in beans[i]: + self.target = beans[i]["tag.Hostname"] + break + self.hadoop_resourcemanager_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) + self.get_metrics(beans) + + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.hadoop_resourcemanager_metrics: + for metric in self.hadoop_resourcemanager_metrics[service]: + yield self.hadoop_resourcemanager_metrics[service][metric] + + def setup_rmnminfo_labels(self): + for metric in self.metrics['RMNMInfo']: + label = ["cluster", "host", "version", "rack", "_target"] + if 'NumContainers' in metric: + name = "_".join([self.prefix, 'node_containers_total']) + elif 'State' in metric: + name = "_".join([self.prefix, 'node_state']) + elif 'UsedMemoryMB' in metric: + name = "_".join([self.prefix, 'node_memory_used_mb']) + elif 'AvailableMemoryMB' in metric: + name = "_".join([self.prefix, 'node_memory_available_mb']) + else: + continue + self.hadoop_resourcemanager_metrics['RMNMInfo'][metric] = GaugeMetricFamily(name, self.metrics['RMNMInfo'][metric], labels=label) + + def setup_queue_labels(self): + running_flag, mb_flag, vcore_flag, container_flag, apps_flag = 1, 1, 1, 1, 1 + for metric in self.metrics['QueueMetrics']: + label = ["cluster", "modeler_type", "queue", "user"] + if "running_" in metric: + if running_flag: + running_flag = 0 + label.append("elapsed_time") + key = "running_app" + name = "_".join([self.prefix, "running_app_total"]) + description = "Current number of running applications in each elapsed time ( < 60min, 60min < x < 300min, 300min < x < 1440min and x > 1440min )" + else: + continue + elif metric.endswith("VCores"): + if vcore_flag: + vcore_flag = 0 + label.append("status") + key = "vcore" + name = "_".join([self.prefix, "vcore_count"]) + description = "Count of vcore" + else: + continue + elif metric.endswith("Containers"): + if container_flag: + container_flag = 0 + label.append("status") + key = "containers" + name = "_".join([self.prefix, "container_count"]) + description = "Count of container" + else: + continue + elif metric.endswith("MB"): + if mb_flag: + mb_flag = 0 + label.append("status") + key = "memory" + name = "_".join([self.prefix, "memory_in_mb"]) + description = "Memory in MB" + else: + continue + elif metric.startswith("Apps"): + if apps_flag: + apps_flag = 0 + label.append("status") + key 
= "apps" + name = "_".join([self.prefix, "application_count"]) + description = "Count of application" + else: + continue + else: + key = metric + snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() + name = "_".join([self.prefix, snake_case]) + description = self.metrics['QueueMetrics'][metric] + label.append("_target") + self.hadoop_resourcemanager_metrics['QueueMetrics'][key] = GaugeMetricFamily(name, description, labels=label) + + def setup_cluster_labels(self): + nm_flag, cm_num_flag, cm_avg_flag = 1, 1, 1 + for metric in self.metrics['ClusterMetrics']: + if "NMs" in metric: + if nm_flag: + nm_flag = 0 + label = ["cluster", "status"] + key = "NMs" + name = "nodemanager_total" + description = "Current number of NodeManagers in each status" + else: + continue + elif "NumOps" in metric: + if cm_num_flag: + cm_num_flag = 0 + label = ["cluster", "oper"] + key = "NumOps" + name = "ams_total" + description = "Total number of Applications Masters in each operation" + else: + continue + elif "AvgTime" in metric: + if cm_avg_flag: + cm_avg_flag = 0 + label = ["cluster", "oper"] + key = "AvgTime" + name = "average_time_milliseconds" + description = "Average time in milliseconds AM spends in each operation" + else: + continue + else: + key = metric + name = metric + description = self.metrics['ClusterMetrics'][metric] + label = ["cluster"] + label.append("_target") + self.hadoop_resourcemanager_metrics['ClusterMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, name]), description, labels=label) + + def setup_metrics_labels(self, beans): + for i in range(len(beans)): + if 'RMNMInfo' in beans[i]['name']: + self.setup_rmnminfo_labels() + if 'QueueMetrics' in self.metrics: + self.setup_queue_labels() + if 'ClusterMetrics' in self.metrics: + self.setup_cluster_labels() + + def get_rmnminfo_metrics(self, bean): + for metric in self.metrics['RMNMInfo']: + nms = set() + live_nm_list = yaml.safe_load(bean['LiveNodeManagers']) + for j in range(len(live_nm_list)): + nms.add("http://"+live_nm_list[j]["NodeHTTPAddress"]+"/jmx") + host = live_nm_list[j]['HostName'] + version = live_nm_list[j]['NodeManagerVersion'] + rack = live_nm_list[j]['Rack'] + label = [self.cluster, host, version, rack, self.target] + if 'State' == metric: + value = self.NODE_STATE[live_nm_list[j]['State']] + else: + value = live_nm_list[j][metric] if metric in live_nm_list[j] else 0.0 + self.hadoop_resourcemanager_metrics['RMNMInfo'][metric].add_metric(label, value) + self.nms = nms + + def get_queue_metrics(self, bean): + for metric in self.metrics['QueueMetrics']: + label = [self.cluster, bean.get("modelerType", "-"), bean.get("tag.Queue", "-"), bean.get("tag.User", "-")] + if "running_0" in metric: + key = "running_app" + label.append("0to60") + elif "running_60" in metric: + key = "running_app" + label.append("60to300") + elif "running_300" in metric: + key = "running_app" + label.append("300to1440") + elif "running_1440" in metric: + key = "running_app" + label.append("1440up") + elif metric.endswith("VCores"): + label.append(metric.split("VCores")[0]) + key = "vcore" + elif metric.endswith("Containers"): + label.append(metric.split("Containers")[0]) + key = "containers" + elif metric.endswith("MB"): + label.append(metric.split("MB")[0]) + key = "memory" + elif metric.startswith("Apps"): + label.append(metric.split("Apps")[1]) + key = "apps" + else: + key = metric + label.append(self.target) + self.hadoop_resourcemanager_metrics['QueueMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) + + 
def get_cluster_metrics(self, bean): + for metric in self.metrics['ClusterMetrics']: + label = [self.cluster] + if "NMs" in metric: + label.append(metric.split('NMs')[0].split('Num')[1]) + key = "NMs" + elif "NumOps" in metric: + key = "NumOps" + label.append(metric.split("DelayNumOps")[0].split('AM')[1]) + elif "AvgTime" in metric: + key = "AvgTime" + label.append(metric.split("DelayAvgTime")[0].split('AM')[1]) + else: + continue + label.append(self.target) + self.hadoop_resourcemanager_metrics['ClusterMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) + + def get_metrics(self, beans): + for i in range(len(beans)): + if 'RMNMInfo' in beans[i]['name']: + self.get_rmnminfo_metrics(beans[i]) + if 'QueueMetrics' in beans[i]['name'] and re.match(self.queue_regexp, beans[i]['tag.Queue']): + self.get_queue_metrics(beans[i]) + if 'ClusterMetrics' in beans[i]['name']: + self.get_cluster_metrics(beans[i]) From df0e340b39f6e098e6ef27207b36b3a6c45a19aa Mon Sep 17 00:00:00 2001 From: akenO8 Date: Thu, 1 Jun 2023 11:25:48 +0800 Subject: [PATCH 2/5] Standardize the directory layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/__init__.py | 0 common.py => cmd/common.py | 0 hdfs_datanode.py => cmd/hdfs_datanode.py | 2 +- .../hdfs_journalnode.py | 2 +- hdfs_namenode.py => cmd/hdfs_namenode.py | 7 +- cmd/hive_server.py | 195 ++++++++++++++++++ scraper.py => cmd/scraper.py | 10 +- utils.py => cmd/utils.py | 6 +- .../yarn_nodemanager.py | 2 +- .../yarn_resourcemanager.py | 10 +- hadoop_jmx_exporter.py | 18 +- metrics/namenode/NameNode.json | 133 ++++++++++++ requirements.txt | 1 + 13 files changed, 363 insertions(+), 23 deletions(-) create mode 100644 cmd/__init__.py rename common.py => cmd/common.py (100%) rename hdfs_datanode.py => cmd/hdfs_datanode.py (99%) rename hdfs_journalnode.py => cmd/hdfs_journalnode.py (99%) rename hdfs_namenode.py => cmd/hdfs_namenode.py (98%) create mode 100644 cmd/hive_server.py rename scraper.py => cmd/scraper.py (79%) rename utils.py => cmd/utils.py (92%) rename yarn_nodemanager.py => cmd/yarn_nodemanager.py (98%) rename yarn_resourcemanager.py => cmd/yarn_resourcemanager.py (96%) create mode 100644 metrics/namenode/NameNode.json diff --git a/cmd/__init__.py b/cmd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/common.py b/cmd/common.py similarity index 100% rename from common.py rename to cmd/common.py diff --git a/hdfs_datanode.py b/cmd/hdfs_datanode.py similarity index 99% rename from hdfs_datanode.py rename to cmd/hdfs_datanode.py index 8e55a75..6a1de85 100644 --- a/hdfs_datanode.py +++ b/cmd/hdfs_datanode.py @@ -6,7 +6,7 @@ from prometheus_client.core import GaugeMetricFamily from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector +from cmd.common import MetricCollector, CommonMetricCollector from scraper import ScrapeMetrics logger = get_module_logger(__name__) diff --git a/hdfs_journalnode.py b/cmd/hdfs_journalnode.py similarity index 99% rename from hdfs_journalnode.py rename to cmd/hdfs_journalnode.py index 75d7191..b214e0b 100644 --- a/hdfs_journalnode.py +++ b/cmd/hdfs_journalnode.py @@ -5,7 +5,7 @@ from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector +from cmd.common import MetricCollector, CommonMetricCollector from scraper import ScrapeMetrics logger = get_module_logger(__name__) diff --git 
a/hdfs_namenode.py b/cmd/hdfs_namenode.py similarity index 98% rename from hdfs_namenode.py rename to cmd/hdfs_namenode.py index 93a15a9..25768a9 100644 --- a/hdfs_namenode.py +++ b/cmd/hdfs_namenode.py @@ -395,7 +395,10 @@ def get_nninfo_metrics(self, bean): label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target] items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks", "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"] - dns.add("http://"+info["infoAddr"]+"/jmx") + # dns.add("http://"+info["infoAddr"]+"/jmx") + dn_port = info["infoAddr"].split(':')[1] + dn_host = node.split(':')[0] + dns.add("http://" + dn_host + ":" + dn_port + "/jmx") for item in items: value = info[item] if item in info else 0 if item == "adminState": @@ -475,4 +478,4 @@ def get_metrics(self, beans): if 'RetryCache' in beans[i]['name']: self.get_retrycache_metrics(beans[i]) if 'NameNodeInfo' in beans[i]['name']: - self.get_nninfo_metrics(beans[i]) + self.get_nninfo_metrics(beans[i]) \ No newline at end of file diff --git a/cmd/hive_server.py b/cmd/hive_server.py new file mode 100644 index 0000000..27f803c --- /dev/null +++ b/cmd/hive_server.py @@ -0,0 +1,195 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import yaml +import re +import time +from sys import exit +from prometheus_client import start_http_server +from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily, REGISTRY + +import utils +from utils import get_module_logger +from consul import Consul +from common import MetricCollector, CommonMetricCollector + +logger = get_module_logger(__name__) + + +class HiveServerMetricCollector(MetricCollector): + + def __init__(self, cluster, url): + MetricCollector.__init__(self, cluster, url, "hive", "hiveserver2") + self._hadoop_hiveserver2_metrics = {} + for i in range(len(self._file_list)): + self._hadoop_hiveserver2_metrics.setdefault(self._file_list[i], {}) + + def collect(self): + # Request data from ambari Collect Host API + # Request exactly the System level information we need from node + # beans returns a type of 'List' + try: + count = 0 + # In case no metrics we need in the jmx url, a time sleep and while-loop was set here to wait for the KEY metrics + while count < 5: + beans = utils.get_metrics(self._url) + if 'init_total_count_tables' not in beans: + count += 1 + time.sleep(1) + continue + else: + break + except: + logger.info("Can't scrape metrics from url: {0}".format(self._url)) + else: + pass + finally: + # set up all metrics with labels and descriptions. + self.setup_labels(beans) + + # add metric value to every metric. 
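# A self-contained sketch of the bounded retry used in collect() above: poll
# up to five times, one second apart, until the key metric appears. fetch and
# ready are stand-in callables, not names from this repo.
import time

def scrape_until(fetch, ready, attempts=5, delay=1.0):
    # Poll fetch() until ready(payload) holds or the attempt budget runs out.
    payload = fetch()
    for _ in range(attempts - 1):
        if ready(payload):
            break
        time.sleep(delay)
        payload = fetch()
    return payload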
+ self.get_metrics(beans) + + # update namenode metrics with common metrics + common_metrics = CommonMetricCollector(self._cluster, beans, "hive", "hiveserver2") + self._hadoop_hiveserver2_metrics.update(common_metrics()) + + for i in range(len(self._merge_list)): + service = self._merge_list[i] + for metric in self._hadoop_hiveserver2_metrics[service]: + yield self._hadoop_hiveserver2_metrics[service][metric] + + def setup_node_labels(self, bean, service): + label = ["cluster", "host", "client_id", "node_id"] + for metric in self._metrics[service]: + if metric in bean: + name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() + self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily( + "_".join([self._prefix, 'producer_node', name]), + self._metrics[service][metric], + labels=label) + + def setup_topic_labels(self, bean, service): + label = ["cluster", "host", "client_id", "topic"] + for metric in self._metrics[service]: + if metric in bean: + name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() + self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily( + "_".join([self._prefix, 'producer_topic', name]), + self._metrics[service][metric], + labels=label) + + def setup_producer_labels(self, bean, service): + label = ["cluster", "host", "client_id"] + for metric in self._metrics[service]: + if metric in bean: + name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() + self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily("_".join([self._prefix, name]), + self._metrics[service][metric], + labels=label) + + def setup_other_labels(self, bean, service): + label = ["cluster", "host"] + for metric in self._metrics[service]: + if metric in bean: + name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() + self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily("_".join([self._prefix, name]), + self._metrics[service][metric], + labels=label) + + def setup_labels(self, beans): + # The metrics we want to export. 
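# The setup_* helpers above all derive a Prometheus-safe family name the same
# way: every character outside [a-zA-Z0-9] becomes "_", the result is
# lower-cased, and the exporter prefix is joined on. A minimal sketch of that
# sanitization; the sample metric comes from the HiveServer2 JSON added later
# in this series, and the hadoop_hive_hiveserver2 prefix follows the
# 'hadoop_{component}_{service}' pattern used throughout the exporter.
import re

def prom_name(prefix, metric):
    # prom_name("hadoop_hive_hiveserver2", "buffers.direct.used")
    # -> "hadoop_hive_hiveserver2_buffers_direct_used"
    return "_".join([prefix, re.sub('[^a-z0-9A-Z]', '_', metric).lower()])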
+ for service in self._metrics: + for i in range(len(beans)): + if 'producer-node-metrics' == service and 'type=producer-node-metrics' in beans[i]['name']: + self._setup_node_labels(beans[i], service) + elif 'producer-topic-metrics' == service and 'type=producer-topic-metrics' in beans[i]['name']: + self._setup_topic_labels(beans[i], service) + elif 'producer-metrics' == service and 'type=producer-metrics' in beans[i][ + 'name'] or 'kafka-metrics-count' == service and 'type=kafka-metrics-count' in beans[i]['name']: + self._setup_producer_labels(beans[i], service) + elif service in beans[i]['name']: + self._setup_other_labels(beans[i], service) + else: + continue + + def get_node_metrics(self, bean, service, host): + client_id = bean['name'].split('client-id=')[1].split(',')[0] + node_id = bean['name'].split('node-id=')[1].split(',')[0] + for metric in bean: + if metric in self._metrics[service]: + self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host, client_id, node_id], + bean[metric]) + else: + continue + + def get_topic_metrics(self, bean, service, host): + client_id = bean['name'].split('client-id=')[1].split(',')[0] + topic = bean['name'].split('topic=')[1].split(',')[0] + for metric in bean: + if metric in self._metrics[service]: + self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host, client_id, topic], + bean[metric]) + else: + continue + + def get_producer_metrics(self, bean, service, host): + client_id = bean['name'].split('client-id=')[1].split(',')[0] + for metric in bean: + if metric in self._metrics[service]: + self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host, client_id], + bean[metric]) + else: + continue + + def get_other_metrics(self, bean, service, host): + for metric in bean: + if metric in self._metrics[service]: + self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host], bean[metric]) + else: + continue + + def get_metrics(self, beans): + # bean is a type of + # status is a type of + for i in range(len(beans)): + if 'tag.Hostname' in beans[i]: + host = beans[i]['tag.Hostname'] + break + else: + continue + for i in range(len(beans)): + for service in self._metrics: + if 'producer-node-metrics' == service and 'type=producer-node-metrics' in beans[i]['name']: + self._get_node_metrics(beans[i], service, host) + elif 'producer-topic-metrics' == service and 'type=producer-topic-metrics' in beans[i]['name']: + self._get_topic_metrics(beans[i], service, host) + elif 'producer-metrics' == service and 'type=producer-metrics' in beans[i][ + 'name'] or 'kafka-metrics-count' == service and 'type=kafka-metrics-count' in beans[i]['name']: + self._get_producer_metrics(beans[i], service, host) + elif service in beans[i]['name']: + self._get_other_metrics(beans[i], service, host) + else: + continue + + +def main(): + try: + args = utils.parse_args() + port = int(args.port) + cluster = args.cluster + v = args.hive_url + REGISTRY.register(HiveServerMetricCollector(cluster, v)) + + start_http_server(port) + # print("Polling %s. Serving at port: %s" % (args.address, port)) + print("Polling %s. 
Serving at port: %s" % (args.address, port)) + while True: + time.sleep(1) + except KeyboardInterrupt: + print(" Interrupted") + exit(0) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scraper.py b/cmd/scraper.py similarity index 79% rename from scraper.py rename to cmd/scraper.py index 78ef44f..1307c92 100644 --- a/scraper.py +++ b/cmd/scraper.py @@ -1,10 +1,11 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- - +import os import threading import requests +import requests_kerberos -from utils import get_module_logger +from cmd.utils import get_module_logger logger = get_module_logger(__name__) @@ -16,12 +17,15 @@ def __init__(self, url, result): self.name = "thread-%s" % url self.url = url self.result = result + krbpath = '/tmp/krb5cc_hdfs' + os.environ['KRB5CCNAME'] = krbpath + self.auth_kerberos = requests_kerberos.HTTPKerberosAuth(mutual_authentication=requests_kerberos.OPTIONAL) def run(self): result = [] try: s = requests.session() - response = s.get(self.url, timeout=5) + response = s.get(self.url, timeout=5, auth=self.auth_kerberos) except Exception as e: logger.warning("Get {0} failed, error: {1}.".format(self.url, str(e))) else: diff --git a/utils.py b/cmd/utils.py similarity index 92% rename from utils.py rename to cmd/utils.py index b2e55e6..ce936e2 100644 --- a/utils.py +++ b/cmd/utils.py @@ -32,7 +32,8 @@ def get_module_logger(mod_name): def read_json_file(path_name, file_name): path = os.path.dirname(os.path.realpath(__file__)) - metric_path = os.path.join(path, "metrics", path_name) + parent_path = os.path.dirname(path) + metric_path = os.path.join(parent_path, "metrics", path_name) metric_name = "{0}.json".format(file_name) try: with open(os.path.join(metric_path, metric_name), 'r') as f: @@ -45,7 +46,8 @@ def read_json_file(path_name, file_name): def get_file_list(file_path_name): path = os.path.dirname(os.path.abspath(__file__)) - json_path = os.path.join(path, "metrics", file_path_name) + parent_path = os.path.dirname(path) + json_path = os.path.join(parent_path, "metrics", file_path_name) try: files = os.listdir(json_path) except OSError: diff --git a/yarn_nodemanager.py b/cmd/yarn_nodemanager.py similarity index 98% rename from yarn_nodemanager.py rename to cmd/yarn_nodemanager.py index 4b53b07..8afa984 100644 --- a/yarn_nodemanager.py +++ b/cmd/yarn_nodemanager.py @@ -5,7 +5,7 @@ from prometheus_client.core import GaugeMetricFamily from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector +from cmd.common import MetricCollector, CommonMetricCollector from scraper import ScrapeMetrics logger = get_module_logger(__name__) diff --git a/yarn_resourcemanager.py b/cmd/yarn_resourcemanager.py similarity index 96% rename from yarn_resourcemanager.py rename to cmd/yarn_resourcemanager.py index 4fc7f1c..0d660c8 100644 --- a/yarn_resourcemanager.py +++ b/cmd/yarn_resourcemanager.py @@ -6,7 +6,7 @@ from prometheus_client.core import GaugeMetricFamily from utils import get_module_logger -from common import MetricCollector, CommonMetricCollector +from cmd.common import MetricCollector, CommonMetricCollector from scraper import ScrapeMetrics logger = get_module_logger(__name__) @@ -172,8 +172,8 @@ def setup_metrics_labels(self, beans): for i in range(len(beans)): if 'RMNMInfo' in beans[i]['name']: self.setup_rmnminfo_labels() - if 'QueueMetrics' in self.metrics: - self.setup_queue_labels() + # if 'QueueMetrics' in self.metrics: + # self.setup_queue_labels() if 'ClusterMetrics' in self.metrics: 
self.setup_cluster_labels() @@ -247,7 +247,7 @@ def get_metrics(self, beans): for i in range(len(beans)): if 'RMNMInfo' in beans[i]['name']: self.get_rmnminfo_metrics(beans[i]) - if 'QueueMetrics' in beans[i]['name'] and re.match(self.queue_regexp, beans[i]['tag.Queue']): - self.get_queue_metrics(beans[i]) + # if 'QueueMetrics' in beans[i]['name'] and 'tag.Queue' in beans[i] and re.match(self.queue_regexp, beans[i]['tag.Queue']): + # self.get_queue_metrics(beans[i]) if 'ClusterMetrics' in beans[i]['name']: self.get_cluster_metrics(beans[i]) diff --git a/hadoop_jmx_exporter.py b/hadoop_jmx_exporter.py index ef590dd..e97f1fa 100755 --- a/hadoop_jmx_exporter.py +++ b/hadoop_jmx_exporter.py @@ -5,13 +5,13 @@ from prometheus_client import start_http_server from prometheus_client.core import REGISTRY -import utils -from utils import get_module_logger -from hdfs_namenode import NameNodeMetricCollector -from hdfs_datanode import DataNodeMetricCollector -from hdfs_journalnode import JournalNodeMetricCollector -from yarn_resourcemanager import ResourceManagerMetricCollector -from yarn_nodemanager import NodeManagerMetricCollector +from cmd import utils +from cmd.utils import get_module_logger +from cmd.hdfs_namenode import NameNodeMetricCollector +from cmd.hdfs_datanode import DataNodeMetricCollector +from cmd.hdfs_journalnode import JournalNodeMetricCollector +from cmd.yarn_resourcemanager import ResourceManagerMetricCollector +from cmd.yarn_nodemanager import NodeManagerMetricCollector logger = get_module_logger(__name__) @@ -26,9 +26,11 @@ def register_prometheus(cluster, args): rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue) rmc.collect() REGISTRY.register(rmc) - REGISTRY.register(NodeManagerMetricCollector(cluster, rmc)) + # REGISTRY.register(NodeManagerMetricCollector(cluster, rmc)) if args.jns is not None and len(args.jns) > 0: REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns)) + + def main(): args = utils.parse_args() host = args.host diff --git a/metrics/namenode/NameNode.json b/metrics/namenode/NameNode.json new file mode 100644 index 0000000..c055ce8 --- /dev/null +++ b/metrics/namenode/NameNode.json @@ -0,0 +1,133 @@ +{ + "CreateFileOps": "Total number of files created.", + "FilesCreated": "Total number of files and directories created by create or mkdir operations.", + "FilesAppended": "Total number of files appended.", + "GetBlockLocations": "Total number of getBlockLocations operations.", + "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", + "GetListingOps": "Total number of directory listing operations.", + "DeleteFileOps": "Total number of delete operations.", + "FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", + "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", + "AddBlockOps": "Total number of addBlock operations succeeded.", + "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", + "CreateSymlinkOps": "Total number of createSymlink operations.", + "GetLinkTargetOps": "Total number of getLinkTarget operations.", + "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", + "AllowSnapshotOps": "Total number of allowSnapshot operations.", + "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", + "CreateSnapshotOps": "Total number of createSnapshot operations.", + "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", + 
"RenameSnapshotOps": "Total number of renameSnapshot operations.", + "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", + "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", + "TransactionsNumOps": "Total number of Journal transactions.", + "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", + "SyncsNumOps": "Total number of Journal syncs.", + "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", + "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", + "BlockReportNumOps": "Total number of processing block reports from DataNode.", + "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", + "CacheReportNumOps": "Total number of processing cache reports from DataNode.", + "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", + "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. (sometimes not equal to the time in SafeMode, see HDFS-5156).", + "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", + "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", + "GetEditAvgTime": "Average edits download time in milliseconds.", + "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", + "GetImageAvgTime": "Average fsimage download time in milliseconds.", + "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", + "PutImageAvgTime": "Average fsimage upload time in milliseconds.", + "TotalFileOps": "Total number of all file operations.", + "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", + "MissingBlocks": "Current number of missing blocks", + "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", + "ExpiredHeartbeats": "Total number of expired heartbeats", + "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", + "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", + "LastWrittenTransactionId": "Last transaction ID written to the edit log", + "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", + "CapacityTotal": "Current raw capacity of DataNodes in bytes", + "CapacityUsed": "Current used capacity across all DataNodes in bytes", + "CapacityRemaining": "Current remaining capacity in bytes", + "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", + "TotalLoad": "Current number of connections", + "SnapshottableDirectories": "Current number of snapshottable directories", + "Snapshots": "Current number of snapshots", + "NumEncryptionZones": "Current number of encryption zones", + "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", + "BlocksTotal": "Current number of allocated blocks in the system", + "NumFilesUnderConstruction": "Current number of files under construction", + "NumActiveClients": "Current number of active clients holding lease", + "FilesTotal": "Current number of files and directories", + "PendingReplicationBlocks": "Current number of blocks pending to be replicated", + "UnderReplicatedBlocks": "Current number of blocks under replicated", + "CorruptBlocks": "Current number of blocks with corrupt replicas.", + "ScheduledReplicationBlocks": "Current number of blocks scheduled for 
replications", + "PendingDeletionBlocks": "Current number of blocks pending deletion", + "ExcessBlocks": "Current number of excess blocks", + "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", + "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", + "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", + "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0", + "BlockCapacity": "Current number of block capacity", + "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", + "TotalSyncCount": "Total number of sync operations performed by edit log", + "FsLockQueueLength": "Filesystem lock queue length", + "MaxObjects": "Max objects", + "BlockDeletionStartTime": "Start time of block deletion", + "NumLiveDataNodes": "Number of datanodes which are currently live", + "NumDeadDataNodes": "Number of datanodes which are currently dead", + "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", + "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", + "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", + "NumStaleDataNodes": "Number of datanodes marked as content stale", + "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", + "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", + "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", + "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", + "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation", + "Total": "Total", + "TotalBlocks": "Total number of blocks", + "Used": "Total used space by data nodes", + "Free": "Total free space by data nodes", + "Safemode": "Is in safe mode. 
0: no, 1: yes", + "NonDfsUsedSpace": "Total used space by data nodes for non DFS purposes such as storing temporary files on the local file system", + "PercentUsed": "Total used space by data nodes as percentage of total capacity", + "BlockPoolUsedSpace": "Block pool used space", + "PercentBlockPoolUsed": "Percent of block pool used", + "PercentRemaining": "Total remaining space by data nodes as percentage of total capacity", + "CacheCapacity": "Cache Capacity", + "CacheUsed": "Cache Used", + "TotalFiles": "Total Files", + "NumberOfMissingBlocks": "Number of missing blocks", + "NumberOfMissingBlocksWithReplicationFactorOne": "Number of missing blocks with replication factor one", + "LiveNodes": "Live nodes", + "SoftwareVersion": "Software version", + "DeadNodes": "Dead nodes", + "DecomNodes": "Decom nodes", + "EnteringMaintenanceNodes": "Entering maintenance nodes", + "NodeUsage": "Node Usage", + "NNStartedTimeInMillis": "NameNode started time in millis", + "CorruptFiles": "Corrupt file list", + "CacheHit": "Total number of RetryCache hit.", + "CacheCleared": "Total number of RetryCache cleared.", + "CacheUpdated": "Total number of RetryCache updated.", + "ElapsedTime": "Total elapsed time in milliseconds.", + "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", + "LoadingFsImageCount": "", + "LoadingFsImageElapsedTime": "", + "LoadingFsImageTotal": "", + "LoadingFsImagePercentComplete": "", + "LoadingEditsCount": "", + "LoadingEditsElapsedTime": "", + "LoadingEditsTotal": "", + "LoadingEditsPercentComplete": "", + "SavingCheckpointCount": "", + "SavingCheckpointElapsedTime": "", + "SavingCheckpointTotal": "", + "SavingCheckpointPercentComplete": "", + "SafeModeCount": "", + "SafeModeElapsedTime": "", + "SafeModeTotal": "", + "SafeModePercentComplete": "" +} diff --git a/requirements.txt b/requirements.txt index 3b2d237..5fb1acc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ requests +requests_kerberos prometheus_client pyyaml From 24e9d51378efdeb865d96f525c47c2ea4be0e76a Mon Sep 17 00:00:00 2001 From: akenO8 Date: Thu, 8 Jun 2023 11:24:00 +0800 Subject: [PATCH 3/5] [NEW]add_hive_and_trino --- README.md | 2 + cmd/a.py | 4 + cmd/common.py | 44 +- cmd/hive_server.py | 226 ++---- cmd/scraper.py | 15 +- cmd/trino_coordinator.py | 106 +++ cmd/utils.py | 2 + hadoop_jmx_exporter.py | 9 +- metrics/coordinator/ClusterMemoryManager.json | 4 + metrics/coordinator/ClusterMemoryPool.json | 3 + .../coordinator/HeartbeatFailureDetector.json | 5 + metrics/coordinator/Memory.json | 4 + metrics/coordinator/QueryManager.json | 14 + metrics/coordinator/SqlTaskManager.json | 10 + metrics/coordinator/Threading.json | 5 + metrics/hiveserver2/HiveServer2.json | 705 ++++++++++++++++++ 16 files changed, 964 insertions(+), 194 deletions(-) create mode 100644 cmd/a.py create mode 100644 cmd/trino_coordinator.py create mode 100644 metrics/coordinator/ClusterMemoryManager.json create mode 100644 metrics/coordinator/ClusterMemoryPool.json create mode 100644 metrics/coordinator/HeartbeatFailureDetector.json create mode 100644 metrics/coordinator/Memory.json create mode 100644 metrics/coordinator/QueryManager.json create mode 100644 metrics/coordinator/SqlTaskManager.json create mode 100644 metrics/coordinator/Threading.json create mode 100644 metrics/hiveserver2/HiveServer2.json diff --git a/README.md b/README.md index 8d54a93..6940846 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ optional arguments: Hadoop resourcemanager metrics 
jmx URL. -jns [journalnode_jmx_url [journalnode_jmx_url ...]] Hadoop journalnode jmx metrics URL. + -hss [hiveserver2_jmx_url [hiveserver2_jmx_url ...]] + Hadoop hiveserver2 jmx metrics URL. -host host Listen on this address. default: 0.0.0.0 -port port Listen to this port. default: 6688 ➜ hadoop_exporter git:(master) ✗ diff --git a/cmd/a.py b/cmd/a.py new file mode 100644 index 0000000..b6d21ec --- /dev/null +++ b/cmd/a.py @@ -0,0 +1,4 @@ +import requests + +rsp = requests.get("http://pcdn.dq.baidu.com/v3/accesslog", stream=True) +print (rsp.raw._connection.sock.getpeername()[0]) \ No newline at end of file diff --git a/cmd/common.py b/cmd/common.py index fdef10b..4bf51c6 100644 --- a/cmd/common.py +++ b/cmd/common.py @@ -6,7 +6,6 @@ import utils - logger = utils.get_module_logger(__name__) @@ -22,7 +21,10 @@ def __init__(self, cluster, component, service): self.metrics.setdefault(self.file_list[i], utils.read_json_file(service, self.file_list[i])) common_file = utils.get_file_list("common") - self.merge_list = self.file_list + common_file + if component == "hdfs" or component == "yarn": + self.merge_list = self.file_list + common_file + else: + self.merge_list = self.file_list def collect(self): pass @@ -130,14 +132,17 @@ def setup_jvm_labels(self): key = snake_case descriptions = self.tmp_metrics['JvmMetrics'][metric] label.append("_target") - self.common_metrics['JvmMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, key]), descriptions, labels=label) + self.common_metrics['JvmMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, key]), descriptions, + labels=label) def setup_os_labels(self): for metric in self.tmp_metrics['OperatingSystem']: label = ["cluster", "_target"] snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() name = "_".join([self.prefix, snake_case]) - self.common_metrics['OperatingSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['OperatingSystem'][metric], labels=label) + self.common_metrics['OperatingSystem'][metric] = GaugeMetricFamily(name, + self.tmp_metrics['OperatingSystem'][ + metric], labels=label) def setup_rpc_labels(self): num_rpc_flag, avg_rpc_flag = 1, 1 @@ -170,7 +175,9 @@ def setup_rpc_labels(self): key = metric label.append("_target") name = "_".join([self.prefix, snake_case]) - self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcActivity'][metric], labels=label) + self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, + self.tmp_metrics['RpcActivity'][metric], + labels=label) def setup_rpc_detailed_labels(self): for metric in self.tmp_metrics['RpcDetailedActivity']: @@ -183,7 +190,9 @@ def setup_rpc_detailed_labels(self): name = "_".join([self.prefix, 'rpc_detailed_method_avg_time_milliseconds']) else: continue - self.common_metrics['RpcDetailedActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcDetailedActivity'][metric], labels=label) + self.common_metrics['RpcDetailedActivity'][key] = GaugeMetricFamily(name, + self.tmp_metrics['RpcDetailedActivity'][ + metric], labels=label) return self.common_metrics def setup_ugi_labels(self): @@ -214,7 +223,9 @@ def setup_ugi_labels(self): label.append("_target") snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() name = "_".join([self.prefix, 'ugi', snake_case]) - self.common_metrics['UgiMetrics'][metric] = GaugeMetricFamily(name, self.tmp_metrics['UgiMetrics'][metric], labels=label) + self.common_metrics['UgiMetrics'][metric] = GaugeMetricFamily(name, + self.tmp_metrics['UgiMetrics'][metric], + 
labels=label) def setup_metric_system_labels(self): metric_num_flag, metric_avg_flag = 1, 1 @@ -226,7 +237,8 @@ def setup_metric_system_labels(self): label.extend(["oper", "_target"]) metric_num_flag = 0 name = "_".join([self.prefix, 'metricssystem_operations_total']) - self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, "Total number of operations", labels=label) + self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, "Total number of operations", + labels=label) else: continue elif 'AvgTime' in metric: @@ -243,14 +255,17 @@ def setup_metric_system_labels(self): label.append("_target") snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() name = "_".join([self.prefix, 'metricssystem', snake_case]) - self.common_metrics['MetricsSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['MetricsSystem'][metric], labels=label) + self.common_metrics['MetricsSystem'][metric] = GaugeMetricFamily(name, + self.tmp_metrics['MetricsSystem'][ + metric], labels=label) def setup_runtime_labels(self): for metric in self.tmp_metrics['Runtime']: label = ["cluster", "host", "_target"] snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() name = "_".join([self.prefix, snake_case, "milliseconds"]) - self.common_metrics['Runtime'][metric] = GaugeMetricFamily(name, self.tmp_metrics['Runtime'][metric], labels=label) + self.common_metrics['Runtime'][metric] = GaugeMetricFamily(name, self.tmp_metrics['Runtime'][metric], + labels=label) def get_jvm_metrics(self, bean): for metric in self.tmp_metrics['JvmMetrics']: @@ -373,7 +388,8 @@ def get_ugi_metrics(self, bean): key = metric label = [self.cluster] label.append(self.target) - self.common_metrics['UgiMetrics'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + self.common_metrics['UgiMetrics'][key].add_metric(label, + bean[metric] if metric in bean and bean[metric] else 0) def get_metric_system_metrics(self, bean): for metric in self.tmp_metrics['MetricsSystem']: @@ -389,9 +405,11 @@ def get_metric_system_metrics(self, bean): key = metric label = [self.cluster] label.append(self.target) - self.common_metrics['MetricsSystem'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + self.common_metrics['MetricsSystem'][key].add_metric(label, + bean[metric] if metric in bean and bean[metric] else 0) def get_runtime_metrics(self, bean): for metric in self.tmp_metrics['Runtime']: label = [self.cluster, bean['Name'].split("@")[1], self.target] - self.common_metrics['Runtime'][metric].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) + self.common_metrics['Runtime'][metric].add_metric(label, + bean[metric] if metric in bean and bean[metric] else 0) diff --git a/cmd/hive_server.py b/cmd/hive_server.py index 27f803c..e1bcf09 100644 --- a/cmd/hive_server.py +++ b/cmd/hive_server.py @@ -1,195 +1,69 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import yaml -import re -import time -from sys import exit -from prometheus_client import start_http_server -from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily, REGISTRY +from prometheus_client.core import GaugeMetricFamily +from scraper import ScrapeMetrics -import utils from utils import get_module_logger -from consul import Consul -from common import MetricCollector, CommonMetricCollector +from common import MetricCollector logger = get_module_logger(__name__) class HiveServerMetricCollector(MetricCollector): - def __init__(self, cluster, url): - 
MetricCollector.__init__(self, cluster, url, "hive", "hiveserver2") - self._hadoop_hiveserver2_metrics = {} - for i in range(len(self._file_list)): - self._hadoop_hiveserver2_metrics.setdefault(self._file_list[i], {}) + def __init__(self, cluster, urls): + MetricCollector.__init__(self, cluster, "hive", "hiveserver2") + self.target = "-" + self.urls = urls - def collect(self): - # Request data from ambari Collect Host API - # Request exactly the System level information we need from node - # beans returns a type of 'List' - try: - count = 0 - # In case no metrics we need in the jmx url, a time sleep and while-loop was set here to wait for the KEY metrics - while count < 5: - beans = utils.get_metrics(self._url) - if 'init_total_count_tables' not in beans: - count += 1 - time.sleep(1) - continue - else: - break - except: - logger.info("Can't scrape metrics from url: {0}".format(self._url)) - else: - pass - finally: - # set up all metrics with labels and descriptions. - self.setup_labels(beans) - - # add metric value to every metric. - self.get_metrics(beans) - - # update namenode metrics with common metrics - common_metrics = CommonMetricCollector(self._cluster, beans, "hive", "hiveserver2") - self._hadoop_hiveserver2_metrics.update(common_metrics()) - - for i in range(len(self._merge_list)): - service = self._merge_list[i] - for metric in self._hadoop_hiveserver2_metrics[service]: - yield self._hadoop_hiveserver2_metrics[service][metric] + self.hadoop_hiveserver2_metrics = {} + for i in range(len(self.file_list)): + self.hadoop_hiveserver2_metrics.setdefault(self.file_list[i], {}) - def setup_node_labels(self, bean, service): - label = ["cluster", "host", "client_id", "node_id"] - for metric in self._metrics[service]: - if metric in bean: - name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() - self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily( - "_".join([self._prefix, 'producer_node', name]), - self._metrics[service][metric], - labels=label) + self.scrape_metrics = ScrapeMetrics(urls) - def setup_topic_labels(self, bean, service): - label = ["cluster", "host", "client_id", "topic"] - for metric in self._metrics[service]: - if metric in bean: - name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() - self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily( - "_".join([self._prefix, 'producer_topic', name]), - self._metrics[service][metric], - labels=label) - - def setup_producer_labels(self, bean, service): - label = ["cluster", "host", "client_id"] - for metric in self._metrics[service]: - if metric in bean: - name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() - self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily("_".join([self._prefix, name]), - self._metrics[service][metric], - labels=label) - - def setup_other_labels(self, bean, service): - label = ["cluster", "host"] - for metric in self._metrics[service]: - if metric in bean: - name = re.sub('[^a-z0-9A-Z]', '_', metric).lower() - self._hadoop_hiveserver2_metrics[service][metric] = GaugeMetricFamily("_".join([self._prefix, name]), - self._metrics[service][metric], - labels=label) - - def setup_labels(self, beans): - # The metrics we want to export. 
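# The rewrite on the "+" side of this hunk drops the one-shot constructor
# scrape: the collector is now driven entirely by the Prometheus registry,
# which calls collect() on every scrape, and collect() re-fetches all JMX URLs
# and yields fresh metric families. A toy collector showing just that contract
# (names and values hypothetical, not from this repo):
from prometheus_client.core import GaugeMetricFamily

class ToyCollector(object):
    def collect(self):
        # Built fresh on each scrape, so samples never go stale.
        g = GaugeMetricFamily("toy_up", "1 if the toy scrape ran.", labels=["cluster"])
        g.add_metric(["c1"], 1.0)
        yield g

# REGISTRY.register(ToyCollector()) would expose: toy_up{cluster="c1"} 1.0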
- for service in self._metrics: + def collect(self): + isSetup = False + beans_list = self.scrape_metrics.scrape() + for beans in beans_list: + if not isSetup: + self.setup_metrics_labels(beans) + isSetup = True for i in range(len(beans)): - if 'producer-node-metrics' == service and 'type=producer-node-metrics' in beans[i]['name']: - self._setup_node_labels(beans[i], service) - elif 'producer-topic-metrics' == service and 'type=producer-topic-metrics' in beans[i]['name']: - self._setup_topic_labels(beans[i], service) - elif 'producer-metrics' == service and 'type=producer-metrics' in beans[i][ - 'name'] or 'kafka-metrics-count' == service and 'type=kafka-metrics-count' in beans[i]['name']: - self._setup_producer_labels(beans[i], service) - elif service in beans[i]['name']: - self._setup_other_labels(beans[i], service) - else: - continue - - def get_node_metrics(self, bean, service, host): - client_id = bean['name'].split('client-id=')[1].split(',')[0] - node_id = bean['name'].split('node-id=')[1].split(',')[0] - for metric in bean: - if metric in self._metrics[service]: - self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host, client_id, node_id], - bean[metric]) - else: - continue - - def get_topic_metrics(self, bean, service, host): - client_id = bean['name'].split('client-id=')[1].split(',')[0] - topic = bean['name'].split('topic=')[1].split(',')[0] - for metric in bean: - if metric in self._metrics[service]: - self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host, client_id, topic], - bean[metric]) - else: - continue - - def get_producer_metrics(self, bean, service, host): - client_id = bean['name'].split('client-id=')[1].split(',')[0] - for metric in bean: - if metric in self._metrics[service]: - self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host, client_id], - bean[metric]) - else: - continue + if 'tag.Hostname' in beans[i]: + self.target = beans[i]["tag.Hostname"] + break + self.get_metrics(beans) - def get_other_metrics(self, bean, service, host): - for metric in bean: - if metric in self._metrics[service]: - self._hadoop_hiveserver2_metrics[service][metric].add_metric([self._cluster, host], bean[metric]) - else: - continue + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.hadoop_hiveserver2_metrics: + for metric in self.hadoop_hiveserver2_metrics[service]: + yield self.hadoop_hiveserver2_metrics[service][metric] + + def setup_metrics_labels(self, beans): + # for i in range(len(beans)): + # if 'Hadoop:service=hiveserver2,name=hiveserver2' in beans[i]['name']: + # self.setup_hiveserver2_labels() + self.setup_hiveserver2_labels() + + def setup_hiveserver2_labels(self): + # for metric in self.metrics['HiveServer2']: + label = ["cluster", "method", "_target"] + key = "Hiveserver2" + name = "_".join([self.prefix, key]) + description = "Hive Server2 metric." 
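# Design note on the line below: instead of one family per JMX key (as in the
# deleted code), the whole bean fans into a single family, with each key
# carried in the "method" label. Sample output would look like this (metric
# key and values hypothetical):
#   key = "Hiveserver2"
#   method = "memory.heap.used".replace('.', '_').replace('-', '_')
#   -> hadoop_hive_hiveserver2_Hiveserver2{cluster="c1",method="memory_heap_used",_target="host01"} 1.2e8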
+ self.hadoop_hiveserver2_metrics['HiveServer2'][key] = GaugeMetricFamily(name, description, labels=label) def get_metrics(self, beans): - # bean is a type of - # status is a type of - for i in range(len(beans)): - if 'tag.Hostname' in beans[i]: - host = beans[i]['tag.Hostname'] - break - else: - continue for i in range(len(beans)): - for service in self._metrics: - if 'producer-node-metrics' == service and 'type=producer-node-metrics' in beans[i]['name']: - self._get_node_metrics(beans[i], service, host) - elif 'producer-topic-metrics' == service and 'type=producer-topic-metrics' in beans[i]['name']: - self._get_topic_metrics(beans[i], service, host) - elif 'producer-metrics' == service and 'type=producer-metrics' in beans[i][ - 'name'] or 'kafka-metrics-count' == service and 'type=kafka-metrics-count' in beans[i]['name']: - self._get_producer_metrics(beans[i], service, host) - elif service in beans[i]['name']: - self._get_other_metrics(beans[i], service, host) - else: - continue - - -def main(): - try: - args = utils.parse_args() - port = int(args.port) - cluster = args.cluster - v = args.hive_url - REGISTRY.register(HiveServerMetricCollector(cluster, v)) - - start_http_server(port) - # print("Polling %s. Serving at port: %s" % (args.address, port)) - print("Polling %s. Serving at port: %s" % (args.address, port)) - while True: - time.sleep(1) - except KeyboardInterrupt: - print(" Interrupted") - exit(0) - - -if __name__ == "__main__": - main() \ No newline at end of file + if 'Hadoop:service=hiveserver2,name=hiveserver2' in beans[i]['name']: + self.get_hiveserver2_labels(beans[i]) + + def get_hiveserver2_labels(self, bean): + for metric in self.metrics['HiveServer2']: + key = "Hiveserver2" + method = metric.replace('.', '_').replace('-', '_') + label = [self.cluster, method, self.target] + self.hadoop_hiveserver2_metrics['HiveServer2'][key].add_metric(label, bean[metric] if metric in bean else 0) diff --git a/cmd/scraper.py b/cmd/scraper.py index 1307c92..83c2204 100644 --- a/cmd/scraper.py +++ b/cmd/scraper.py @@ -4,6 +4,7 @@ import threading import requests import requests_kerberos +from requests.auth import HTTPBasicAuth from cmd.utils import get_module_logger @@ -25,7 +26,10 @@ def run(self): result = [] try: s = requests.session() - response = s.get(self.url, timeout=5, auth=self.auth_kerberos) + if 'mbean' in self.url: + response = s.get(self.url, timeout=5, auth=HTTPBasicAuth('admin','')) + else: + response = s.get(self.url, timeout=5, auth=self.auth_kerberos) except Exception as e: logger.warning("Get {0} failed, error: {1}.".format(self.url, str(e))) else: @@ -33,10 +37,13 @@ def run(self): logger.warning("Get {0} failed, response code is: {1}.".format(self.url, response.status_code)) else: rlt = response.json() - if rlt and "beans" in rlt: - result = rlt['beans'] + if 'mbean' in self.url and rlt: + result = rlt else: - logger.warning("No metrics get in the {0}.".format(self.url)) + if rlt and "beans" in rlt: + result = rlt['beans'] + else: + logger.warning("No metrics get in the {0}.".format(self.url)) s.close() if len(result) > 0: self.result.append(result) diff --git a/cmd/trino_coordinator.py b/cmd/trino_coordinator.py new file mode 100644 index 0000000..d80ce6c --- /dev/null +++ b/cmd/trino_coordinator.py @@ -0,0 +1,106 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from prometheus_client.core import GaugeMetricFamily +from scraper import ScrapeMetrics + +from utils import get_module_logger +from common import MetricCollector + +logger = get_module_logger(__name__) + + 
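# The scraper.py hunk above picks auth by URL shape: Trino-style JMX REST
# endpoints (URLs containing "mbean") get HTTP basic auth, everything else
# keeps the Kerberos ticket cache at /tmp/krb5cc_hdfs. A condensed sketch of
# that dispatch; the admin/empty-password pair and cache path are the patch's
# own placeholders, not recommendations.
import os
import requests
import requests_kerberos
from requests.auth import HTTPBasicAuth

def fetch_jmx(url, timeout=5):
    os.environ['KRB5CCNAME'] = '/tmp/krb5cc_hdfs'
    if 'mbean' in url:
        auth = HTTPBasicAuth('admin', '')
    else:
        auth = requests_kerberos.HTTPKerberosAuth(mutual_authentication=requests_kerberos.OPTIONAL)
    return requests.get(url, timeout=timeout, auth=auth)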
+class TrinoCoordinatorMetricCollector(MetricCollector): + + def __init__(self, cluster, urls): + MetricCollector.__init__(self, cluster, "trino", "coordinator") + self.trino_coordinator_prefix = 'trino_coordinator' + self.target = "-" + self.urls = urls + + self.trino_coordinator_metrics = {} + for i in range(len(self.file_list)): + self.trino_coordinator_metrics.setdefault(self.file_list[i], {}) + + self.scrape_metrics = ScrapeMetrics(urls) + + def collect(self): + isSetup = False + # isGetHost = False + beans_list = self.scrape_metrics.scrape() + for beans in beans_list: + if not isSetup: + self.setup_metrics_labels(beans) + isSetup = True + # if not isGetHost: + for i in range(len(beans)): + if 'java.lang:type=Runtime' in beans[i]['objectName']: + self.target = beans[i]['attributes'][0]['value'].split('@')[1] + # isGetHost = True + break + self.get_metrics(beans) + + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.trino_coordinator_metrics: + for metric in self.trino_coordinator_metrics[service]: + yield self.trino_coordinator_metrics[service][metric] + + def setup_metrics_labels(self, beans): + for i in range(len(beans)): + if 'java.lang:type=Memory' in beans[i]['objectName']: + self.setup_trino_coor_labels('Memory') + if 'java.lang:type=Threading' in beans[i]['objectName']: + self.setup_trino_coor_labels('Threading') + if 'trino.execution:name=QueryManager' in beans[i]['objectName']: + self.setup_trino_coor_labels('QueryManager') + if 'trino.execution:name=SqlTaskManager' in beans[i]['objectName']: + self.setup_trino_coor_labels('SqlTaskManager') + if 'trino.failuredetector:name=HeartbeatFailureDetector' in beans[i]['objectName']: + self.setup_trino_coor_labels('HeartbeatFailureDetector') + if 'trino.memory:name=ClusterMemoryManager' in beans[i]['objectName']: + self.setup_trino_coor_labels('ClusterMemoryManager') + if 'trino.memory:type=ClusterMemoryPool,name=general' in beans[i]['objectName']: + self.setup_trino_coor_labels('ClusterMemoryPool') + + def setup_trino_coor_labels(self, kind): + label = ["cluster", "method", "_target"] + name = "_".join([self.trino_coordinator_prefix, kind]) + description = "Trino Coordinator {0} metric.".format(kind) + # No finer breakdown for now; if keys within each kind are split out later, this is where to use them + key = kind + self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label) + + def get_metrics(self, beans): + for i in range(len(beans)): + if 'java.lang:type=Memory' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'Memory') + if 'java.lang:type=Threading' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'Threading') + if 'trino.execution:name=QueryManager' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'QueryManager') + if 'trino.execution:name=SqlTaskManager' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'SqlTaskManager') + if 'trino.failuredetector:name=HeartbeatFailureDetector' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'HeartbeatFailureDetector') + if 'trino.memory:name=ClusterMemoryManager' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'ClusterMemoryManager') + if 'trino.memory:type=ClusterMemoryPool,name=general' in beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'ClusterMemoryPool') + + def get_trino_coor_labels(self, bean, kind): + # type(bean) = dict + for metric in self.metrics[kind]: + value = 0 + for attr in bean['attributes']: + key = kind + method = metric.replace('.', 
'_').replace(':', '_').replace('-', '_') + label = [self.cluster, method, self.target] + if attr['name'] == metric: + if kind == 'Memory' and 'HeapMemoryUsage' in metric: + manu = 'used' + value = attr['value'][manu] + else: + value = attr['value'] + break + self.trino_coordinator_metrics[kind][key].add_metric(label, value) \ No newline at end of file diff --git a/cmd/utils.py b/cmd/utils.py index ce936e2..8726b1f 100644 --- a/cmd/utils.py +++ b/cmd/utils.py @@ -67,6 +67,8 @@ def parse_args(): parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*") parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*") parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*") + parser.add_argument('-hss', required=False, metavar='hiveserver_jmx_url', help='Hadoop hiveserver jmx metrics URL.', nargs="*") + parser.add_argument('-tcs', required=False, metavar='trino_coordinator_jmx_url', help='Trino coordinator jmx metrics URL.', nargs="*") parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0') parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. default: 6688', default=6688) return parser.parse_args() diff --git a/hadoop_jmx_exporter.py b/hadoop_jmx_exporter.py index e97f1fa..01c75ca 100755 --- a/hadoop_jmx_exporter.py +++ b/hadoop_jmx_exporter.py @@ -6,11 +6,13 @@ from prometheus_client.core import REGISTRY from cmd import utils +from cmd.trino_coordinator import TrinoCoordinatorMetricCollector from cmd.utils import get_module_logger from cmd.hdfs_namenode import NameNodeMetricCollector from cmd.hdfs_datanode import DataNodeMetricCollector from cmd.hdfs_journalnode import JournalNodeMetricCollector from cmd.yarn_resourcemanager import ResourceManagerMetricCollector +from cmd.hive_server import HiveServerMetricCollector from cmd.yarn_nodemanager import NodeManagerMetricCollector logger = get_module_logger(__name__) @@ -26,9 +28,13 @@ def register_prometheus(cluster, args): rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue) rmc.collect() REGISTRY.register(rmc) - # REGISTRY.register(NodeManagerMetricCollector(cluster, rmc)) + REGISTRY.register(NodeManagerMetricCollector(cluster, rmc)) if args.jns is not None and len(args.jns) > 0: REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns)) + if args.hss is not None and len(args.hss) > 0: + REGISTRY.register(HiveServerMetricCollector(cluster, args.hss)) + if args.tcs is not None and len(args.tcs) > 0: + REGISTRY.register(TrinoCoordinatorMetricCollector(cluster, args.tcs)) def main(): @@ -39,6 +45,7 @@ def main(): print "Listen at %s:%s" % (host, port) register_prometheus(args.cluster, args) while True: + print("sleep 300s") time.sleep(300) diff --git a/metrics/coordinator/ClusterMemoryManager.json b/metrics/coordinator/ClusterMemoryManager.json new file mode 100644 index 0000000..0ca1f52 --- /dev/null +++ b/metrics/coordinator/ClusterMemoryManager.json @@ -0,0 +1,4 @@ +{ + "ClusterMemoryBytes": "Cluster memory bytes", + "QueriesKilledDueToOutOfMemory": "Cumulative count (since Trino started) of queries that ran out of memory and were killed" +} \ No newline at end of file diff --git a/metrics/coordinator/ClusterMemoryPool.json b/metrics/coordinator/ClusterMemoryPool.json new file mode 100644 index 
--- /dev/null
+++ b/metrics/coordinator/ClusterMemoryPool.json
@@ -0,0 +1,3 @@
+{
+    "FreeDistributedBytes": "Free memory (general pool)"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/HeartbeatFailureDetector.json b/metrics/coordinator/HeartbeatFailureDetector.json
new file mode 100644
index 0000000..5cf19b4
--- /dev/null
+++ b/metrics/coordinator/HeartbeatFailureDetector.json
@@ -0,0 +1,5 @@
+{
+    "TotalCount": "Total nodes",
+    "ActiveCount": "Active nodes",
+    "FailedCount": "Failed nodes"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/Memory.json b/metrics/coordinator/Memory.json
new file mode 100644
index 0000000..625437e
--- /dev/null
+++ b/metrics/coordinator/Memory.json
@@ -0,0 +1,4 @@
+{
+    "HeapMemoryUsage": "Heap size",
+    "NonHeapMemoryUsage": "Non-heap size"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/QueryManager.json b/metrics/coordinator/QueryManager.json
new file mode 100644
index 0000000..7ff03c6
--- /dev/null
+++ b/metrics/coordinator/QueryManager.json
@@ -0,0 +1,14 @@
+{
+    "RunningQueries": "Total number of running queries",
+    "QueuedQueries": "Total number of queued queries",
+    "AbandonedQueries.FiveMinute.Count": "Abandoned queries from last 5 min",
+    "CanceledQueries.FiveMinute.Count": "Canceled queries from last 5 min",
+    "CompletedQueries.FiveMinute.Count": "Completed queries from last 5 min",
+    "StartedQueries.FiveMinute.Count": "Queries started from last 5 min",
+    "FailedQueries.FiveMinute.Count": "Failed queries from last 5 min (all)",
+    "InternalFailures.FiveMinute.Count": "Failed queries from last 5 min (internal)",
+    "ExternalFailures.FiveMinute.Count": "Failed queries from last 5 min (external)",
+    "UserErrorFailures.FiveMinute.Count": "Failed queries from last 5 min (user)",
+    "ExecutionTime.FiveMinutes.P50": "Execution latency (P50)",
+    "WallInputBytesRate.FiveMinutes.P90": "Input data rate (P90)"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/SqlTaskManager.json b/metrics/coordinator/SqlTaskManager.json
new file mode 100644
index 0000000..f836775
--- /dev/null
+++ b/metrics/coordinator/SqlTaskManager.json
@@ -0,0 +1,10 @@
+{
+    "InputDataSize.FiveMinute.Count": "Input data bytes count",
+    "InputDataSize.FiveMinute.Rate": "Input data bytes rate",
+    "OutputDataSize.FiveMinute.Count": "Output data bytes count",
+    "OutputDataSize.FiveMinute.Rate": "Output data bytes rate",
+    "InputPositions.FiveMinute.Count": "Input rows count",
+    "InputPositions.FiveMinute.Rate": "Input rows rate",
+    "OutputPositions.FiveMinute.Count": "Output rows count",
+    "OutputPositions.FiveMinute.Rate": "Output rows rate"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/Threading.json b/metrics/coordinator/Threading.json
new file mode 100644
index 0000000..2587706
--- /dev/null
+++ b/metrics/coordinator/Threading.json
@@ -0,0 +1,5 @@
+{
+    "ThreadCount": "Number of threads",
+    "PeakThreadCount": "Peak number of threads",
+    "DaemonThreadCount": "Number of daemon threads"
+}
\ No newline at end of file
diff --git a/metrics/hiveserver2/HiveServer2.json b/metrics/hiveserver2/HiveServer2.json
new file mode 100644
index 0000000..bd5e4be
--- /dev/null
+++ b/metrics/hiveserver2/HiveServer2.json
@@ -0,0 +1,705 @@
+{
+  "buffers.direct.capacity" : "Description of the metric",
+  "buffers.direct.count" : "Description of the metric",
+  "buffers.direct.used" : "Description of the metric",
+  "buffers.mapped.capacity" : "Description of the metric",
+  "buffers.mapped.count" : "Description of the metric",
+  "buffers.mapped.used" : 
"Description of the metric", + "classLoading.loaded" : "Description of the metric", + "classLoading.unloaded" : "Description of the metric", + "exec_async_pool_size" : "Description of the metric", + "exec_async_queue_size" : "Description of the metric", + "gc.G1-Old-Generation.count" : "Description of the metric", + "gc.G1-Old-Generation.time" : "Description of the metric", + "gc.G1-Young-Generation.count" : "Description of the metric", + "gc.G1-Young-Generation.time" : "Description of the metric", + "hs2_active_sessions" : "Description of the metric", + "hs2_avg_active_session_time" : "Description of the metric", + "hs2_avg_open_session_time" : "Description of the metric", + "hs2_open_sessions" : "Description of the metric", + "memory.heap.committed" : "Description of the metric", + "memory.heap.init" : "Description of the metric", + "memory.heap.max" : "Description of the metric", + "memory.heap.usage" : "Description of the metric", + "memory.heap.used" : "Description of the metric", + "memory.non-heap.committed" : "Description of the metric", + "memory.non-heap.init" : "Description of the metric", + "memory.non-heap.max" : "Description of the metric", + "memory.non-heap.usage" : "Description of the metric", + "memory.non-heap.used" : "Description of the metric", + "memory.pools.Code-Cache.usage" : "Description of the metric", + "memory.pools.Compressed-Class-Space.usage" : "Description of the metric", + "memory.pools.G1-Eden-Space.usage" : "Description of the metric", + "memory.pools.G1-Old-Gen.usage" : "Description of the metric", + "memory.pools.G1-Survivor-Space.usage" : "Description of the metric", + "memory.pools.Metaspace.usage" : "Description of the metric", + "memory.total.committed" : "Description of the metric", + "memory.total.init" : "Description of the metric", + "memory.total.max" : "Description of the metric", + "memory.total.used" : "Description of the metric", + "qc_current_size" : "Description of the metric", + "qc_max_size" : "Description of the metric", + "threads.blocked.count" : "Description of the metric", + "threads.count" : "Description of the metric", + "threads.daemon.count" : "Description of the metric", + "threads.deadlock.count" : "Description of the metric", + "threads.new.count" : "Description of the metric", + "threads.runnable.count" : "Description of the metric", + "threads.terminated.count" : "Description of the metric", + "threads.timed_waiting.count" : "Description of the metric", + "threads.waiting.count" : "Description of the metric", + "active_calls_api_Driver.execute" : "Description of the metric", + "active_calls_api_Driver.run" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook" : "Description of the metric", + "active_calls_api_FileMoves" : "Description of the metric", + "active_calls_api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook" : "Description of the metric", + "active_calls_api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook" : "Description of the metric", + "active_calls_api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook" : "Description of the metric", + 
"active_calls_api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook" : "Description of the metric", + "active_calls_api_RemoveTempOrDuplicateFiles" : "Description of the metric", + "active_calls_api_RenameOrMoveFiles" : "Description of the metric", + "active_calls_api_TezBuildDag" : "Description of the metric", + "active_calls_api_TezCompiler" : "Description of the metric", + "active_calls_api_TezGetSession" : "Description of the metric", + "active_calls_api_TezRunDag" : "Description of the metric", + "active_calls_api_TezSubmitDag" : "Description of the metric", + "active_calls_api_TezSubmitToRunningDag" : "Description of the metric", + "active_calls_api_acquireReadWriteLocks" : "Description of the metric", + "active_calls_api_compile" : "Description of the metric", + "active_calls_api_deserializePlan" : "Description of the metric", + "active_calls_api_doAuthorization" : "Description of the metric", + "active_calls_api_hs2_operation_INITIALIZED" : "Description of the metric", + "active_calls_api_hs2_operation_PENDING" : "Description of the metric", + "active_calls_api_hs2_operation_RUNNING" : "Description of the metric", + "active_calls_api_hs2_sql_operation_PENDING" : "Description of the metric", + "active_calls_api_hs2_sql_operation_RUNNING" : "Description of the metric", + "active_calls_api_optimizer" : "Description of the metric", + "active_calls_api_parse" : "Description of the metric", + "active_calls_api_partition-retrieving" : "Description of the metric", + "active_calls_api_releaseLocks" : "Description of the metric", + "active_calls_api_runTasks" : "Description of the metric", + "active_calls_api_semanticAnalyze" : "Description of the metric", + "active_calls_api_serializePlan" : "Description of the metric", + "active_calls_api_waitCompile" : "Description of the metric", + "active_calls_hs2_compiling_queries" : "Description of the metric", + "active_calls_hs2_executing_queries" : "Description of the metric", + "active_calls_hs2_submitted_queries" : "Description of the metric", + "cumulative_connection_count" : "Description of the metric", + "hive_tez_tasks" : "Description of the metric", + "hs2_completed_operation_CANCELED" : "Description of the metric", + "hs2_completed_operation_CLOSED" : "Description of the metric", + "hs2_completed_operation_ERROR" : "Description of the metric", + "hs2_completed_operation_FINISHED" : "Description of the metric", + "hs2_completed_sql_operation_CANCELED" : "Description of the metric", + "hs2_completed_sql_operation_CLOSED" : "Description of the metric", + "hs2_completed_sql_operation_ERROR" : "Description of the metric", + "hs2_completed_sql_operation_FINISHED" : "Description of the metric", + "hs2_sql_operation_active_user" : "Description of the metric", + "jvm.pause.extraSleepTime" : "Description of the metric", + "jvm.pause.info-threshold" : "Description of the metric", + "open_connections" : "Description of the metric", + "open_operations" : "Description of the metric", + "qc_invalid_for_caching" : "Description of the metric", + "qc_lookups" : "Description of the metric", + "waiting_compile_ops" : "Description of the metric", + "zookeeper_hive_exclusivelocks" : "Description of the metric", + "zookeeper_hive_sharedlocks" : "Description of the metric", + "hs2_failed_queries_count" : "Description of the metric", + "hs2_failed_queries_mean_rate" : "Description of the metric", + "hs2_failed_queries_1min_rate" : "Description of the metric", + "hs2_failed_queries_5min_rate" : "Description of the metric", + 
"hs2_failed_queries_15min_rate" : "Description of the metric", + "hs2_succeeded_queries_count" : "Description of the metric", + "hs2_succeeded_queries_mean_rate" : "Description of the metric", + "hs2_succeeded_queries_1min_rate" : "Description of the metric", + "hs2_succeeded_queries_5min_rate" : "Description of the metric", + "hs2_succeeded_queries_15min_rate" : "Description of the metric", + "api_Driver.execute_count" : "Description of the metric", + "api_Driver.execute_mean_rate" : "Description of the metric", + "api_Driver.execute_1min_rate" : "Description of the metric", + "api_Driver.execute_5min_rate" : "Description of the metric", + "api_Driver.execute_15min_rate" : "Description of the metric", + "api_Driver.execute_mean" : "Description of the metric", + "api_Driver.execute_min" : "Description of the metric", + "api_Driver.execute_max" : "Description of the metric", + "api_Driver.execute_median" : "Description of the metric", + "api_Driver.execute_stddev" : "Description of the metric", + "api_Driver.execute_75thpercentile" : "Description of the metric", + "api_Driver.execute_95thpercentile" : "Description of the metric", + "api_Driver.execute_98thpercentile" : "Description of the metric", + "api_Driver.execute_99thpercentile" : "Description of the metric", + "api_Driver.execute_999thpercentile" : "Description of the metric", + "api_Driver.run_count" : "Description of the metric", + "api_Driver.run_mean_rate" : "Description of the metric", + "api_Driver.run_1min_rate" : "Description of the metric", + "api_Driver.run_5min_rate" : "Description of the metric", + "api_Driver.run_15min_rate" : "Description of the metric", + "api_Driver.run_mean" : "Description of the metric", + "api_Driver.run_min" : "Description of the metric", + "api_Driver.run_max" : "Description of the metric", + "api_Driver.run_median" : "Description of the metric", + "api_Driver.run_stddev" : "Description of the metric", + "api_Driver.run_75thpercentile" : "Description of the metric", + "api_Driver.run_95thpercentile" : "Description of the metric", + "api_Driver.run_98thpercentile" : "Description of the metric", + "api_Driver.run_99thpercentile" : "Description of the metric", + "api_Driver.run_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_75thpercentile" : "Description of the metric", + 
"api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_median" : "Description of the metric", + 
"api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_999thpercentile" : "Description of the metric", + "api_FileMoves_count" : "Description of the metric", + "api_FileMoves_mean_rate" : "Description of the metric", + "api_FileMoves_1min_rate" : "Description of the metric", + "api_FileMoves_5min_rate" : "Description of the metric", + "api_FileMoves_15min_rate" : "Description of the metric", + "api_FileMoves_mean" : "Description of the metric", + "api_FileMoves_min" : "Description of the metric", + "api_FileMoves_max" : "Description of the metric", + "api_FileMoves_median" : "Description of the metric", + "api_FileMoves_stddev" : "Description of the metric", + "api_FileMoves_75thpercentile" : "Description of the metric", + "api_FileMoves_95thpercentile" : "Description of the metric", + "api_FileMoves_98thpercentile" : "Description of the metric", + "api_FileMoves_99thpercentile" : "Description of the metric", + "api_FileMoves_999thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_count" : "Description of the metric", + 
"api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_1min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_5min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_15min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_min" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_max" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_median" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_stddev" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_75thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_95thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_98thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_99thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_999thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_count" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_1min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_5min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_15min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_min" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_max" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_median" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_stddev" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_75thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_95thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_98thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_99thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_999thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_count" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean_rate" : "Description of the metric", + 
"api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_1min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_5min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_15min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_min" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_max" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_median" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_stddev" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_75thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_95thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_98thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_99thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_999thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_count" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_mean_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_1min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_5min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_15min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_mean" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_min" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_max" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_median" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_stddev" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_75thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_95thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_98thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_99thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_999thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_count" : "Description of the metric", + 
"api_RemoveTempOrDuplicateFiles_mean_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_1min_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_5min_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_15min_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_mean" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_min" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_max" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_median" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_stddev" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_75thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_95thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_98thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_99thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_999thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_count" : "Description of the metric", + "api_RenameOrMoveFiles_mean_rate" : "Description of the metric", + "api_RenameOrMoveFiles_1min_rate" : "Description of the metric", + "api_RenameOrMoveFiles_5min_rate" : "Description of the metric", + "api_RenameOrMoveFiles_15min_rate" : "Description of the metric", + "api_RenameOrMoveFiles_mean" : "Description of the metric", + "api_RenameOrMoveFiles_min" : "Description of the metric", + "api_RenameOrMoveFiles_max" : "Description of the metric", + "api_RenameOrMoveFiles_median" : "Description of the metric", + "api_RenameOrMoveFiles_stddev" : "Description of the metric", + "api_RenameOrMoveFiles_75thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_95thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_98thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_99thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_999thpercentile" : "Description of the metric", + "api_TezBuildDag_count" : "Description of the metric", + "api_TezBuildDag_mean_rate" : "Description of the metric", + "api_TezBuildDag_1min_rate" : "Description of the metric", + "api_TezBuildDag_5min_rate" : "Description of the metric", + "api_TezBuildDag_15min_rate" : "Description of the metric", + "api_TezBuildDag_mean" : "Description of the metric", + "api_TezBuildDag_min" : "Description of the metric", + "api_TezBuildDag_max" : "Description of the metric", + "api_TezBuildDag_median" : "Description of the metric", + "api_TezBuildDag_stddev" : "Description of the metric", + "api_TezBuildDag_75thpercentile" : "Description of the metric", + "api_TezBuildDag_95thpercentile" : "Description of the metric", + "api_TezBuildDag_98thpercentile" : "Description of the metric", + "api_TezBuildDag_99thpercentile" : "Description of the metric", + "api_TezBuildDag_999thpercentile" : "Description of the metric", + "api_TezCompiler_count" : "Description of the metric", + "api_TezCompiler_mean_rate" : "Description of the metric", + "api_TezCompiler_1min_rate" : "Description of the metric", + "api_TezCompiler_5min_rate" : "Description of the metric", + "api_TezCompiler_15min_rate" : "Description of the metric", + "api_TezCompiler_mean" : "Description of the metric", + "api_TezCompiler_min" : "Description of the metric", + "api_TezCompiler_max" : "Description of the metric", + "api_TezCompiler_median" : "Description of the metric", + "api_TezCompiler_stddev" : "Description 
of the metric", + "api_TezCompiler_75thpercentile" : "Description of the metric", + "api_TezCompiler_95thpercentile" : "Description of the metric", + "api_TezCompiler_98thpercentile" : "Description of the metric", + "api_TezCompiler_99thpercentile" : "Description of the metric", + "api_TezCompiler_999thpercentile" : "Description of the metric", + "api_TezGetSession_count" : "Description of the metric", + "api_TezGetSession_mean_rate" : "Description of the metric", + "api_TezGetSession_1min_rate" : "Description of the metric", + "api_TezGetSession_5min_rate" : "Description of the metric", + "api_TezGetSession_15min_rate" : "Description of the metric", + "api_TezGetSession_mean" : "Description of the metric", + "api_TezGetSession_min" : "Description of the metric", + "api_TezGetSession_max" : "Description of the metric", + "api_TezGetSession_median" : "Description of the metric", + "api_TezGetSession_stddev" : "Description of the metric", + "api_TezGetSession_75thpercentile" : "Description of the metric", + "api_TezGetSession_95thpercentile" : "Description of the metric", + "api_TezGetSession_98thpercentile" : "Description of the metric", + "api_TezGetSession_99thpercentile" : "Description of the metric", + "api_TezGetSession_999thpercentile" : "Description of the metric", + "api_TezRunDag_count" : "Description of the metric", + "api_TezRunDag_mean_rate" : "Description of the metric", + "api_TezRunDag_1min_rate" : "Description of the metric", + "api_TezRunDag_5min_rate" : "Description of the metric", + "api_TezRunDag_15min_rate" : "Description of the metric", + "api_TezRunDag_mean" : "Description of the metric", + "api_TezRunDag_min" : "Description of the metric", + "api_TezRunDag_max" : "Description of the metric", + "api_TezRunDag_median" : "Description of the metric", + "api_TezRunDag_stddev" : "Description of the metric", + "api_TezRunDag_75thpercentile" : "Description of the metric", + "api_TezRunDag_95thpercentile" : "Description of the metric", + "api_TezRunDag_98thpercentile" : "Description of the metric", + "api_TezRunDag_99thpercentile" : "Description of the metric", + "api_TezRunDag_999thpercentile" : "Description of the metric", + "api_TezSubmitDag_count" : "Description of the metric", + "api_TezSubmitDag_mean_rate" : "Description of the metric", + "api_TezSubmitDag_1min_rate" : "Description of the metric", + "api_TezSubmitDag_5min_rate" : "Description of the metric", + "api_TezSubmitDag_15min_rate" : "Description of the metric", + "api_TezSubmitDag_mean" : "Description of the metric", + "api_TezSubmitDag_min" : "Description of the metric", + "api_TezSubmitDag_max" : "Description of the metric", + "api_TezSubmitDag_median" : "Description of the metric", + "api_TezSubmitDag_stddev" : "Description of the metric", + "api_TezSubmitDag_75thpercentile" : "Description of the metric", + "api_TezSubmitDag_95thpercentile" : "Description of the metric", + "api_TezSubmitDag_98thpercentile" : "Description of the metric", + "api_TezSubmitDag_99thpercentile" : "Description of the metric", + "api_TezSubmitDag_999thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_count" : "Description of the metric", + "api_TezSubmitToRunningDag_mean_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_1min_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_5min_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_15min_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_mean" : "Description of the metric", + 
"api_TezSubmitToRunningDag_min" : "Description of the metric", + "api_TezSubmitToRunningDag_max" : "Description of the metric", + "api_TezSubmitToRunningDag_median" : "Description of the metric", + "api_TezSubmitToRunningDag_stddev" : "Description of the metric", + "api_TezSubmitToRunningDag_75thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_95thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_98thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_99thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_999thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_count" : "Description of the metric", + "api_acquireReadWriteLocks_mean_rate" : "Description of the metric", + "api_acquireReadWriteLocks_1min_rate" : "Description of the metric", + "api_acquireReadWriteLocks_5min_rate" : "Description of the metric", + "api_acquireReadWriteLocks_15min_rate" : "Description of the metric", + "api_acquireReadWriteLocks_mean" : "Description of the metric", + "api_acquireReadWriteLocks_min" : "Description of the metric", + "api_acquireReadWriteLocks_max" : "Description of the metric", + "api_acquireReadWriteLocks_median" : "Description of the metric", + "api_acquireReadWriteLocks_stddev" : "Description of the metric", + "api_acquireReadWriteLocks_75thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_95thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_98thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_99thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_999thpercentile" : "Description of the metric", + "api_compile_count" : "Description of the metric", + "api_compile_mean_rate" : "Description of the metric", + "api_compile_1min_rate" : "Description of the metric", + "api_compile_5min_rate" : "Description of the metric", + "api_compile_15min_rate" : "Description of the metric", + "api_compile_mean" : "Description of the metric", + "api_compile_min" : "Description of the metric", + "api_compile_max" : "Description of the metric", + "api_compile_median" : "Description of the metric", + "api_compile_stddev" : "Description of the metric", + "api_compile_75thpercentile" : "Description of the metric", + "api_compile_95thpercentile" : "Description of the metric", + "api_compile_98thpercentile" : "Description of the metric", + "api_compile_99thpercentile" : "Description of the metric", + "api_compile_999thpercentile" : "Description of the metric", + "api_deserializePlan_count" : "Description of the metric", + "api_deserializePlan_mean_rate" : "Description of the metric", + "api_deserializePlan_1min_rate" : "Description of the metric", + "api_deserializePlan_5min_rate" : "Description of the metric", + "api_deserializePlan_15min_rate" : "Description of the metric", + "api_deserializePlan_mean" : "Description of the metric", + "api_deserializePlan_min" : "Description of the metric", + "api_deserializePlan_max" : "Description of the metric", + "api_deserializePlan_median" : "Description of the metric", + "api_deserializePlan_stddev" : "Description of the metric", + "api_deserializePlan_75thpercentile" : "Description of the metric", + "api_deserializePlan_95thpercentile" : "Description of the metric", + "api_deserializePlan_98thpercentile" : "Description of the metric", + "api_deserializePlan_99thpercentile" : "Description of the metric", + "api_deserializePlan_999thpercentile" : "Description of the metric", + 
"api_doAuthorization_count" : "Description of the metric", + "api_doAuthorization_mean_rate" : "Description of the metric", + "api_doAuthorization_1min_rate" : "Description of the metric", + "api_doAuthorization_5min_rate" : "Description of the metric", + "api_doAuthorization_15min_rate" : "Description of the metric", + "api_doAuthorization_mean" : "Description of the metric", + "api_doAuthorization_min" : "Description of the metric", + "api_doAuthorization_max" : "Description of the metric", + "api_doAuthorization_median" : "Description of the metric", + "api_doAuthorization_stddev" : "Description of the metric", + "api_doAuthorization_75thpercentile" : "Description of the metric", + "api_doAuthorization_95thpercentile" : "Description of the metric", + "api_doAuthorization_98thpercentile" : "Description of the metric", + "api_doAuthorization_99thpercentile" : "Description of the metric", + "api_doAuthorization_999thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_count" : "Description of the metric", + "api_hs2_operation_INITIALIZED_mean_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_1min_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_5min_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_15min_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_mean" : "Description of the metric", + "api_hs2_operation_INITIALIZED_min" : "Description of the metric", + "api_hs2_operation_INITIALIZED_max" : "Description of the metric", + "api_hs2_operation_INITIALIZED_median" : "Description of the metric", + "api_hs2_operation_INITIALIZED_stddev" : "Description of the metric", + "api_hs2_operation_INITIALIZED_75thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_95thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_98thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_99thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_999thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_count" : "Description of the metric", + "api_hs2_operation_PENDING_mean_rate" : "Description of the metric", + "api_hs2_operation_PENDING_1min_rate" : "Description of the metric", + "api_hs2_operation_PENDING_5min_rate" : "Description of the metric", + "api_hs2_operation_PENDING_15min_rate" : "Description of the metric", + "api_hs2_operation_PENDING_mean" : "Description of the metric", + "api_hs2_operation_PENDING_min" : "Description of the metric", + "api_hs2_operation_PENDING_max" : "Description of the metric", + "api_hs2_operation_PENDING_median" : "Description of the metric", + "api_hs2_operation_PENDING_stddev" : "Description of the metric", + "api_hs2_operation_PENDING_75thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_95thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_98thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_99thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_999thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_count" : "Description of the metric", + "api_hs2_operation_RUNNING_mean_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_1min_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_5min_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_15min_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_mean" : 
"Description of the metric", + "api_hs2_operation_RUNNING_min" : "Description of the metric", + "api_hs2_operation_RUNNING_max" : "Description of the metric", + "api_hs2_operation_RUNNING_median" : "Description of the metric", + "api_hs2_operation_RUNNING_stddev" : "Description of the metric", + "api_hs2_operation_RUNNING_75thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_95thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_98thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_99thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_999thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_count" : "Description of the metric", + "api_hs2_sql_operation_PENDING_mean_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_1min_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_5min_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_15min_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_mean" : "Description of the metric", + "api_hs2_sql_operation_PENDING_min" : "Description of the metric", + "api_hs2_sql_operation_PENDING_max" : "Description of the metric", + "api_hs2_sql_operation_PENDING_median" : "Description of the metric", + "api_hs2_sql_operation_PENDING_stddev" : "Description of the metric", + "api_hs2_sql_operation_PENDING_75thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_95thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_98thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_99thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_999thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_count" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_mean_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_1min_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_5min_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_15min_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_mean" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_min" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_max" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_median" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_stddev" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_75thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_95thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_98thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_99thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_999thpercentile" : "Description of the metric", + "api_optimizer_count" : "Description of the metric", + "api_optimizer_mean_rate" : "Description of the metric", + "api_optimizer_1min_rate" : "Description of the metric", + "api_optimizer_5min_rate" : "Description of the metric", + "api_optimizer_15min_rate" : "Description of the metric", + "api_optimizer_mean" : "Description of the metric", + "api_optimizer_min" : "Description of the metric", + "api_optimizer_max" : "Description of the metric", + "api_optimizer_median" : "Description of the metric", + "api_optimizer_stddev" : "Description of the metric", + "api_optimizer_75thpercentile" : "Description of the metric", 
+ "api_optimizer_95thpercentile" : "Description of the metric", + "api_optimizer_98thpercentile" : "Description of the metric", + "api_optimizer_99thpercentile" : "Description of the metric", + "api_optimizer_999thpercentile" : "Description of the metric", + "api_parse_count" : "Description of the metric", + "api_parse_mean_rate" : "Description of the metric", + "api_parse_1min_rate" : "Description of the metric", + "api_parse_5min_rate" : "Description of the metric", + "api_parse_15min_rate" : "Description of the metric", + "api_parse_mean" : "Description of the metric", + "api_parse_min" : "Description of the metric", + "api_parse_max" : "Description of the metric", + "api_parse_median" : "Description of the metric", + "api_parse_stddev" : "Description of the metric", + "api_parse_75thpercentile" : "Description of the metric", + "api_parse_95thpercentile" : "Description of the metric", + "api_parse_98thpercentile" : "Description of the metric", + "api_parse_99thpercentile" : "Description of the metric", + "api_parse_999thpercentile" : "Description of the metric", + "api_partition-retrieving_count" : "Description of the metric", + "api_partition-retrieving_mean_rate" : "Description of the metric", + "api_partition-retrieving_1min_rate" : "Description of the metric", + "api_partition-retrieving_5min_rate" : "Description of the metric", + "api_partition-retrieving_15min_rate" : "Description of the metric", + "api_partition-retrieving_mean" : "Description of the metric", + "api_partition-retrieving_min" : "Description of the metric", + "api_partition-retrieving_max" : "Description of the metric", + "api_partition-retrieving_median" : "Description of the metric", + "api_partition-retrieving_stddev" : "Description of the metric", + "api_partition-retrieving_75thpercentile" : "Description of the metric", + "api_partition-retrieving_95thpercentile" : "Description of the metric", + "api_partition-retrieving_98thpercentile" : "Description of the metric", + "api_partition-retrieving_99thpercentile" : "Description of the metric", + "api_partition-retrieving_999thpercentile" : "Description of the metric", + "api_releaseLocks_count" : "Description of the metric", + "api_releaseLocks_mean_rate" : "Description of the metric", + "api_releaseLocks_1min_rate" : "Description of the metric", + "api_releaseLocks_5min_rate" : "Description of the metric", + "api_releaseLocks_15min_rate" : "Description of the metric", + "api_releaseLocks_mean" : "Description of the metric", + "api_releaseLocks_min" : "Description of the metric", + "api_releaseLocks_max" : "Description of the metric", + "api_releaseLocks_median" : "Description of the metric", + "api_releaseLocks_stddev" : "Description of the metric", + "api_releaseLocks_75thpercentile" : "Description of the metric", + "api_releaseLocks_95thpercentile" : "Description of the metric", + "api_releaseLocks_98thpercentile" : "Description of the metric", + "api_releaseLocks_99thpercentile" : "Description of the metric", + "api_releaseLocks_999thpercentile" : "Description of the metric", + "api_runTasks_count" : "Description of the metric", + "api_runTasks_mean_rate" : "Description of the metric", + "api_runTasks_1min_rate" : "Description of the metric", + "api_runTasks_5min_rate" : "Description of the metric", + "api_runTasks_15min_rate" : "Description of the metric", + "api_runTasks_mean" : "Description of the metric", + "api_runTasks_min" : "Description of the metric", + "api_runTasks_max" : "Description of the metric", + "api_runTasks_median" : "Description of the 
metric", + "api_runTasks_stddev" : "Description of the metric", + "api_runTasks_75thpercentile" : "Description of the metric", + "api_runTasks_95thpercentile" : "Description of the metric", + "api_runTasks_98thpercentile" : "Description of the metric", + "api_runTasks_99thpercentile" : "Description of the metric", + "api_runTasks_999thpercentile" : "Description of the metric", + "api_semanticAnalyze_count" : "Description of the metric", + "api_semanticAnalyze_mean_rate" : "Description of the metric", + "api_semanticAnalyze_1min_rate" : "Description of the metric", + "api_semanticAnalyze_5min_rate" : "Description of the metric", + "api_semanticAnalyze_15min_rate" : "Description of the metric", + "api_semanticAnalyze_mean" : "Description of the metric", + "api_semanticAnalyze_min" : "Description of the metric", + "api_semanticAnalyze_max" : "Description of the metric", + "api_semanticAnalyze_median" : "Description of the metric", + "api_semanticAnalyze_stddev" : "Description of the metric", + "api_semanticAnalyze_75thpercentile" : "Description of the metric", + "api_semanticAnalyze_95thpercentile" : "Description of the metric", + "api_semanticAnalyze_98thpercentile" : "Description of the metric", + "api_semanticAnalyze_99thpercentile" : "Description of the metric", + "api_semanticAnalyze_999thpercentile" : "Description of the metric", + "api_serializePlan_count" : "Description of the metric", + "api_serializePlan_mean_rate" : "Description of the metric", + "api_serializePlan_1min_rate" : "Description of the metric", + "api_serializePlan_5min_rate" : "Description of the metric", + "api_serializePlan_15min_rate" : "Description of the metric", + "api_serializePlan_mean" : "Description of the metric", + "api_serializePlan_min" : "Description of the metric", + "api_serializePlan_max" : "Description of the metric", + "api_serializePlan_median" : "Description of the metric", + "api_serializePlan_stddev" : "Description of the metric", + "api_serializePlan_75thpercentile" : "Description of the metric", + "api_serializePlan_95thpercentile" : "Description of the metric", + "api_serializePlan_98thpercentile" : "Description of the metric", + "api_serializePlan_99thpercentile" : "Description of the metric", + "api_serializePlan_999thpercentile" : "Description of the metric", + "api_waitCompile_count" : "Description of the metric", + "api_waitCompile_mean_rate" : "Description of the metric", + "api_waitCompile_1min_rate" : "Description of the metric", + "api_waitCompile_5min_rate" : "Description of the metric", + "api_waitCompile_15min_rate" : "Description of the metric", + "api_waitCompile_mean" : "Description of the metric", + "api_waitCompile_min" : "Description of the metric", + "api_waitCompile_max" : "Description of the metric", + "api_waitCompile_median" : "Description of the metric", + "api_waitCompile_stddev" : "Description of the metric", + "api_waitCompile_75thpercentile" : "Description of the metric", + "api_waitCompile_95thpercentile" : "Description of the metric", + "api_waitCompile_98thpercentile" : "Description of the metric", + "api_waitCompile_99thpercentile" : "Description of the metric", + "api_waitCompile_999thpercentile" : "Description of the metric", + "hs2_compiling_queries_count" : "Description of the metric", + "hs2_compiling_queries_mean_rate" : "Description of the metric", + "hs2_compiling_queries_1min_rate" : "Description of the metric", + "hs2_compiling_queries_5min_rate" : "Description of the metric", + "hs2_compiling_queries_15min_rate" : "Description of the metric", + 
"hs2_compiling_queries_mean" : "Description of the metric", + "hs2_compiling_queries_min" : "Description of the metric", + "hs2_compiling_queries_max" : "Description of the metric", + "hs2_compiling_queries_median" : "Description of the metric", + "hs2_compiling_queries_stddev" : "Description of the metric", + "hs2_compiling_queries_75thpercentile" : "Description of the metric", + "hs2_compiling_queries_95thpercentile" : "Description of the metric", + "hs2_compiling_queries_98thpercentile" : "Description of the metric", + "hs2_compiling_queries_99thpercentile" : "Description of the metric", + "hs2_compiling_queries_999thpercentile" : "Description of the metric", + "hs2_executing_queries_count" : "Description of the metric", + "hs2_executing_queries_mean_rate" : "Description of the metric", + "hs2_executing_queries_1min_rate" : "Description of the metric", + "hs2_executing_queries_5min_rate" : "Description of the metric", + "hs2_executing_queries_15min_rate" : "Description of the metric", + "hs2_executing_queries_mean" : "Description of the metric", + "hs2_executing_queries_min" : "Description of the metric", + "hs2_executing_queries_max" : "Description of the metric", + "hs2_executing_queries_median" : "Description of the metric", + "hs2_executing_queries_stddev" : "Description of the metric", + "hs2_executing_queries_75thpercentile" : "Description of the metric", + "hs2_executing_queries_95thpercentile" : "Description of the metric", + "hs2_executing_queries_98thpercentile" : "Description of the metric", + "hs2_executing_queries_99thpercentile" : "Description of the metric", + "hs2_executing_queries_999thpercentile" : "Description of the metric", + "hs2_submitted_queries_count" : "Description of the metric", + "hs2_submitted_queries_mean_rate" : "Description of the metric", + "hs2_submitted_queries_1min_rate" : "Description of the metric", + "hs2_submitted_queries_5min_rate" : "Description of the metric", + "hs2_submitted_queries_15min_rate" : "Description of the metric", + "hs2_submitted_queries_mean" : "Description of the metric", + "hs2_submitted_queries_min" : "Description of the metric", + "hs2_submitted_queries_max" : "Description of the metric", + "hs2_submitted_queries_median" : "Description of the metric", + "hs2_submitted_queries_stddev" : "Description of the metric", + "hs2_submitted_queries_75thpercentile" : "Description of the metric", + "hs2_submitted_queries_95thpercentile" : "Description of the metric", + "hs2_submitted_queries_98thpercentile" : "Description of the metric", + "hs2_submitted_queries_99thpercentile" : "Description of the metric", + "hs2_submitted_queries_999thpercentile" : "Description of the metric" +} \ No newline at end of file From 39bf1cc5fb1e51f4b0f8f0232a69e2cf52e1e4b4 Mon Sep 17 00:00:00 2001 From: akenO8 Date: Tue, 19 Sep 2023 12:03:28 +0800 Subject: [PATCH 4/5] [ADD]add trino gc metrics --- cmd/a.py | 4 - cmd/trino_coordinator.py | 108 ++- .../coordinator/G1OldGarbageCollector.json | 14 + .../coordinator/G1YoungGarbageCollector.json | 14 + metrics/coordinator/GcMonitor.json | 8 + metrics/resourcemanager/QueueMetrics.json | 16 +- metrics_orig/common/JvmMetrics.json | 28 + metrics_orig/common/MetricsSystem.json | 15 + metrics_orig/common/OperatingSystem.json | 14 + metrics_orig/common/RpcActivity.json | 16 + metrics_orig/common/RpcDetailedActivity.json | 4 + metrics_orig/common/Runtime.json | 4 + metrics_orig/common/UgiMetrics.json | 10 + .../coordinator/ClusterMemoryManager.json | 4 + .../coordinator/ClusterMemoryPool.json | 3 + 
.../coordinator/HeartbeatFailureDetector.json | 5 + metrics_orig/coordinator/Memory.json | 4 + metrics_orig/coordinator/QueryManager.json | 14 + metrics_orig/coordinator/SqlTaskManager.json | 10 + metrics_orig/coordinator/Threading.json | 5 + metrics_orig/datanode/DataNodeActivity.json | 70 ++ metrics_orig/datanode/DataNodeInfo.json | 4 + metrics_orig/datanode/FSDatasetState.json | 13 + metrics_orig/hiveserver2/HiveServer2.json | 705 ++++++++++++++++++ metrics_orig/journalnode/JournalNode.json | 29 + metrics_orig/namenode/FSNamesystem.json | 36 + metrics_orig/namenode/FSNamesystemState.json | 16 + metrics_orig/namenode/NameNode.json | 133 ++++ metrics_orig/namenode/NameNodeActivity.json | 41 + metrics_orig/namenode/NameNodeInfo.json | 25 + metrics_orig/namenode/RetryCache.json | 5 + metrics_orig/namenode/StartupProgress.json | 20 + .../nodemanager/NodeManagerMetrics.json | 19 + metrics_orig/nodemanager/ShuffleMetrics.json | 6 + .../resourcemanager/ClusterMetrics.json | 11 + .../resourcemanager/QueueMetrics.json | 33 + metrics_orig/resourcemanager/RMNMInfo.json | 6 + 37 files changed, 1451 insertions(+), 21 deletions(-) delete mode 100644 cmd/a.py create mode 100644 metrics/coordinator/G1OldGarbageCollector.json create mode 100644 metrics/coordinator/G1YoungGarbageCollector.json create mode 100644 metrics/coordinator/GcMonitor.json create mode 100644 metrics_orig/common/JvmMetrics.json create mode 100644 metrics_orig/common/MetricsSystem.json create mode 100644 metrics_orig/common/OperatingSystem.json create mode 100644 metrics_orig/common/RpcActivity.json create mode 100644 metrics_orig/common/RpcDetailedActivity.json create mode 100644 metrics_orig/common/Runtime.json create mode 100644 metrics_orig/common/UgiMetrics.json create mode 100644 metrics_orig/coordinator/ClusterMemoryManager.json create mode 100644 metrics_orig/coordinator/ClusterMemoryPool.json create mode 100644 metrics_orig/coordinator/HeartbeatFailureDetector.json create mode 100644 metrics_orig/coordinator/Memory.json create mode 100644 metrics_orig/coordinator/QueryManager.json create mode 100644 metrics_orig/coordinator/SqlTaskManager.json create mode 100644 metrics_orig/coordinator/Threading.json create mode 100644 metrics_orig/datanode/DataNodeActivity.json create mode 100644 metrics_orig/datanode/DataNodeInfo.json create mode 100644 metrics_orig/datanode/FSDatasetState.json create mode 100644 metrics_orig/hiveserver2/HiveServer2.json create mode 100644 metrics_orig/journalnode/JournalNode.json create mode 100644 metrics_orig/namenode/FSNamesystem.json create mode 100644 metrics_orig/namenode/FSNamesystemState.json create mode 100644 metrics_orig/namenode/NameNode.json create mode 100644 metrics_orig/namenode/NameNodeActivity.json create mode 100644 metrics_orig/namenode/NameNodeInfo.json create mode 100644 metrics_orig/namenode/RetryCache.json create mode 100644 metrics_orig/namenode/StartupProgress.json create mode 100644 metrics_orig/nodemanager/NodeManagerMetrics.json create mode 100644 metrics_orig/nodemanager/ShuffleMetrics.json create mode 100644 metrics_orig/resourcemanager/ClusterMetrics.json create mode 100644 metrics_orig/resourcemanager/QueueMetrics.json create mode 100644 metrics_orig/resourcemanager/RMNMInfo.json diff --git a/cmd/a.py b/cmd/a.py deleted file mode 100644 index b6d21ec..0000000 --- a/cmd/a.py +++ /dev/null @@ -1,4 +0,0 @@ -import requests - -rsp = requests.get("http://pcdn.dq.baidu.com/v3/accesslog", stream=True) -print (rsp.raw._connection.sock.getpeername()[0]) \ No newline at end of file 
diff --git a/cmd/trino_coordinator.py b/cmd/trino_coordinator.py
index d80ce6c..5d85179 100644
--- a/cmd/trino_coordinator.py
+++ b/cmd/trino_coordinator.py
@@ -62,6 +62,14 @@ def setup_metrics_labels(self, beans):
             self.setup_trino_coor_labels('ClusterMemoryManager')
         if 'trino.memory:type=ClusterMemoryPool,name=general' in beans[i]['objectName']:
             self.setup_trino_coor_labels('ClusterMemoryPool')
+        if 'io.airlift.stats:name=GcMonitor' in beans[i]['objectName']:
+            self.setup_trino_coor_labels('GcMonitor')
+        if 'java.lang:name=G1 Young Generation,type=GarbageCollector' in beans[i]['objectName']:
+            # self.setup_trino_coor_labels('G1YoungGarbageCollector')
+            self.setup_young_gc_labels()
+        if 'java.lang:name=G1 Old Generation,type=GarbageCollector' in beans[i]['objectName']:
+            # self.setup_trino_coor_labels('G1OldGarbageCollector')
+            self.setup_old_gc_labels()
 
     def setup_trino_coor_labels(self, kind):
         label = ["cluster", "method", "_target"]
@@ -71,6 +79,50 @@ def setup_trino_coor_labels(self, kind):
             key = kind
         self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
 
+    def setup_young_gc_labels(self):
+        kind = 'G1YoungGarbageCollector'
+        label = ["cluster", "method", "_target"]
+        name = "_".join([self.trino_coordinator_prefix, kind])
+
+        young_before, young_after = 1, 1
+        for metric in self.metrics[kind]:
+            if metric.split('.')[0] == 'LastGcInfo':
+                key = metric.split('.')[1]
+                if key == 'memoryUsageBeforeGc' and young_before:
+                    description = 'Trino node memory usage metric before Young GC.'
+                    self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+                    young_before = 0
+                elif key == 'memoryUsageAfterGc' and young_after:
+                    description = 'Trino node memory usage metric after Young GC.'
+                    self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+                    young_after = 0
+            else:
+                key = kind
+                description = 'Trino node Young GC metric.'
+                self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+
+    def setup_old_gc_labels(self):
+        kind = 'G1OldGarbageCollector'
+        label = ["cluster", "method", "_target"]
+        name = "_".join([self.trino_coordinator_prefix, kind])
+
+        old_before, old_after = 1, 1
+        for metric in self.metrics[kind]:
+            if metric.split('.')[0] == 'LastGcInfo':
+                key = metric.split('.')[1]
+                if key == 'memoryUsageBeforeGc' and old_before:
+                    description = 'Trino node memory usage metric before Old GC.'
+                    self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+                    old_before = 0
+                elif key == 'memoryUsageAfterGc' and old_after:
+                    description = 'Trino node memory usage metric after Old GC.'
+                    self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+                    old_after = 0
+            else:
+                key = kind
+                description = 'Trino node Old GC metric.'
+                self.trino_coordinator_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+
     def get_metrics(self, beans):
         for i in range(len(beans)):
             if 'java.lang:type=Memory' in beans[i]['objectName']:
@@ -87,13 +139,23 @@ def get_metrics(self, beans):
             self.get_trino_coor_labels(beans[i], 'ClusterMemoryManager')
         if 'trino.memory:type=ClusterMemoryPool,name=general' in beans[i]['objectName']:
             self.get_trino_coor_labels(beans[i], 'ClusterMemoryPool')
+        if 'io.airlift.stats:name=GcMonitor' in beans[i]['objectName']:
+            self.get_trino_coor_labels(beans[i], 'GcMonitor')
+        if 'java.lang:name=G1 Young Generation,type=GarbageCollector' in beans[i]['objectName']:
+            # self.get_trino_coor_labels(beans[i], 'G1YoungGarbageCollector')
+            self.get_young_gc_labels(beans[i])
+        if 'java.lang:name=G1 Old Generation,type=GarbageCollector' in beans[i]['objectName']:
+            # self.get_trino_coor_labels(beans[i], 'G1OldGarbageCollector')
+            self.get_old_gc_labels(beans[i])
 
     def get_trino_coor_labels(self, bean, kind):
         # type(bean) = dict
         for metric in self.metrics[kind]:
+            key = kind
+            label = [self.cluster, '', self.target]
             value = 0
             for attr in bean['attributes']:
-                key = kind
+                # type(attr) = dict
                 method = metric.replace('.', '_').replace(':', '_').replace('-', '_')
                 label = [self.cluster, method, self.target]
                 if attr['name'] == metric:
@@ -103,4 +165,46 @@ def get_trino_coor_labels(self, bean, kind):
                 else:
                     value = attr['value']
                     break
-            self.trino_coordinator_metrics[kind][key].add_metric(label, value)
\ No newline at end of file
+            if key not in self.trino_coordinator_metrics[kind]:
+                self.setup_trino_coor_labels(kind)
+            self.trino_coordinator_metrics[kind][key].add_metric(label, value)
+
+    def get_young_gc_labels(self, bean):
+        kind = 'G1YoungGarbageCollector'
+        for metric in self.metrics[kind]:
+            method = metric.replace('.', '_').replace(':', '_').replace('-', '_')
+            label = [self.cluster, method, self.target]
+            value = 0
+            for attr in bean['attributes']:
+                if attr['name'] == 'LastGcInfo' and len(metric.split('.')) > 2 and metric.split('.')[0] == 'LastGcInfo':
+                    if 'value' in attr:
+                        key = metric.split('.')[1]
+                        for vl in attr['value'][key]:
+                            if vl['key'] == metric.split('.')[2]:
+                                # take the 'used' figure from the pool's MemoryUsage composite
+                                value = vl['value']['used']
+                                self.trino_coordinator_metrics[kind][key].add_metric(label, value)
+                elif attr['name'] == metric:
+                    key = kind
+                    value = attr['value']
+                    self.trino_coordinator_metrics[kind][key].add_metric(label, value)
+
+    def get_old_gc_labels(self, bean):
+        kind = 'G1OldGarbageCollector'
+        for metric in self.metrics[kind]:
+            method = metric.replace('.', '_').replace(':', '_').replace('-', '_')
+            label = [self.cluster, method, self.target]
+            value = 0
+            for attr in bean['attributes']:
+                if attr['name'] == 'LastGcInfo' and len(metric.split('.')) > 2 and metric.split('.')[0] == 'LastGcInfo':
+                    if 'value' in attr:
+                        key = metric.split('.')[1]
+                        for vl in attr['value'][key]:
+                            if vl['key'] == metric.split('.')[2]:
+                                # take the 'used' figure from the pool's MemoryUsage composite
+                                value = vl['value']['used']
+                                self.trino_coordinator_metrics[kind][key].add_metric(label, value)
+                elif attr['name'] == metric:
+                    key = kind
+                    value = attr['value']
+                    self.trino_coordinator_metrics[kind][key].add_metric(label, value)
\ No newline at end of file
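Note on the traversal above: get_young_gc_labels()/get_old_gc_labels() resolve dotted keys such as "LastGcInfo.memoryUsageBeforeGc.G1 Eden Space" against the GarbageCollector bean. The first segment selects the LastGcInfo attribute, the second the before/after phase (which is also the gauge key), and the third the memory pool, from whose MemoryUsage composite the 'used' figure is read. A minimal standalone sketch of that lookup, assuming a simplified bean shape modeled on the JSON-ified JMX attributes this exporter scrapes (extract_gc_pool_usage and sample_bean are illustrative names, not part of the patch):

# Illustrative only: resolve one dotted key from the G1 mapping files against a bean.
def extract_gc_pool_usage(bean, metric):
    parts = metric.split('.')              # ['LastGcInfo', <phase>, <pool name>]
    if len(parts) < 3 or parts[0] != 'LastGcInfo':
        return None
    phase, pool = parts[1], parts[2]
    for attr in bean['attributes']:
        if attr['name'] == 'LastGcInfo' and 'value' in attr:
            # attr['value'][phase] is a list of {'key': pool, 'value': MemoryUsage} pairs
            for entry in attr['value'][phase]:
                if entry['key'] == pool:
                    return entry['value']['used']   # bytes used in that pool
    return None

sample_bean = {
    'objectName': 'java.lang:name=G1 Young Generation,type=GarbageCollector',
    'attributes': [{'name': 'LastGcInfo', 'value': {
        'memoryUsageBeforeGc': [{'key': 'G1 Eden Space', 'value': {'used': 41943040}}],
        'memoryUsageAfterGc': [{'key': 'G1 Eden Space', 'value': {'used': 0}}]}}],
}
print(extract_gc_pool_usage(sample_bean, 'LastGcInfo.memoryUsageBeforeGc.G1 Eden Space'))  # 41943040

Because the gauge key is only the middle segment, all pools of one phase share a single metric family; individual pools are distinguished by the flattened dotted key carried in the method label.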
diff --git a/metrics/coordinator/G1OldGarbageCollector.json b/metrics/coordinator/G1OldGarbageCollector.json
new file mode 100644
index 0000000..7290747
--- /dev/null
+++ b/metrics/coordinator/G1OldGarbageCollector.json
@@ -0,0 +1,14 @@
+{
+  "LastGcInfo.memoryUsageBeforeGc.G1 Old Gen": "",
+  "LastGcInfo.memoryUsageBeforeGc.G1 Survivor Space": "",
+  "LastGcInfo.memoryUsageBeforeGc.Compressed Class Space": "",
+  "LastGcInfo.memoryUsageBeforeGc.Metaspace": "",
+  "LastGcInfo.memoryUsageBeforeGc.G1 Eden Space": "",
+  "LastGcInfo.memoryUsageAfterGc.G1 Old Gen": "",
+  "LastGcInfo.memoryUsageAfterGc.G1 Survivor Space": "",
+  "LastGcInfo.memoryUsageAfterGc.Compressed Class Space": "",
+  "LastGcInfo.memoryUsageAfterGc.Metaspace": "",
+  "LastGcInfo.memoryUsageAfterGc.G1 Eden Space": "",
+  "CollectionCount": "FGC",
+  "CollectionTime": "FGCT"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/G1YoungGarbageCollector.json b/metrics/coordinator/G1YoungGarbageCollector.json
new file mode 100644
index 0000000..b108d50
--- /dev/null
+++ b/metrics/coordinator/G1YoungGarbageCollector.json
@@ -0,0 +1,14 @@
+{
+  "LastGcInfo.memoryUsageBeforeGc.G1 Old Gen": "",
+  "LastGcInfo.memoryUsageBeforeGc.G1 Survivor Space": "",
+  "LastGcInfo.memoryUsageBeforeGc.Compressed Class Space": "",
+  "LastGcInfo.memoryUsageBeforeGc.Metaspace": "",
+  "LastGcInfo.memoryUsageBeforeGc.G1 Eden Space": "",
+  "LastGcInfo.memoryUsageAfterGc.G1 Old Gen": "",
+  "LastGcInfo.memoryUsageAfterGc.G1 Survivor Space": "",
+  "LastGcInfo.memoryUsageAfterGc.Compressed Class Space": "",
+  "LastGcInfo.memoryUsageAfterGc.Metaspace": "",
+  "LastGcInfo.memoryUsageAfterGc.G1 Eden Space": "",
+  "CollectionCount": "YGC",
+  "CollectionTime": "YGCT"
+}
\ No newline at end of file
diff --git a/metrics/coordinator/GcMonitor.json b/metrics/coordinator/GcMonitor.json
new file mode 100644
index 0000000..e55791d
--- /dev/null
+++ b/metrics/coordinator/GcMonitor.json
@@ -0,0 +1,8 @@
+{
+  "MajorGc.AllTime.Count": "FGC",
+  "MajorGc.AllTime.Avg": "FGCT_avg",
+  "MajorGc.AllTime.Max": "FGCT_max",
+  "MinorGc.AllTime.Count": "YGC",
+  "MinorGc.AllTime.Avg": "YGCT_avg",
+  "MinorGc.AllTime.Max": "YGCT_max"
+}
\ No newline at end of file
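How these dotted keys surface in the exposition output: for a kind such as GcMonitor, get_trino_coor_labels() flattens each key into the method label by replacing '.', ':' and '-' with '_' and adds one sample per key to that kind's gauge. A hedged sketch with prometheus_client; the hadoop_trino_coordinator prefix mirrors the exporter's usual 'hadoop_<component>_<service>' naming but is an assumption here, and the cluster, _target and metric values are made up for illustration:

from prometheus_client.core import GaugeMetricFamily

# Assumed prefix; the real value comes from self.trino_coordinator_prefix.
prefix = 'hadoop_trino_coordinator'
kind = 'GcMonitor'
gauge = GaugeMetricFamily('_'.join([prefix, kind]),
                          'Trino coordinator GC monitor metric.',
                          labels=['cluster', 'method', '_target'])

# Each dotted key from GcMonitor.json becomes one labelled sample.
for metric, value in [('MajorGc.AllTime.Count', 3.0), ('MinorGc.AllTime.Count', 118.0)]:
    method = metric.replace('.', '_').replace(':', '_').replace('-', '_')
    gauge.add_metric(['mycluster', method, 'http://coordinator:8080'], value)

for sample in gauge.samples:
    print(sample)
# e.g. a sample named 'hadoop_trino_coordinator_GcMonitor' with
# labels {'cluster': 'mycluster', 'method': 'MajorGc_AllTime_Count', ...} and value 3.0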
diff --git a/metrics/resourcemanager/QueueMetrics.json b/metrics/resourcemanager/QueueMetrics.json
index 497f42d..759c6f0 100644
--- a/metrics/resourcemanager/QueueMetrics.json
+++ b/metrics/resourcemanager/QueueMetrics.json
@@ -15,19 +15,5 @@
   "AggregateContainersAllocated": "Total number of allocated containers.",
   "AggregateContainersReleased": "Total number of released containers.",
   "AvailableMB": "Current available memory in MB.",
-  "AvailableVCores": "Current available CPU in virtual cores.",
-  "PendingMB": "Current pending memory resource requests in MB that are not yet fulfilled by the scheduler.",
-  "PendingVCores": "Current pending CPU allocation requests in virtual cores that are not yet fulfilled by the scheduler.",
-  "PendingContainers": "Current pending resource requests that are not yet fulfilled by the scheduler.",
-  "ReservedMB": "Current reserved memory in MB.",
-  "ReservedVCores": "Current reserved CPU in virtual cores.",
-  "ReservedContainers": "Current number of reserved containers.",
-  "ActiveUsers": "Current number of active users.",
-  "ActiveApplications": "Current number of active applications.",
-  "FairShareMB": "(FairScheduler only) Current fair share of memory in MB.",
-  "FairShareVCores": "(FairScheduler only) Current fair share of CPU in virtual cores.",
-  "MinShareMB": "(FairScheduler only) Minimum share of memory in MB.",
-  "MinShareVCores": "(FairScheduler only) Minimum share of CPU in virtual cores.",
-  "MaxShareMB": "(FairScheduler only) Maximum share of memory in MB.",
-  "MaxShareVCores": "(FairScheduler only) Maximum share of CPU in virtual cores."
+  "AvailableVCores": "Current available CPU in virtual cores."
+}
\ No newline at end of file
diff --git a/metrics_orig/common/JvmMetrics.json b/metrics_orig/common/JvmMetrics.json
new file mode 100644
index 0000000..c60adce
--- /dev/null
+++ b/metrics_orig/common/JvmMetrics.json
@@ -0,0 +1,28 @@
+{
+  "MemNonHeapUsedM": "Current non-heap memory used in MB.",
+  "MemNonHeapCommittedM": "Current non-heap memory committed in MB.",
+  "MemNonHeapMaxM": "Max non-heap memory size in MB.",
+  "MemHeapUsedM": "Current heap memory used in MB.",
+  "MemHeapCommittedM": "Current heap memory committed in MB.",
+  "MemHeapMaxM": "Max heap memory size in MB.",
+  "MemMaxM": "Max memory size in MB.",
+  "ThreadsNew": "Current number of NEW threads.",
+  "ThreadsRunnable": "Current number of RUNNABLE threads.",
+  "ThreadsBlocked": "Current number of BLOCKED threads.",
+  "ThreadsWaiting": "Current number of WAITING threads.",
+  "ThreadsTimedWaiting": "Current number of TIMED_WAITING threads.",
+  "ThreadsTerminated": "Current number of TERMINATED threads.",
+  "GcCount": "Total GC count.",
+  "GcTimeMillis": "Total GC time in msec.",
+  "GcCountParNew": "ParNew GC count.",
+  "GcTimeMillisParNew": "ParNew GC time in msec.",
+  "GcCountConcurrentMarkSweep": "ConcurrentMarkSweep GC count.",
+  "GcTimeMillisConcurrentMarkSweep": "ConcurrentMarkSweep GC time in msec.",
+  "GcNumWarnThresholdExceeded": "Number of times that the GC warn threshold is exceeded.",
+  "GcNumInfoThresholdExceeded": "Number of times that the GC info threshold is exceeded.",
+  "GcTotalExtraSleepTime": "Total GC extra sleep time in msec.",
+  "LogFatal": "Total number of FATAL logs.",
+  "LogError": "Total number of ERROR logs.",
+  "LogWarn": "Total number of WARN logs.",
+  "LogInfo": "Total number of INFO logs."
+}
diff --git a/metrics_orig/common/MetricsSystem.json b/metrics_orig/common/MetricsSystem.json
new file mode 100644
index 0000000..9b09717
--- /dev/null
+++ b/metrics_orig/common/MetricsSystem.json
@@ -0,0 +1,15 @@
+{
+  "NumActiveSources": "Current number of active metrics sources.",
+  "NumAllSources": "Total number of metrics sources.",
+  "NumActiveSinks": "Current number of active sinks.",
+  "NumAllSinks": "Total number of sinks (BUT usually less than NumActiveSinks, see HADOOP-9946).",
+  "SnapshotNumOps": "Total number of operations to snapshot statistics from a metrics source.",
+  "SnapshotAvgTime": "Average time in milliseconds to snapshot statistics from a metrics source.",
+  "PublishNumOps": "Total number of operations to publish statistics to a sink.",
+  "PublishAvgTime": "Average time in milliseconds to publish statistics to a sink.",
+  "DroppedPubAll": "Total number of dropped publishes.",
+  "Sink_instanceNumOps": "Total number of sink operations for the instance.",
+  "Sink_instanceAvgTime": "Average time in milliseconds of sink operations for the instance.",
+  "Sink_instanceDropped": "Total number of dropped sink operations for the instance.",
+  "Sink_instanceQsize": "Current queue length of sink operations (BUT always set to 0 because nothing to increment this metric, see HADOOP-9941)."
+}
\ No newline at end of file
diff --git a/metrics_orig/common/OperatingSystem.json b/metrics_orig/common/OperatingSystem.json
new file mode 100644
index 0000000..364419b
--- /dev/null
+++ b/metrics_orig/common/OperatingSystem.json
@@ -0,0 +1,14 @@
+{
+  "OpenFileDescriptorCount": "Total number of open file descriptors",
+  "MaxFileDescriptorCount": "Maximum number of file descriptors",
+  "CommittedVirtualMemorySize": "The size of committed virtual memory in bytes",
+  "TotalSwapSpaceSize": "The size of total swap space in bytes",
+  "FreeSwapSpaceSize": "The size of free swap space in bytes",
+  "ProcessCpuTime": "Total process CPU time in nanoseconds",
+  "FreePhysicalMemorySize": "The size of free physical memory in bytes",
+  "TotalPhysicalMemorySize": "The size of total physical memory in bytes",
+  "SystemCpuLoad": "System CPU load",
+  "ProcessCpuLoad": "Process CPU load",
+  "SystemLoadAverage": "System load average",
+  "AvailableProcessors": "Total number of available processors"
+}
\ No newline at end of file
diff --git a/metrics_orig/common/RpcActivity.json b/metrics_orig/common/RpcActivity.json
new file mode 100644
index 0000000..432d58d
--- /dev/null
+++ b/metrics_orig/common/RpcActivity.json
@@ -0,0 +1,16 @@
+{
+  "ReceivedBytes": "Total number of received bytes",
+  "SentBytes": "Total number of sent bytes",
+  "RpcQueueTimeNumOps": "Total number of RPC calls",
+  "RpcQueueTimeAvgTime": "Average queue time in milliseconds",
+  "RpcProcessingTimeNumOps": "Total number of RPC calls (same as RpcQueueTimeNumOps)",
+  "RpcProcessingTimeAvgTime": "Average processing time in milliseconds",
+  "RpcAuthenticationFailures": "Total number of authentication failures",
+  "RpcAuthenticationSuccesses": "Total number of authentication successes",
+  "RpcAuthorizationFailures": "Total number of authorization failures",
+  "RpcAuthorizationSuccesses": "Total number of authorization successes",
+  "RpcClientBackoff": "Total number of RPC client backoffs",
+  "RpcSlowCalls": "Total number of RPC slow calls",
+  "NumOpenConnections": "Current number of open connections",
+  "CallQueueLength": "Current length of the call queue"
+}
diff --git a/metrics_orig/common/RpcDetailedActivity.json b/metrics_orig/common/RpcDetailedActivity.json
new file mode 100644
index 0000000..85f1d97
--- /dev/null
+++ b/metrics_orig/common/RpcDetailedActivity.json
@@ -0,0 +1,4 @@
+{
+  "methodNumOps": "Total number of times the method is called",
+  "methodAvgTime": "Average turnaround time of the method in milliseconds"
+}
\ No newline at end of file
diff --git a/metrics_orig/common/Runtime.json b/metrics_orig/common/Runtime.json
new file mode 100644
index 0000000..267db55
--- /dev/null
+++ b/metrics_orig/common/Runtime.json
@@ -0,0 +1,4 @@
+{
+  "Uptime": "Component uptime in milliseconds",
+  "StartTime": "Component start time in milliseconds"
+}
\ No newline at end of file
diff --git a/metrics_orig/common/UgiMetrics.json b/metrics_orig/common/UgiMetrics.json
new file mode 100644
index 0000000..920bb05
--- /dev/null
+++ b/metrics_orig/common/UgiMetrics.json
@@ -0,0 +1,10 @@
+{
+  "LoginSuccessNumOps": "Total number of successful Kerberos logins.",
+  "LoginSuccessAvgTime": "Average time for successful Kerberos logins in milliseconds.",
+  "LoginFailureNumOps": "Total number of failed Kerberos logins.",
+  "LoginFailureAvgTime": "Average time for failed Kerberos logins in milliseconds.",
+  "GetGroupsNumOps": "Total number of group resolutions.",
+  "GetGroupsAvgTime": "Average time for group resolution in milliseconds.",
milliseconds.", + "RenewalFailuresTotal": "Total number of renewal failures.", + "RenewalFailures": "Current number of renewal failures." +} \ No newline at end of file diff --git a/metrics_orig/coordinator/ClusterMemoryManager.json b/metrics_orig/coordinator/ClusterMemoryManager.json new file mode 100644 index 0000000..0ca1f52 --- /dev/null +++ b/metrics_orig/coordinator/ClusterMemoryManager.json @@ -0,0 +1,4 @@ +{ + "ClusterMemoryBytes": "Cluster memory bytes", + "QueriesKilledDueToOutOfMemory": "Cumulative count (since Trino started) of queries that ran out of memory and were killed" +} \ No newline at end of file diff --git a/metrics_orig/coordinator/ClusterMemoryPool.json b/metrics_orig/coordinator/ClusterMemoryPool.json new file mode 100644 index 0000000..ed7a4ad --- /dev/null +++ b/metrics_orig/coordinator/ClusterMemoryPool.json @@ -0,0 +1,3 @@ +{ + "FreeDistributedBytes": "Free memory (general pool)" +} \ No newline at end of file diff --git a/metrics_orig/coordinator/HeartbeatFailureDetector.json b/metrics_orig/coordinator/HeartbeatFailureDetector.json new file mode 100644 index 0000000..5cf19b4 --- /dev/null +++ b/metrics_orig/coordinator/HeartbeatFailureDetector.json @@ -0,0 +1,5 @@ +{ + "TotalCount": "Total nodes", + "ActiveCount": "Active nodes", + "FailedCount": "Failed nodes" +} \ No newline at end of file diff --git a/metrics_orig/coordinator/Memory.json b/metrics_orig/coordinator/Memory.json new file mode 100644 index 0000000..625437e --- /dev/null +++ b/metrics_orig/coordinator/Memory.json @@ -0,0 +1,4 @@ +{ + "HeapMemoryUsage": "Heap size", + "NonHeapMemoryUsage": "Non Heap size" +} \ No newline at end of file diff --git a/metrics_orig/coordinator/QueryManager.json b/metrics_orig/coordinator/QueryManager.json new file mode 100644 index 0000000..7ff03c6 --- /dev/null +++ b/metrics_orig/coordinator/QueryManager.json @@ -0,0 +1,14 @@ +{ + "RunningQueries": "Total number of running queries", + "QueuedQueries": "Total number of waiting queries", + "AbandonedQueries.FiveMinute.Count": "Total number of abandoned queries", + "CanceledQueries.FiveMinute.Count": "Total number of canceled queries", + "CompletedQueries.FiveMinute.Count": "Total number of completed queries", + "StartedQueries.FiveMinute.Count": "Queries started", + "FailedQueries.FiveMinute.Count": "Failed queries from last 5 min (all)", + "InternalFailures.FiveMinute.Count": "Failed queries from last 5 min (internal)", + "ExternalFailures.FiveMinute.Count": "Failed queries from last 5 min (external)", + "UserErrorFailures.FiveMinute.Count": "Failed queries (user)", + "ExecutionTime.FiveMinutes.P50": "Execution latency (P50)", + "WallInputBytesRate.FiveMinutes.P90": "Input data rate (P90)" +} \ No newline at end of file diff --git a/metrics_orig/coordinator/SqlTaskManager.json b/metrics_orig/coordinator/SqlTaskManager.json new file mode 100644 index 0000000..f836775 --- /dev/null +++ b/metrics_orig/coordinator/SqlTaskManager.json @@ -0,0 +1,10 @@ +{ + "InputDataSize.FiveMinute.Count": "Input data bytes count", + "InputDataSize.FiveMinute.Rate": "Input data bytes rate", + "OutputDataSize.FiveMinute.Count": "Output data bytes count", + "OutputDataSize.FiveMinute.Rate": "Output data bytes rate", + "InputPositions.FiveMinute.Count": "Input rows count", + "InputPositions.FiveMinute.Rate": "Input rows rate", + "OutputPositions.FiveMinute.Count": "Output rows count", + "OutputPositions.FiveMinute.Rate": "Output rows rate" +} \ No newline at end of file diff --git a/metrics_orig/coordinator/Threading.json 
new file mode 100644
index 0000000..2587706
--- /dev/null
+++ b/metrics_orig/coordinator/Threading.json
@@ -0,0 +1,5 @@
+{
+  "ThreadCount": "Number of threads",
+  "PeakThreadCount": "Peak number of threads",
+  "DaemonThreadCount": "Number of daemon threads"
+}
\ No newline at end of file
diff --git a/metrics_orig/datanode/DataNodeActivity.json b/metrics_orig/datanode/DataNodeActivity.json
new file mode 100644
index 0000000..55ff752
--- /dev/null
+++ b/metrics_orig/datanode/DataNodeActivity.json
@@ -0,0 +1,70 @@
+{
+  "BytesWritten": "Total number of bytes written to DataNode",
+  "BytesRead": "Total number of bytes read from DataNode",
+  "TotalWriteTime": "Total number of milliseconds spent on write operation",
+  "TotalReadTime": "Total number of milliseconds spent on read operation",
+  "BlocksWritten": "Total number of blocks written to DataNode",
+  "BlocksRead": "Total number of blocks read from DataNode",
+  "BlocksReplicated": "Total number of blocks replicated",
+  "BlocksRemoved": "Total number of blocks removed",
+  "BlocksVerified": "Total number of blocks verified",
+  "BlockVerificationFailures": "Total number of verification failures",
+  "BlocksCached": "Total number of blocks cached",
+  "BlocksUncached": "Total number of blocks uncached",
+  "ReadsFromLocalClient": "Total number of read operations from local client",
+  "ReadsFromRemoteClient": "Total number of read operations from remote client",
+  "WritesFromLocalClient": "Total number of write operations from local client",
+  "WritesFromRemoteClient": "Total number of write operations from remote client",
+  "BlocksGetLocalPathInfo": "Total number of operations to get local path names of blocks",
+  "RemoteBytesRead": "Number of bytes read by remote clients",
+  "RemoteBytesWritten": "Number of bytes written by remote clients",
+  "RamDiskBlocksWrite": "Total number of blocks written to memory",
+  "RamDiskBlocksWriteFallback": "Total number of blocks written to memory but not satisfied (failed-over to disk)",
+  "RamDiskBytesWrite": "Total number of bytes written to memory",
+  "RamDiskBlocksReadHits": "Total number of times a block in memory was read",
+  "RamDiskBlocksEvicted": "Total number of blocks evicted in memory",
+  "RamDiskBlocksEvictedWithoutRead": "Total number of blocks evicted in memory without ever being read from memory",
+  "RamDiskBlocksEvictionWindowMsNumOps": "Number of blocks evicted in memory",
+  "RamDiskBlocksEvictionWindowMsAvgTime": "Average time of blocks in memory before being evicted in milliseconds",
+  "RamDiskBlocksLazyPersisted": "Total number of blocks written to disk by lazy writer",
+  "RamDiskBlocksDeletedBeforeLazyPersisted": "Total number of blocks deleted by application before being persisted to disk",
+  "RamDiskBytesLazyPersisted": "Total number of bytes written to disk by lazy writer",
+  "RamDiskBlocksLazyPersistWindowMsNumOps": "Number of blocks written to disk by lazy writer",
+  "RamDiskBlocksLazyPersistWindowMsAvgTime": "Average time of blocks written to disk by lazy writer in milliseconds",
+  "FsyncCount": "Total number of fsync",
+  "VolumeFailures": "Total number of volume failures occurred",
+  "DatanodeNetworkErrors": "Total number of datanode network errors",
+  "DataNodeActiveXceiversCount": "Total number of datanode active Xceivers",
+  "ReadBlockOpNumOps": "Total number of read operations",
+  "ReadBlockOpAvgTime": "Average time of read operations in milliseconds",
+  "WriteBlockOpNumOps": "Total number of write operations",
+  "WriteBlockOpAvgTime": "Average time of write operations in milliseconds",
"Average time of write operations in milliseconds", + "BlockChecksumOpNumOps": "Total number of blockChecksum operations", + "BlockChecksumOpAvgTime": "Average time of blockChecksum operations in milliseconds", + "CopyBlockOpNumOps": "Total number of block copy operations", + "CopyBlockOpAvgTime": "Average time of block copy operations in milliseconds", + "ReplaceBlockOpNumOps": "Total number of block replace operations", + "ReplaceBlockOpAvgTime": "Average time of block replace operations in milliseconds", + "HeartbeatsNumOps": "Total number of heartbeats", + "HeartbeatsAvgTime": "Average heartbeat time in milliseconds", + "HeartbeatsTotalNumOps": "Total number of heartbeats which is a duplicate of HeartbeatsNumOps", + "HeartbeatsTotalAvgTime": "Average total heartbeat time in milliseconds", + "LifelinesNumOps": "Total number of lifeline messages", + "LifelinesAvgTime": "Average lifeline message processing time in milliseconds", + "BlockReportsNumOps": "Total number of block report operations", + "BlockReportsAvgTime": "Average time of block report operations in milliseconds", + "IncrementalBlockReportsNumOps": "Total number of incremental block report operations", + "IncrementalBlockReportsAvgTime": "Average time of incremental block report operations in milliseconds", + "CacheReportsNumOps": "Total number of cache report operations", + "CacheReportsAvgTime": "Average time of cache report operations in milliseconds", + "PacketAckRoundTripTimeNanosNumOps": "Total number of ack round trip", + "PacketAckRoundTripTimeNanosAvgTime": "Average time from ack send to receive minus the downstream ack time in nanoseconds", + "FlushNanosNumOps": "Total number of flushes", + "FlushNanosAvgTime": "Average flush time in nanoseconds", + "FsyncNanosNumOps": "Total number of fsync", + "FsyncNanosAvgTime": "Average fsync time in nanoseconds", + "SendDataPacketBlockedOnNetworkNanosNumOps": "Total number of sending packets", + "SendDataPacketBlockedOnNetworkNanosAvgTime": "Average waiting time of sending packets in nanoseconds", + "SendDataPacketTransferNanosNumOps": "Total number of sending packets", + "SendDataPacketTransferNanosAvgTime": "Average transfer time of sending packets in nanoseconds" +} \ No newline at end of file diff --git a/metrics_orig/datanode/DataNodeInfo.json b/metrics_orig/datanode/DataNodeInfo.json new file mode 100644 index 0000000..d837254 --- /dev/null +++ b/metrics_orig/datanode/DataNodeInfo.json @@ -0,0 +1,4 @@ +{ + "VolumeInfo": "Volume infomation in each path and in each mode", + "XceiverCount": "Total number of datanode Xceivers" +} diff --git a/metrics_orig/datanode/FSDatasetState.json b/metrics_orig/datanode/FSDatasetState.json new file mode 100644 index 0000000..aeb14af --- /dev/null +++ b/metrics_orig/datanode/FSDatasetState.json @@ -0,0 +1,13 @@ +{ + "Capacity" : "Current raw capacity of DataNode in bytes", + "DfsUsed" : "Current space used by DataNodes for DFS purposes in bytes", + "Remaining" : "Current remaining capacity in bytes", + "NumFailedVolumes" : "Total number of failed volumes", + "LastVolumeFailureDate" : "Last time of volume failures", + "EstimatedCapacityLostTotal" : "An estimate of the total capacity lost due to volume failures", + "CacheUsed" : "Total number of cache used", + "CacheCapacity" : "Current raw capacity of cache in bytes", + "NumBlocksCached" : "Total number of blocks cached", + "NumBlocksFailedToCache" : "Total number of blocks failed to cache", + "NumBlocksFailedToUnCache" : "Total number of blocks failed to uncached" +} \ No newline at end 
of file diff --git a/metrics_orig/hiveserver2/HiveServer2.json b/metrics_orig/hiveserver2/HiveServer2.json new file mode 100644 index 0000000..bd5e4be --- /dev/null +++ b/metrics_orig/hiveserver2/HiveServer2.json @@ -0,0 +1,705 @@ +{ + "buffers.direct.capacity" : "Description of the metric", + "buffers.direct.count" : "Description of the metric", + "buffers.direct.used" : "Description of the metric", + "buffers.mapped.capacity" : "Description of the metric", + "buffers.mapped.count" : "Description of the metric", + "buffers.mapped.used" : "Description of the metric", + "classLoading.loaded" : "Description of the metric", + "classLoading.unloaded" : "Description of the metric", + "exec_async_pool_size" : "Description of the metric", + "exec_async_queue_size" : "Description of the metric", + "gc.G1-Old-Generation.count" : "Description of the metric", + "gc.G1-Old-Generation.time" : "Description of the metric", + "gc.G1-Young-Generation.count" : "Description of the metric", + "gc.G1-Young-Generation.time" : "Description of the metric", + "hs2_active_sessions" : "Description of the metric", + "hs2_avg_active_session_time" : "Description of the metric", + "hs2_avg_open_session_time" : "Description of the metric", + "hs2_open_sessions" : "Description of the metric", + "memory.heap.committed" : "Description of the metric", + "memory.heap.init" : "Description of the metric", + "memory.heap.max" : "Description of the metric", + "memory.heap.usage" : "Description of the metric", + "memory.heap.used" : "Description of the metric", + "memory.non-heap.committed" : "Description of the metric", + "memory.non-heap.init" : "Description of the metric", + "memory.non-heap.max" : "Description of the metric", + "memory.non-heap.usage" : "Description of the metric", + "memory.non-heap.used" : "Description of the metric", + "memory.pools.Code-Cache.usage" : "Description of the metric", + "memory.pools.Compressed-Class-Space.usage" : "Description of the metric", + "memory.pools.G1-Eden-Space.usage" : "Description of the metric", + "memory.pools.G1-Old-Gen.usage" : "Description of the metric", + "memory.pools.G1-Survivor-Space.usage" : "Description of the metric", + "memory.pools.Metaspace.usage" : "Description of the metric", + "memory.total.committed" : "Description of the metric", + "memory.total.init" : "Description of the metric", + "memory.total.max" : "Description of the metric", + "memory.total.used" : "Description of the metric", + "qc_current_size" : "Description of the metric", + "qc_max_size" : "Description of the metric", + "threads.blocked.count" : "Description of the metric", + "threads.count" : "Description of the metric", + "threads.daemon.count" : "Description of the metric", + "threads.deadlock.count" : "Description of the metric", + "threads.new.count" : "Description of the metric", + "threads.runnable.count" : "Description of the metric", + "threads.terminated.count" : "Description of the metric", + "threads.timed_waiting.count" : "Description of the metric", + "threads.waiting.count" : "Description of the metric", + "active_calls_api_Driver.execute" : "Description of the metric", + "active_calls_api_Driver.run" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook" : "Description of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook" : "Description 
of the metric", + "active_calls_api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook" : "Description of the metric", + "active_calls_api_FileMoves" : "Description of the metric", + "active_calls_api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook" : "Description of the metric", + "active_calls_api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook" : "Description of the metric", + "active_calls_api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook" : "Description of the metric", + "active_calls_api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook" : "Description of the metric", + "active_calls_api_RemoveTempOrDuplicateFiles" : "Description of the metric", + "active_calls_api_RenameOrMoveFiles" : "Description of the metric", + "active_calls_api_TezBuildDag" : "Description of the metric", + "active_calls_api_TezCompiler" : "Description of the metric", + "active_calls_api_TezGetSession" : "Description of the metric", + "active_calls_api_TezRunDag" : "Description of the metric", + "active_calls_api_TezSubmitDag" : "Description of the metric", + "active_calls_api_TezSubmitToRunningDag" : "Description of the metric", + "active_calls_api_acquireReadWriteLocks" : "Description of the metric", + "active_calls_api_compile" : "Description of the metric", + "active_calls_api_deserializePlan" : "Description of the metric", + "active_calls_api_doAuthorization" : "Description of the metric", + "active_calls_api_hs2_operation_INITIALIZED" : "Description of the metric", + "active_calls_api_hs2_operation_PENDING" : "Description of the metric", + "active_calls_api_hs2_operation_RUNNING" : "Description of the metric", + "active_calls_api_hs2_sql_operation_PENDING" : "Description of the metric", + "active_calls_api_hs2_sql_operation_RUNNING" : "Description of the metric", + "active_calls_api_optimizer" : "Description of the metric", + "active_calls_api_parse" : "Description of the metric", + "active_calls_api_partition-retrieving" : "Description of the metric", + "active_calls_api_releaseLocks" : "Description of the metric", + "active_calls_api_runTasks" : "Description of the metric", + "active_calls_api_semanticAnalyze" : "Description of the metric", + "active_calls_api_serializePlan" : "Description of the metric", + "active_calls_api_waitCompile" : "Description of the metric", + "active_calls_hs2_compiling_queries" : "Description of the metric", + "active_calls_hs2_executing_queries" : "Description of the metric", + "active_calls_hs2_submitted_queries" : "Description of the metric", + "cumulative_connection_count" : "Description of the metric", + "hive_tez_tasks" : "Description of the metric", + "hs2_completed_operation_CANCELED" : "Description of the metric", + "hs2_completed_operation_CLOSED" : "Description of the metric", + "hs2_completed_operation_ERROR" : "Description of the metric", + "hs2_completed_operation_FINISHED" : "Description of the metric", + "hs2_completed_sql_operation_CANCELED" : "Description of the metric", + "hs2_completed_sql_operation_CLOSED" : "Description of the metric", + "hs2_completed_sql_operation_ERROR" : "Description of the metric", + "hs2_completed_sql_operation_FINISHED" : "Description of the metric", + "hs2_sql_operation_active_user" : "Description of the metric", + "jvm.pause.extraSleepTime" : "Description of the metric", + "jvm.pause.info-threshold" : "Description of the metric", + "open_connections" : "Description of the metric", + "open_operations" : "Description of the metric", 
+ "qc_invalid_for_caching" : "Description of the metric", + "qc_lookups" : "Description of the metric", + "waiting_compile_ops" : "Description of the metric", + "zookeeper_hive_exclusivelocks" : "Description of the metric", + "zookeeper_hive_sharedlocks" : "Description of the metric", + "hs2_failed_queries_count" : "Description of the metric", + "hs2_failed_queries_mean_rate" : "Description of the metric", + "hs2_failed_queries_1min_rate" : "Description of the metric", + "hs2_failed_queries_5min_rate" : "Description of the metric", + "hs2_failed_queries_15min_rate" : "Description of the metric", + "hs2_succeeded_queries_count" : "Description of the metric", + "hs2_succeeded_queries_mean_rate" : "Description of the metric", + "hs2_succeeded_queries_1min_rate" : "Description of the metric", + "hs2_succeeded_queries_5min_rate" : "Description of the metric", + "hs2_succeeded_queries_15min_rate" : "Description of the metric", + "api_Driver.execute_count" : "Description of the metric", + "api_Driver.execute_mean_rate" : "Description of the metric", + "api_Driver.execute_1min_rate" : "Description of the metric", + "api_Driver.execute_5min_rate" : "Description of the metric", + "api_Driver.execute_15min_rate" : "Description of the metric", + "api_Driver.execute_mean" : "Description of the metric", + "api_Driver.execute_min" : "Description of the metric", + "api_Driver.execute_max" : "Description of the metric", + "api_Driver.execute_median" : "Description of the metric", + "api_Driver.execute_stddev" : "Description of the metric", + "api_Driver.execute_75thpercentile" : "Description of the metric", + "api_Driver.execute_95thpercentile" : "Description of the metric", + "api_Driver.execute_98thpercentile" : "Description of the metric", + "api_Driver.execute_99thpercentile" : "Description of the metric", + "api_Driver.execute_999thpercentile" : "Description of the metric", + "api_Driver.run_count" : "Description of the metric", + "api_Driver.run_mean_rate" : "Description of the metric", + "api_Driver.run_1min_rate" : "Description of the metric", + "api_Driver.run_5min_rate" : "Description of the metric", + "api_Driver.run_15min_rate" : "Description of the metric", + "api_Driver.run_mean" : "Description of the metric", + "api_Driver.run_min" : "Description of the metric", + "api_Driver.run_max" : "Description of the metric", + "api_Driver.run_median" : "Description of the metric", + "api_Driver.run_stddev" : "Description of the metric", + "api_Driver.run_75thpercentile" : "Description of the metric", + "api_Driver.run_95thpercentile" : "Description of the metric", + "api_Driver.run_98thpercentile" : "Description of the metric", + "api_Driver.run_99thpercentile" : "Description of the metric", + "api_Driver.run_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_min" : 
"Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReExecutionOverlayPlugin$LocalHook_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_15min_rate" : "Description of the metric", + 
"api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.reexec.ReOptimizePlugin$LocalHook_999thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_count" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_1min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_5min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_15min_rate" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_min" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_max" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_median" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_stddev" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_75thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_95thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_98thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_99thpercentile" : "Description of the metric", + "api_FailureHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_999thpercentile" : "Description of the metric", + "api_FileMoves_count" : "Description of the metric", + "api_FileMoves_mean_rate" : "Description of the metric", + "api_FileMoves_1min_rate" : "Description of the metric", + "api_FileMoves_5min_rate" : "Description of the metric", + "api_FileMoves_15min_rate" : "Description of the metric", + "api_FileMoves_mean" : "Description of the metric", + "api_FileMoves_min" : "Description of the metric", + "api_FileMoves_max" : "Description of the metric", + "api_FileMoves_median" : "Description of the metric", + "api_FileMoves_stddev" : "Description of the metric", + 
"api_FileMoves_75thpercentile" : "Description of the metric", + "api_FileMoves_95thpercentile" : "Description of the metric", + "api_FileMoves_98thpercentile" : "Description of the metric", + "api_FileMoves_99thpercentile" : "Description of the metric", + "api_FileMoves_999thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_count" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_1min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_5min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_15min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_min" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_max" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_median" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_stddev" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_75thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_95thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_98thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_99thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_999thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_count" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_1min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_5min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_15min_rate" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_mean" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_min" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_max" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_median" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_stddev" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_75thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_95thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_98thpercentile" : "Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_99thpercentile" : 
"Description of the metric", + "api_PostHook.org.apache.hadoop.hive.ql.stats.OperatorStatsReaderHook_999thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_count" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_1min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_5min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_15min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_mean" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_min" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_max" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_median" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_stddev" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_75thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_95thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_98thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_99thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook_999thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_count" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_mean_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_1min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_5min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_15min_rate" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_mean" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_min" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_max" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_median" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_stddev" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_75thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_95thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_98thpercentile" : "Description of the metric", + 
"api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_99thpercentile" : "Description of the metric", + "api_PreHook.org.apache.hadoop.hive.ql.security.authorization.plugin.DisallowTransformHook_999thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_count" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_mean_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_1min_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_5min_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_15min_rate" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_mean" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_min" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_max" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_median" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_stddev" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_75thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_95thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_98thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_99thpercentile" : "Description of the metric", + "api_RemoveTempOrDuplicateFiles_999thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_count" : "Description of the metric", + "api_RenameOrMoveFiles_mean_rate" : "Description of the metric", + "api_RenameOrMoveFiles_1min_rate" : "Description of the metric", + "api_RenameOrMoveFiles_5min_rate" : "Description of the metric", + "api_RenameOrMoveFiles_15min_rate" : "Description of the metric", + "api_RenameOrMoveFiles_mean" : "Description of the metric", + "api_RenameOrMoveFiles_min" : "Description of the metric", + "api_RenameOrMoveFiles_max" : "Description of the metric", + "api_RenameOrMoveFiles_median" : "Description of the metric", + "api_RenameOrMoveFiles_stddev" : "Description of the metric", + "api_RenameOrMoveFiles_75thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_95thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_98thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_99thpercentile" : "Description of the metric", + "api_RenameOrMoveFiles_999thpercentile" : "Description of the metric", + "api_TezBuildDag_count" : "Description of the metric", + "api_TezBuildDag_mean_rate" : "Description of the metric", + "api_TezBuildDag_1min_rate" : "Description of the metric", + "api_TezBuildDag_5min_rate" : "Description of the metric", + "api_TezBuildDag_15min_rate" : "Description of the metric", + "api_TezBuildDag_mean" : "Description of the metric", + "api_TezBuildDag_min" : "Description of the metric", + "api_TezBuildDag_max" : "Description of the metric", + "api_TezBuildDag_median" : "Description of the metric", + "api_TezBuildDag_stddev" : "Description of the metric", + "api_TezBuildDag_75thpercentile" : "Description of the metric", + "api_TezBuildDag_95thpercentile" : "Description of the metric", + "api_TezBuildDag_98thpercentile" : "Description of the metric", + "api_TezBuildDag_99thpercentile" : "Description of the metric", + "api_TezBuildDag_999thpercentile" : "Description of the metric", + "api_TezCompiler_count" : "Description of the metric", + "api_TezCompiler_mean_rate" : "Description of the metric", + "api_TezCompiler_1min_rate" : "Description of the metric", + "api_TezCompiler_5min_rate" : 
"Description of the metric", + "api_TezCompiler_15min_rate" : "Description of the metric", + "api_TezCompiler_mean" : "Description of the metric", + "api_TezCompiler_min" : "Description of the metric", + "api_TezCompiler_max" : "Description of the metric", + "api_TezCompiler_median" : "Description of the metric", + "api_TezCompiler_stddev" : "Description of the metric", + "api_TezCompiler_75thpercentile" : "Description of the metric", + "api_TezCompiler_95thpercentile" : "Description of the metric", + "api_TezCompiler_98thpercentile" : "Description of the metric", + "api_TezCompiler_99thpercentile" : "Description of the metric", + "api_TezCompiler_999thpercentile" : "Description of the metric", + "api_TezGetSession_count" : "Description of the metric", + "api_TezGetSession_mean_rate" : "Description of the metric", + "api_TezGetSession_1min_rate" : "Description of the metric", + "api_TezGetSession_5min_rate" : "Description of the metric", + "api_TezGetSession_15min_rate" : "Description of the metric", + "api_TezGetSession_mean" : "Description of the metric", + "api_TezGetSession_min" : "Description of the metric", + "api_TezGetSession_max" : "Description of the metric", + "api_TezGetSession_median" : "Description of the metric", + "api_TezGetSession_stddev" : "Description of the metric", + "api_TezGetSession_75thpercentile" : "Description of the metric", + "api_TezGetSession_95thpercentile" : "Description of the metric", + "api_TezGetSession_98thpercentile" : "Description of the metric", + "api_TezGetSession_99thpercentile" : "Description of the metric", + "api_TezGetSession_999thpercentile" : "Description of the metric", + "api_TezRunDag_count" : "Description of the metric", + "api_TezRunDag_mean_rate" : "Description of the metric", + "api_TezRunDag_1min_rate" : "Description of the metric", + "api_TezRunDag_5min_rate" : "Description of the metric", + "api_TezRunDag_15min_rate" : "Description of the metric", + "api_TezRunDag_mean" : "Description of the metric", + "api_TezRunDag_min" : "Description of the metric", + "api_TezRunDag_max" : "Description of the metric", + "api_TezRunDag_median" : "Description of the metric", + "api_TezRunDag_stddev" : "Description of the metric", + "api_TezRunDag_75thpercentile" : "Description of the metric", + "api_TezRunDag_95thpercentile" : "Description of the metric", + "api_TezRunDag_98thpercentile" : "Description of the metric", + "api_TezRunDag_99thpercentile" : "Description of the metric", + "api_TezRunDag_999thpercentile" : "Description of the metric", + "api_TezSubmitDag_count" : "Description of the metric", + "api_TezSubmitDag_mean_rate" : "Description of the metric", + "api_TezSubmitDag_1min_rate" : "Description of the metric", + "api_TezSubmitDag_5min_rate" : "Description of the metric", + "api_TezSubmitDag_15min_rate" : "Description of the metric", + "api_TezSubmitDag_mean" : "Description of the metric", + "api_TezSubmitDag_min" : "Description of the metric", + "api_TezSubmitDag_max" : "Description of the metric", + "api_TezSubmitDag_median" : "Description of the metric", + "api_TezSubmitDag_stddev" : "Description of the metric", + "api_TezSubmitDag_75thpercentile" : "Description of the metric", + "api_TezSubmitDag_95thpercentile" : "Description of the metric", + "api_TezSubmitDag_98thpercentile" : "Description of the metric", + "api_TezSubmitDag_99thpercentile" : "Description of the metric", + "api_TezSubmitDag_999thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_count" : "Description of the metric", + 
"api_TezSubmitToRunningDag_mean_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_1min_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_5min_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_15min_rate" : "Description of the metric", + "api_TezSubmitToRunningDag_mean" : "Description of the metric", + "api_TezSubmitToRunningDag_min" : "Description of the metric", + "api_TezSubmitToRunningDag_max" : "Description of the metric", + "api_TezSubmitToRunningDag_median" : "Description of the metric", + "api_TezSubmitToRunningDag_stddev" : "Description of the metric", + "api_TezSubmitToRunningDag_75thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_95thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_98thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_99thpercentile" : "Description of the metric", + "api_TezSubmitToRunningDag_999thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_count" : "Description of the metric", + "api_acquireReadWriteLocks_mean_rate" : "Description of the metric", + "api_acquireReadWriteLocks_1min_rate" : "Description of the metric", + "api_acquireReadWriteLocks_5min_rate" : "Description of the metric", + "api_acquireReadWriteLocks_15min_rate" : "Description of the metric", + "api_acquireReadWriteLocks_mean" : "Description of the metric", + "api_acquireReadWriteLocks_min" : "Description of the metric", + "api_acquireReadWriteLocks_max" : "Description of the metric", + "api_acquireReadWriteLocks_median" : "Description of the metric", + "api_acquireReadWriteLocks_stddev" : "Description of the metric", + "api_acquireReadWriteLocks_75thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_95thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_98thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_99thpercentile" : "Description of the metric", + "api_acquireReadWriteLocks_999thpercentile" : "Description of the metric", + "api_compile_count" : "Description of the metric", + "api_compile_mean_rate" : "Description of the metric", + "api_compile_1min_rate" : "Description of the metric", + "api_compile_5min_rate" : "Description of the metric", + "api_compile_15min_rate" : "Description of the metric", + "api_compile_mean" : "Description of the metric", + "api_compile_min" : "Description of the metric", + "api_compile_max" : "Description of the metric", + "api_compile_median" : "Description of the metric", + "api_compile_stddev" : "Description of the metric", + "api_compile_75thpercentile" : "Description of the metric", + "api_compile_95thpercentile" : "Description of the metric", + "api_compile_98thpercentile" : "Description of the metric", + "api_compile_99thpercentile" : "Description of the metric", + "api_compile_999thpercentile" : "Description of the metric", + "api_deserializePlan_count" : "Description of the metric", + "api_deserializePlan_mean_rate" : "Description of the metric", + "api_deserializePlan_1min_rate" : "Description of the metric", + "api_deserializePlan_5min_rate" : "Description of the metric", + "api_deserializePlan_15min_rate" : "Description of the metric", + "api_deserializePlan_mean" : "Description of the metric", + "api_deserializePlan_min" : "Description of the metric", + "api_deserializePlan_max" : "Description of the metric", + "api_deserializePlan_median" : "Description of the metric", + "api_deserializePlan_stddev" : "Description of the metric", + 
"api_deserializePlan_75thpercentile" : "Description of the metric", + "api_deserializePlan_95thpercentile" : "Description of the metric", + "api_deserializePlan_98thpercentile" : "Description of the metric", + "api_deserializePlan_99thpercentile" : "Description of the metric", + "api_deserializePlan_999thpercentile" : "Description of the metric", + "api_doAuthorization_count" : "Description of the metric", + "api_doAuthorization_mean_rate" : "Description of the metric", + "api_doAuthorization_1min_rate" : "Description of the metric", + "api_doAuthorization_5min_rate" : "Description of the metric", + "api_doAuthorization_15min_rate" : "Description of the metric", + "api_doAuthorization_mean" : "Description of the metric", + "api_doAuthorization_min" : "Description of the metric", + "api_doAuthorization_max" : "Description of the metric", + "api_doAuthorization_median" : "Description of the metric", + "api_doAuthorization_stddev" : "Description of the metric", + "api_doAuthorization_75thpercentile" : "Description of the metric", + "api_doAuthorization_95thpercentile" : "Description of the metric", + "api_doAuthorization_98thpercentile" : "Description of the metric", + "api_doAuthorization_99thpercentile" : "Description of the metric", + "api_doAuthorization_999thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_count" : "Description of the metric", + "api_hs2_operation_INITIALIZED_mean_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_1min_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_5min_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_15min_rate" : "Description of the metric", + "api_hs2_operation_INITIALIZED_mean" : "Description of the metric", + "api_hs2_operation_INITIALIZED_min" : "Description of the metric", + "api_hs2_operation_INITIALIZED_max" : "Description of the metric", + "api_hs2_operation_INITIALIZED_median" : "Description of the metric", + "api_hs2_operation_INITIALIZED_stddev" : "Description of the metric", + "api_hs2_operation_INITIALIZED_75thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_95thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_98thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_99thpercentile" : "Description of the metric", + "api_hs2_operation_INITIALIZED_999thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_count" : "Description of the metric", + "api_hs2_operation_PENDING_mean_rate" : "Description of the metric", + "api_hs2_operation_PENDING_1min_rate" : "Description of the metric", + "api_hs2_operation_PENDING_5min_rate" : "Description of the metric", + "api_hs2_operation_PENDING_15min_rate" : "Description of the metric", + "api_hs2_operation_PENDING_mean" : "Description of the metric", + "api_hs2_operation_PENDING_min" : "Description of the metric", + "api_hs2_operation_PENDING_max" : "Description of the metric", + "api_hs2_operation_PENDING_median" : "Description of the metric", + "api_hs2_operation_PENDING_stddev" : "Description of the metric", + "api_hs2_operation_PENDING_75thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_95thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_98thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_99thpercentile" : "Description of the metric", + "api_hs2_operation_PENDING_999thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_count" : 
"Description of the metric", + "api_hs2_operation_RUNNING_mean_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_1min_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_5min_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_15min_rate" : "Description of the metric", + "api_hs2_operation_RUNNING_mean" : "Description of the metric", + "api_hs2_operation_RUNNING_min" : "Description of the metric", + "api_hs2_operation_RUNNING_max" : "Description of the metric", + "api_hs2_operation_RUNNING_median" : "Description of the metric", + "api_hs2_operation_RUNNING_stddev" : "Description of the metric", + "api_hs2_operation_RUNNING_75thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_95thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_98thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_99thpercentile" : "Description of the metric", + "api_hs2_operation_RUNNING_999thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_count" : "Description of the metric", + "api_hs2_sql_operation_PENDING_mean_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_1min_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_5min_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_15min_rate" : "Description of the metric", + "api_hs2_sql_operation_PENDING_mean" : "Description of the metric", + "api_hs2_sql_operation_PENDING_min" : "Description of the metric", + "api_hs2_sql_operation_PENDING_max" : "Description of the metric", + "api_hs2_sql_operation_PENDING_median" : "Description of the metric", + "api_hs2_sql_operation_PENDING_stddev" : "Description of the metric", + "api_hs2_sql_operation_PENDING_75thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_95thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_98thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_99thpercentile" : "Description of the metric", + "api_hs2_sql_operation_PENDING_999thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_count" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_mean_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_1min_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_5min_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_15min_rate" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_mean" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_min" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_max" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_median" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_stddev" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_75thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_95thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_98thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_99thpercentile" : "Description of the metric", + "api_hs2_sql_operation_RUNNING_999thpercentile" : "Description of the metric", + "api_optimizer_count" : "Description of the metric", + "api_optimizer_mean_rate" : "Description of the metric", + "api_optimizer_1min_rate" : "Description of the metric", + "api_optimizer_5min_rate" : "Description of the metric", + "api_optimizer_15min_rate" : "Description 
of the metric", + "api_optimizer_mean" : "Description of the metric", + "api_optimizer_min" : "Description of the metric", + "api_optimizer_max" : "Description of the metric", + "api_optimizer_median" : "Description of the metric", + "api_optimizer_stddev" : "Description of the metric", + "api_optimizer_75thpercentile" : "Description of the metric", + "api_optimizer_95thpercentile" : "Description of the metric", + "api_optimizer_98thpercentile" : "Description of the metric", + "api_optimizer_99thpercentile" : "Description of the metric", + "api_optimizer_999thpercentile" : "Description of the metric", + "api_parse_count" : "Description of the metric", + "api_parse_mean_rate" : "Description of the metric", + "api_parse_1min_rate" : "Description of the metric", + "api_parse_5min_rate" : "Description of the metric", + "api_parse_15min_rate" : "Description of the metric", + "api_parse_mean" : "Description of the metric", + "api_parse_min" : "Description of the metric", + "api_parse_max" : "Description of the metric", + "api_parse_median" : "Description of the metric", + "api_parse_stddev" : "Description of the metric", + "api_parse_75thpercentile" : "Description of the metric", + "api_parse_95thpercentile" : "Description of the metric", + "api_parse_98thpercentile" : "Description of the metric", + "api_parse_99thpercentile" : "Description of the metric", + "api_parse_999thpercentile" : "Description of the metric", + "api_partition-retrieving_count" : "Description of the metric", + "api_partition-retrieving_mean_rate" : "Description of the metric", + "api_partition-retrieving_1min_rate" : "Description of the metric", + "api_partition-retrieving_5min_rate" : "Description of the metric", + "api_partition-retrieving_15min_rate" : "Description of the metric", + "api_partition-retrieving_mean" : "Description of the metric", + "api_partition-retrieving_min" : "Description of the metric", + "api_partition-retrieving_max" : "Description of the metric", + "api_partition-retrieving_median" : "Description of the metric", + "api_partition-retrieving_stddev" : "Description of the metric", + "api_partition-retrieving_75thpercentile" : "Description of the metric", + "api_partition-retrieving_95thpercentile" : "Description of the metric", + "api_partition-retrieving_98thpercentile" : "Description of the metric", + "api_partition-retrieving_99thpercentile" : "Description of the metric", + "api_partition-retrieving_999thpercentile" : "Description of the metric", + "api_releaseLocks_count" : "Description of the metric", + "api_releaseLocks_mean_rate" : "Description of the metric", + "api_releaseLocks_1min_rate" : "Description of the metric", + "api_releaseLocks_5min_rate" : "Description of the metric", + "api_releaseLocks_15min_rate" : "Description of the metric", + "api_releaseLocks_mean" : "Description of the metric", + "api_releaseLocks_min" : "Description of the metric", + "api_releaseLocks_max" : "Description of the metric", + "api_releaseLocks_median" : "Description of the metric", + "api_releaseLocks_stddev" : "Description of the metric", + "api_releaseLocks_75thpercentile" : "Description of the metric", + "api_releaseLocks_95thpercentile" : "Description of the metric", + "api_releaseLocks_98thpercentile" : "Description of the metric", + "api_releaseLocks_99thpercentile" : "Description of the metric", + "api_releaseLocks_999thpercentile" : "Description of the metric", + "api_runTasks_count" : "Description of the metric", + "api_runTasks_mean_rate" : "Description of the metric", + "api_runTasks_1min_rate" : 
"Description of the metric", + "api_runTasks_5min_rate" : "Description of the metric", + "api_runTasks_15min_rate" : "Description of the metric", + "api_runTasks_mean" : "Description of the metric", + "api_runTasks_min" : "Description of the metric", + "api_runTasks_max" : "Description of the metric", + "api_runTasks_median" : "Description of the metric", + "api_runTasks_stddev" : "Description of the metric", + "api_runTasks_75thpercentile" : "Description of the metric", + "api_runTasks_95thpercentile" : "Description of the metric", + "api_runTasks_98thpercentile" : "Description of the metric", + "api_runTasks_99thpercentile" : "Description of the metric", + "api_runTasks_999thpercentile" : "Description of the metric", + "api_semanticAnalyze_count" : "Description of the metric", + "api_semanticAnalyze_mean_rate" : "Description of the metric", + "api_semanticAnalyze_1min_rate" : "Description of the metric", + "api_semanticAnalyze_5min_rate" : "Description of the metric", + "api_semanticAnalyze_15min_rate" : "Description of the metric", + "api_semanticAnalyze_mean" : "Description of the metric", + "api_semanticAnalyze_min" : "Description of the metric", + "api_semanticAnalyze_max" : "Description of the metric", + "api_semanticAnalyze_median" : "Description of the metric", + "api_semanticAnalyze_stddev" : "Description of the metric", + "api_semanticAnalyze_75thpercentile" : "Description of the metric", + "api_semanticAnalyze_95thpercentile" : "Description of the metric", + "api_semanticAnalyze_98thpercentile" : "Description of the metric", + "api_semanticAnalyze_99thpercentile" : "Description of the metric", + "api_semanticAnalyze_999thpercentile" : "Description of the metric", + "api_serializePlan_count" : "Description of the metric", + "api_serializePlan_mean_rate" : "Description of the metric", + "api_serializePlan_1min_rate" : "Description of the metric", + "api_serializePlan_5min_rate" : "Description of the metric", + "api_serializePlan_15min_rate" : "Description of the metric", + "api_serializePlan_mean" : "Description of the metric", + "api_serializePlan_min" : "Description of the metric", + "api_serializePlan_max" : "Description of the metric", + "api_serializePlan_median" : "Description of the metric", + "api_serializePlan_stddev" : "Description of the metric", + "api_serializePlan_75thpercentile" : "Description of the metric", + "api_serializePlan_95thpercentile" : "Description of the metric", + "api_serializePlan_98thpercentile" : "Description of the metric", + "api_serializePlan_99thpercentile" : "Description of the metric", + "api_serializePlan_999thpercentile" : "Description of the metric", + "api_waitCompile_count" : "Description of the metric", + "api_waitCompile_mean_rate" : "Description of the metric", + "api_waitCompile_1min_rate" : "Description of the metric", + "api_waitCompile_5min_rate" : "Description of the metric", + "api_waitCompile_15min_rate" : "Description of the metric", + "api_waitCompile_mean" : "Description of the metric", + "api_waitCompile_min" : "Description of the metric", + "api_waitCompile_max" : "Description of the metric", + "api_waitCompile_median" : "Description of the metric", + "api_waitCompile_stddev" : "Description of the metric", + "api_waitCompile_75thpercentile" : "Description of the metric", + "api_waitCompile_95thpercentile" : "Description of the metric", + "api_waitCompile_98thpercentile" : "Description of the metric", + "api_waitCompile_99thpercentile" : "Description of the metric", + "api_waitCompile_999thpercentile" : "Description of the 
metric", + "hs2_compiling_queries_count" : "Description of the metric", + "hs2_compiling_queries_mean_rate" : "Description of the metric", + "hs2_compiling_queries_1min_rate" : "Description of the metric", + "hs2_compiling_queries_5min_rate" : "Description of the metric", + "hs2_compiling_queries_15min_rate" : "Description of the metric", + "hs2_compiling_queries_mean" : "Description of the metric", + "hs2_compiling_queries_min" : "Description of the metric", + "hs2_compiling_queries_max" : "Description of the metric", + "hs2_compiling_queries_median" : "Description of the metric", + "hs2_compiling_queries_stddev" : "Description of the metric", + "hs2_compiling_queries_75thpercentile" : "Description of the metric", + "hs2_compiling_queries_95thpercentile" : "Description of the metric", + "hs2_compiling_queries_98thpercentile" : "Description of the metric", + "hs2_compiling_queries_99thpercentile" : "Description of the metric", + "hs2_compiling_queries_999thpercentile" : "Description of the metric", + "hs2_executing_queries_count" : "Description of the metric", + "hs2_executing_queries_mean_rate" : "Description of the metric", + "hs2_executing_queries_1min_rate" : "Description of the metric", + "hs2_executing_queries_5min_rate" : "Description of the metric", + "hs2_executing_queries_15min_rate" : "Description of the metric", + "hs2_executing_queries_mean" : "Description of the metric", + "hs2_executing_queries_min" : "Description of the metric", + "hs2_executing_queries_max" : "Description of the metric", + "hs2_executing_queries_median" : "Description of the metric", + "hs2_executing_queries_stddev" : "Description of the metric", + "hs2_executing_queries_75thpercentile" : "Description of the metric", + "hs2_executing_queries_95thpercentile" : "Description of the metric", + "hs2_executing_queries_98thpercentile" : "Description of the metric", + "hs2_executing_queries_99thpercentile" : "Description of the metric", + "hs2_executing_queries_999thpercentile" : "Description of the metric", + "hs2_submitted_queries_count" : "Description of the metric", + "hs2_submitted_queries_mean_rate" : "Description of the metric", + "hs2_submitted_queries_1min_rate" : "Description of the metric", + "hs2_submitted_queries_5min_rate" : "Description of the metric", + "hs2_submitted_queries_15min_rate" : "Description of the metric", + "hs2_submitted_queries_mean" : "Description of the metric", + "hs2_submitted_queries_min" : "Description of the metric", + "hs2_submitted_queries_max" : "Description of the metric", + "hs2_submitted_queries_median" : "Description of the metric", + "hs2_submitted_queries_stddev" : "Description of the metric", + "hs2_submitted_queries_75thpercentile" : "Description of the metric", + "hs2_submitted_queries_95thpercentile" : "Description of the metric", + "hs2_submitted_queries_98thpercentile" : "Description of the metric", + "hs2_submitted_queries_99thpercentile" : "Description of the metric", + "hs2_submitted_queries_999thpercentile" : "Description of the metric" +} \ No newline at end of file diff --git a/metrics_orig/journalnode/JournalNode.json b/metrics_orig/journalnode/JournalNode.json new file mode 100644 index 0000000..adc2693 --- /dev/null +++ b/metrics_orig/journalnode/JournalNode.json @@ -0,0 +1,29 @@ +{ + "Syncs60sNumOps": "Number of sync operations (1 minute granularity)", + "Syncs60s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s75thPercentileLatencyMicros": "The 75th percentile of sync latency in 
microseconds (1 minute granularity)", + "Syncs60s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs60s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 minute granularity)", + "Syncs300sNumOps": "Number of sync operations (5 minutes granularity)", + "Syncs300s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs300s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (5 minutes granularity)", + "Syncs3600sNumOps": "Number of sync operations (1 hour granularity)", + "Syncs3600s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 hour granularity)", + "Syncs3600s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 hour granularity)", + "BatchesWritten": "Total number of batches written since startup", + "TxnsWritten": "Total number of transactions written since startup", + "BytesWritten": "Total number of bytes written since startup", + "BatchesWrittenWhileLagging": "Total number of batches written where this node was lagging", + "LastWriterEpoch": "Current writer’s epoch number", + "CurrentLagTxns": "The number of transactions that this JournalNode is lagging", + "LastWrittenTxId": "The highest transaction id stored on this JournalNode", + "LastPromisedEpoch": "The last epoch number which this node has promised not to accept any lower epoch, or 0 if no promises have been made", + "LastJournalTimestamp": "The timestamp of last successfully written transaction" +} \ No newline at end of file diff --git a/metrics_orig/namenode/FSNamesystem.json b/metrics_orig/namenode/FSNamesystem.json new file mode 100644 index 0000000..6ee64c9 --- /dev/null +++ b/metrics_orig/namenode/FSNamesystem.json @@ -0,0 +1,36 @@ +{ + "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", + "MissingBlocks": "Current number of missing blocks", + "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", + "ExpiredHeartbeats": "Total number of expired heartbeats", + "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", + "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", + "LastWrittenTransactionId": "Last transaction ID written to the edit log", + "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", + "CapacityTotal": "Current raw capacity of DataNodes in bytes", + "CapacityUsed": "Current used capacity across all DataNodes in 
bytes", + "CapacityRemaining": "Current remaining capacity in bytes", + "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", + "TotalLoad": "Current number of connections", + "SnapshottableDirectories": "Current number of snapshottable directories", + "Snapshots": "Current number of snapshots", + "NumEncryptionZones": "Current number of encryption zones", + "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", + "BlocksTotal": "Current number of allocated blocks in the system", + "NumFilesUnderConstruction": "Current number of files under construction", + "NumActiveClients": "Current number of active clients holding lease", + "FilesTotal": "Current number of files and directories", + "PendingReplicationBlocks": "Current number of blocks pending to be replicated", + "UnderReplicatedBlocks": "Current number of blocks under replicated", + "CorruptBlocks": "Current number of blocks with corrupt replicas.", + "ScheduledReplicationBlocks": "Current number of blocks scheduled for replications", + "PendingDeletionBlocks": "Current number of blocks pending deletion", + "ExcessBlocks": "Current number of excess blocks", + "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", + "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", + "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", + "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0", + "BlockCapacity": "Current number of block capacity", + "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", + "TotalSyncCount": "Total number of sync operations performed by edit log" +} \ No newline at end of file diff --git a/metrics_orig/namenode/FSNamesystemState.json b/metrics_orig/namenode/FSNamesystemState.json new file mode 100644 index 0000000..6b241c9 --- /dev/null +++ b/metrics_orig/namenode/FSNamesystemState.json @@ -0,0 +1,16 @@ +{ + "FsLockQueueLength": "Filesystem lock queue length", + "MaxObjects": "Max objects", + "BlockDeletionStartTime": "Start time of block deletion", + "NumLiveDataNodes": "Number of datanodes which are currently live", + "NumDeadDataNodes": "Number of datanodes which are currently dead", + "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", + "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", + "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", + "NumStaleDataNodes": "Number of datanodes marked as content stale", + "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", + "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", + "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", + "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", + "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation" +} \ No newline at end of file diff --git a/metrics_orig/namenode/NameNode.json b/metrics_orig/namenode/NameNode.json new file mode 100644 index 
0000000..c055ce8 --- /dev/null +++ b/metrics_orig/namenode/NameNode.json @@ -0,0 +1,133 @@ +{ + "CreateFileOps": "Total number of files created.", + "FilesCreated": "Total number of files and directories created by create or mkdir operations.", + "FilesAppended": "Total number of files appended.", + "GetBlockLocations": "Total number of getBlockLocations operations.", + "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", + "GetListingOps": "Total number of directory listing operations.", + "DeleteFileOps": "Total number of delete operations.", + "FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", + "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", + "AddBlockOps": "Total number of addBlock operations succeeded.", + "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", + "CreateSymlinkOps": "Total number of createSymlink operations.", + "GetLinkTargetOps": "Total number of getLinkTarget operations.", + "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", + "AllowSnapshotOps": "Total number of allowSnapshot operations.", + "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", + "CreateSnapshotOps": "Total number of createSnapshot operations.", + "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", + "RenameSnapshotOps": "Total number of renameSnapshot operations.", + "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", + "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", + "TransactionsNumOps": "Total number of Journal transactions.", + "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", + "SyncsNumOps": "Total number of Journal syncs.", + "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", + "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", + "BlockReportNumOps": "Total number of processing block reports from DataNode.", + "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", + "CacheReportNumOps": "Total number of processing cache reports from DataNode.", + "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", + "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. 
(sometimes not equal to the time in SafeMode, see HDFS-5156).", + "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", + "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", + "GetEditAvgTime": "Average edits download time in milliseconds.", + "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", + "GetImageAvgTime": "Average fsimage download time in milliseconds.", + "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", + "PutImageAvgTime": "Average fsimage upload time in milliseconds.", + "TotalFileOps": "Total number of all file operations.", + "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", + "MissingBlocks": "Current number of missing blocks", + "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", + "ExpiredHeartbeats": "Total number of expired heartbeats", + "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", + "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", + "LastWrittenTransactionId": "Last transaction ID written to the edit log", + "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", + "CapacityTotal": "Current raw capacity of DataNodes in bytes", + "CapacityUsed": "Current used capacity across all DataNodes in bytes", + "CapacityRemaining": "Current remaining capacity in bytes", + "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", + "TotalLoad": "Current number of connections", + "SnapshottableDirectories": "Current number of snapshottable directories", + "Snapshots": "Current number of snapshots", + "NumEncryptionZones": "Current number of encryption zones", + "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", + "BlocksTotal": "Current number of allocated blocks in the system", + "NumFilesUnderConstruction": "Current number of files under construction", + "NumActiveClients": "Current number of active clients holding lease", + "FilesTotal": "Current number of files and directories", + "PendingReplicationBlocks": "Current number of blocks pending to be replicated", + "UnderReplicatedBlocks": "Current number of blocks under replicated", + "CorruptBlocks": "Current number of blocks with corrupt replicas.", + "ScheduledReplicationBlocks": "Current number of blocks scheduled for replications", + "PendingDeletionBlocks": "Current number of blocks pending deletion", + "ExcessBlocks": "Current number of excess blocks", + "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", + "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", + "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", + "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. 
In active NameNode, set to 0", + "BlockCapacity": "Current number of block capacity", + "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", + "TotalSyncCount": "Total number of sync operations performed by edit log", + "FsLockQueueLength": "Filesystem lock queue length", + "MaxObjects": "Max objects", + "BlockDeletionStartTime": "Start time of block deletion", + "NumLiveDataNodes": "Number of datanodes which are currently live", + "NumDeadDataNodes": "Number of datanodes which are currently dead", + "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", + "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", + "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", + "NumStaleDataNodes": "Number of datanodes marked as content stale", + "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", + "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", + "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", + "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", + "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation", + "Total": "Total", + "TotalBlocks": "Total number of blocks", + "Used": "Total used space by data nodes", + "Free": "Total free space by data nodes", + "Safemode": "Is in safe mode. 0: no, 1: yes", + "NonDfsUsedSpace": "Total used space by data nodes for non DFS purposes such as storing temporary files on the local file system", + "PercentUsed": "Total used space by data nodes as percentage of total capacity", + "BlockPoolUsedSpace": "Block pool used space", + "PercentBlockPoolUsed": "Percent of block pool used", + "PercentRemaining": "Total remaining space by data nodes as percentage of total capacity", + "CacheCapacity": "Cache Capacity", + "CacheUsed": "Cache Used", + "TotalFiles": "Total Files", + "NumberOfMissingBlocks": "Number of missing blocks", + "NumberOfMissingBlocksWithReplicationFactorOne": "Number of missing blocks with replication factor one", + "LiveNodes": "Live nodes", + "SoftwareVersion": "Software version", + "DeadNodes": "Dead nodes", + "DecomNodes": "Decom nodes", + "EnteringMaintenanceNodes": "Entering maintenance nodes", + "NodeUsage": "Node Usage", + "NNStartedTimeInMillis": "NameNode started time in millis", + "CorruptFiles": "Corrupt file list", + "CacheHit": "Total number of RetryCache hit.", + "CacheCleared": "Total number of RetryCache cleared.", + "CacheUpdated": "Total number of RetryCache updated.", + "ElapsedTime": "Total elapsed time in milliseconds.", + "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", + "LoadingFsImageCount": "", + "LoadingFsImageElapsedTime": "", + "LoadingFsImageTotal": "", + "LoadingFsImagePercentComplete": "", + "LoadingEditsCount": "", + "LoadingEditsElapsedTime": "", + "LoadingEditsTotal": "", + "LoadingEditsPercentComplete": "", + "SavingCheckpointCount": "", + "SavingCheckpointElapsedTime": "", + "SavingCheckpointTotal": "", + "SavingCheckpointPercentComplete": "", + "SafeModeCount": "", + "SafeModeElapsedTime": "", + "SafeModeTotal": "", + "SafeModePercentComplete": "" +} diff --git a/metrics_orig/namenode/NameNodeActivity.json b/metrics_orig/namenode/NameNodeActivity.json new file mode 100644 index 
0000000..b5850fe --- /dev/null +++ b/metrics_orig/namenode/NameNodeActivity.json @@ -0,0 +1,41 @@ +{ + "CreateFileOps": "Total number of files created.", + "FilesCreated": "Total number of files and directories created by create or mkdir operations.", + "FilesAppended": "Total number of files appended.", + "GetBlockLocations": "Total number of getBlockLocations operations.", + "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", + "GetListingOps": "Total number of directory listing operations.", + "DeleteFileOps": "Total number of delete operations.", + "FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", + "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", + "AddBlockOps": "Total number of addBlock operations succeeded.", + "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", + "CreateSymlinkOps": "Total number of createSymlink operations.", + "GetLinkTargetOps": "Total number of getLinkTarget operations.", + "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", + "AllowSnapshotOps": "Total number of allowSnapshot operations.", + "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", + "CreateSnapshotOps": "Total number of createSnapshot operations.", + "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", + "RenameSnapshotOps": "Total number of renameSnapshot operations.", + "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", + "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", + "TransactionsNumOps": "Total number of Journal transactions.", + "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", + "SyncsNumOps": "Total number of Journal syncs.", + "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", + "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", + "BlockReportNumOps": "Total number of processing block reports from DataNode.", + "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", + "CacheReportNumOps": "Total number of processing cache reports from DataNode.", + "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", + "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. (sometimes not equal to the time in SafeMode, see HDFS-5156).", + "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", + "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", + "GetEditAvgTime": "Average edits download time in milliseconds.", + "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", + "GetImageAvgTime": "Average fsimage download time in milliseconds.", + "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", + "PutImageAvgTime": "Average fsimage upload time in milliseconds.", + "TotalFileOps": "Total number of all file operations." +} diff --git a/metrics_orig/namenode/NameNodeInfo.json b/metrics_orig/namenode/NameNodeInfo.json new file mode 100644 index 0000000..0d353b4 --- /dev/null +++ b/metrics_orig/namenode/NameNodeInfo.json @@ -0,0 +1,25 @@ +{ + "Total": "Total", + "TotalBlocks": "Total number of blocks", + "Used": "Total used space by data nodes", + "Free": "Total free space by data nodes", + "Safemode": "Is in safe mode. 
0: no, 1: yes", + "NonDfsUsedSpace": "Total used space by data nodes for non DFS purposes such as storing temporary files on the local file system", + "PercentUsed": "Total used space by data nodes as percentage of total capacity", + "BlockPoolUsedSpace": "Block pool used space", + "PercentBlockPoolUsed": "Percent of block pool used", + "PercentRemaining": "Total remaining space by data nodes as percentage of total capacity", + "CacheCapacity": "Cache Capacity", + "CacheUsed": "Cache Used", + "TotalFiles": "Total Files", + "NumberOfMissingBlocks": "Number of missing blocks", + "NumberOfMissingBlocksWithReplicationFactorOne": "Number of missing blocks with replication factor one", + "LiveNodes": "Live nodes", + "SoftwareVersion": "Software version", + "DeadNodes": "Dead nodes", + "DecomNodes": "Decom nodes", + "EnteringMaintenanceNodes": "Entering maintenance nodes", + "NodeUsage": "Node Usage", + "NNStartedTimeInMillis": "NameNode started time in millis", + "CorruptFiles": "Corrupt file list" +} diff --git a/metrics_orig/namenode/RetryCache.json b/metrics_orig/namenode/RetryCache.json new file mode 100644 index 0000000..caa37b3 --- /dev/null +++ b/metrics_orig/namenode/RetryCache.json @@ -0,0 +1,5 @@ +{ + "CacheHit": "Total number of RetryCache hit.", + "CacheCleared": "Total number of RetryCache cleared.", + "CacheUpdated": "Total number of RetryCache updated." +} diff --git a/metrics_orig/namenode/StartupProgress.json b/metrics_orig/namenode/StartupProgress.json new file mode 100644 index 0000000..d25e8a4 --- /dev/null +++ b/metrics_orig/namenode/StartupProgress.json @@ -0,0 +1,20 @@ +{ + "ElapsedTime": "Total elapsed time in milliseconds.", + "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", + "LoadingFsImageCount": "", + "LoadingFsImageElapsedTime": "", + "LoadingFsImageTotal": "", + "LoadingFsImagePercentComplete": "", + "LoadingEditsCount": "", + "LoadingEditsElapsedTime": "", + "LoadingEditsTotal": "", + "LoadingEditsPercentComplete": "", + "SavingCheckpointCount": "", + "SavingCheckpointElapsedTime": "", + "SavingCheckpointTotal": "", + "SavingCheckpointPercentComplete": "", + "SafeModeCount": "", + "SafeModeElapsedTime": "", + "SafeModeTotal": "", + "SafeModePercentComplete": "" +} \ No newline at end of file diff --git a/metrics_orig/nodemanager/NodeManagerMetrics.json b/metrics_orig/nodemanager/NodeManagerMetrics.json new file mode 100644 index 0000000..daa5ba2 --- /dev/null +++ b/metrics_orig/nodemanager/NodeManagerMetrics.json @@ -0,0 +1,19 @@ +{ + "ContainersLaunched": "Count of launched container", + "ContainersCompleted": "Count of completed container", + "ContainersFailed": "Count of failed container", + "ContainersKilled": "Count of killed container", + "ContainersIniting": "Count of initing container", + "ContainersRunning": "Count of running container", + "AllocatedGB": "Memory size of allocated (in GB)", + "AllocatedContainers": "Count of allocated container", + "AvailableGB": "Memory size of available (in GB)", + "AllocatedVCores": "Count of allocated VCores", + "AvailableVCores": "Count of available VCores", + "ContainerLaunchDurationNumOps": "Count of launched container", + "ContainerLaunchDurationAvgTime": "Average time of launching container (in ms)", + "BadLocalDirs": "Count of bad local directory", + "BadLogDirs": "Count of bad log directory", + "GoodLocalDirsDiskUtilizationPerc": "Percent of good local directory disk utilization", + "GoodLogDirsDiskUtilizationPerc": "Percent of good local log directory 
disk utilization" +} diff --git a/metrics_orig/nodemanager/ShuffleMetrics.json b/metrics_orig/nodemanager/ShuffleMetrics.json new file mode 100644 index 0000000..7d8d041 --- /dev/null +++ b/metrics_orig/nodemanager/ShuffleMetrics.json @@ -0,0 +1,6 @@ +{ + "ShuffleOutputBytes": "Output byte of shuffle", + "ShuffleOutputsFailed": "Output failed of shuffle", + "ShuffleOutputsOK": "Output ok of shuffle", + "ShuffleConnections": "Connection count of shuffle" +} diff --git a/metrics_orig/resourcemanager/ClusterMetrics.json b/metrics_orig/resourcemanager/ClusterMetrics.json new file mode 100644 index 0000000..71af2cc --- /dev/null +++ b/metrics_orig/resourcemanager/ClusterMetrics.json @@ -0,0 +1,11 @@ +{ + "NumActiveNMs": "Current number of active NodeManagers", + "NumDecommissionedNMs": "Current number of decommissioned NodeManagers", + "NumLostNMs": "Current number of lost NodeManagers for not sending heartbeats", + "NumUnhealthyNMs": "Current number of unhealthy NodeManagers", + "NumRebootedNMs": "Current number of rebooted NodeManagers", + "AMLaunchDelayNumOps": "Total number of AMs launched", + "AMLaunchDelayAvgTime": "Average time in milliseconds RM spends to launch AM containers after the AM container is allocated", + "AMRegisterDelayNumOps": "Total number of AMs registered", + "AMRegisterDelayAvgTime": "Average time in milliseconds AM spends to register with RM after the AM container gets launched" +} \ No newline at end of file diff --git a/metrics_orig/resourcemanager/QueueMetrics.json b/metrics_orig/resourcemanager/QueueMetrics.json new file mode 100644 index 0000000..497f42d --- /dev/null +++ b/metrics_orig/resourcemanager/QueueMetrics.json @@ -0,0 +1,33 @@ +{ + "running_0": "Current number of running applications whose elapsed time are less than 60 minutes.", + "running_60": "Current number of running applications whose elapsed time are between 60 and 300 minutes.", + "running_300": "Current number of running applications whose elapsed time are between 300 and 1440 minutes.", + "running_1440": "Current number of running applications elapsed time are more than 1440 minutes.", + "AppsSubmitted": "Total number of submitted applications.", + "AppsRunning": "Current number of running applications.", + "AppsPending": "Current number of applications that have not yet been assigned by any containers.", + "AppsCompleted": "Total number of completed applications.", + "AppsKilled": "Total number of killed applications.", + "AppsFailed": "Total number of failed applications.", + "AllocatedMB": "Current allocated memory in MB.", + "AllocatedVCores": "Current allocated CPU in virtual cores.", + "AllocatedContainers": "Current number of allocated containers.", + "AggregateContainersAllocated": "Total number of allocated containers.", + "AggregateContainersReleased": "Total number of released containers.", + "AvailableMB": "Current available memory in MB.", + "AvailableVCores": "Current available CPU in virtual cores.", + "PendingMB": "Current pending memory resource requests in MB that are not yet fulfilled by the scheduler.", + "PendingVCores": "Current pending CPU allocation requests in virtual cores that are not yet fulfilled by the scheduler.", + "PendingContainers": "Current pending resource requests that are not yet fulfilled by the scheduler.", + "ReservedMB": "Current reserved memory in MB.", + "ReservedVCores": "Current reserved CPU in virtual cores.", + "ReservedContainers": "Current number of reserved containers.", + "ActiveUsers": "Current number of active users.", + 
"ActiveApplications": "Current number of active applications.", + "FairShareMB": "(FairScheduler only) Current fair share of memory in MB.", + "FairShareVCores": "(FairScheduler only) Current fair share of CPU in virtual cores.", + "MinShareMB": "(FairScheduler only) Minimum share of memory in MB.", + "MinShareVCores": "(FairScheduler only) Minimum share of CPU in virtual cores.", + "MaxShareMB": "(FairScheduler only) Maximum share of memory in MB.", + "MaxShareVCores": "(FairScheduler only) Maximum share of CPU in virtual cores." +} \ No newline at end of file diff --git a/metrics_orig/resourcemanager/RMNMInfo.json b/metrics_orig/resourcemanager/RMNMInfo.json new file mode 100644 index 0000000..8d70b64 --- /dev/null +++ b/metrics_orig/resourcemanager/RMNMInfo.json @@ -0,0 +1,6 @@ +{ + "NumContainers": "Total number of containers currently running on the host", + "State": "State of the host - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONED, LOST, REBOOTED", + "UsedMemoryMB": "The total amount of memory currently used on the host (in MB)", + "AvailableMemoryMB": "The total amount of memory currently available on the host (in MB)" +} \ No newline at end of file From ff7a9fbc501d6be1792e31760b2244e4fd98812a Mon Sep 17 00:00:00 2001 From: akenO8 Date: Mon, 1 Sep 2025 17:40:34 +0800 Subject: [PATCH 5/5] [UPDATE] add trino metric --- cmd/trino_coordinator.py | 14 +- cmd/trino_worker.py | 222 ++++++++++++++++++ cmd/utils.py | 1 + hadoop_jmx_exporter.py | 3 + metrics/coordinator/CkCache.json | 8 + .../coordinator/G1OldGarbageCollector.json | 6 + .../coordinator/G1YoungGarbageCollector.json | 6 + metrics/hivemetastore/api.json | 8 + metrics/worker/AlluxioCacheStats.json | 8 + metrics/worker/TaskExecutor.json | 4 + 10 files changed, 275 insertions(+), 5 deletions(-) create mode 100644 cmd/trino_worker.py create mode 100644 metrics/coordinator/CkCache.json create mode 100644 metrics/hivemetastore/api.json create mode 100644 metrics/worker/AlluxioCacheStats.json create mode 100644 metrics/worker/TaskExecutor.json diff --git a/cmd/trino_coordinator.py b/cmd/trino_coordinator.py index 5d85179..2f02e95 100644 --- a/cmd/trino_coordinator.py +++ b/cmd/trino_coordinator.py @@ -66,10 +66,12 @@ def setup_metrics_labels(self, beans): self.setup_trino_coor_labels('GcMonitor') if 'java.lang:name=G1 Young Generation,type=GarbageCollector' in beans[i]['objectName']: # self.setup_trino_coor_labels('G1YoungGarbageCollector') - self.setup_young_gc_labels(); + self.setup_young_gc_labels() if 'java.lang:name=G1 Old Generation,type=GarbageCollector' in beans[i]['objectName']: # self.setup_trino_coor_labels('G1OldGarbageCollector') - self.setup_old_gc_labels(); + self.setup_old_gc_labels() + if 'io.trino.plugin.jdbc:type=CachingJdbcClient,name=ck' in beans[i]['objectName']: + self.setup_trino_coor_labels('CkCache') def setup_trino_coor_labels(self, kind): label = ["cluster", "method", "_target"] @@ -143,10 +145,12 @@ def get_metrics(self, beans): self.get_trino_coor_labels(beans[i], 'GcMonitor') if 'java.lang:name=G1 Young Generation,type=GarbageCollector' in beans[i]['objectName']: # self.get_trino_coor_labels(beans[i], 'G1YoungGarbageCollector') - self.get_young_gc_labels(beans[i]); + self.get_young_gc_labels(beans[i]) if 'java.lang:name=G1 Old Generation,type=GarbageCollector' in beans[i]['objectName']: # self.get_trino_coor_labels(beans[i], 'G1OldGarbageCollector') - self.get_old_gc_labels(beans[i]); + self.get_old_gc_labels(beans[i]) + if 'io.trino.plugin.jdbc:type=CachingJdbcClient,name=ck' in 
beans[i]['objectName']: + self.get_trino_coor_labels(beans[i], 'CkCache') def get_trino_coor_labels(self, bean, kind): # type(bean) = dict @@ -207,4 +211,4 @@ def get_old_gc_labels(self, bean): elif attr['name'] == metric: key = kind value = attr['value'] - self.trino_coordinator_metrics[kind][key].add_metric(label, value) \ No newline at end of file + self.trino_coordinator_metrics[kind][key].add_metric(label, value) diff --git a/cmd/trino_worker.py b/cmd/trino_worker.py new file mode 100644 index 0000000..759d9e6 --- /dev/null +++ b/cmd/trino_worker.py @@ -0,0 +1,222 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from prometheus_client.core import GaugeMetricFamily +from scraper import ScrapeMetrics + +from utils import get_module_logger +from common import MetricCollector + +logger = get_module_logger(__name__) + + +class TrinoWorkerMetricCollector(MetricCollector): + + def __init__(self, cluster, urls): + MetricCollector.__init__(self, cluster, "trino", "worker") + self.trino_worker_prefix = 'trino_worker' + self.target = "-" + self.urls = urls + + self.trino_worker_metrics = {} + for i in range(len(self.file_list)): + self.trino_worker_metrics.setdefault(self.file_list[i], {}) + + self.scrape_metrics = ScrapeMetrics(urls) + + def collect(self): + isSetup = False + # isGetHost = False + beans_list = self.scrape_metrics.scrape() + for beans in beans_list: + if not isSetup: + self.setup_metrics_labels(beans) + isSetup = True + # if not isGetHost: + for i in range(len(beans)): + if 'java.lang:type=Runtime' in beans[i]['objectName']: + self.target = beans[i]['attributes'][0]['value'].split('@')[1] + # isGetHost = True + break + self.get_metrics(beans) + + for i in range(len(self.merge_list)): + service = self.merge_list[i] + if service in self.trino_worker_metrics: + for metric in self.trino_worker_metrics[service]: + yield self.trino_worker_metrics[service][metric] + + def setup_metrics_labels(self, beans): + for i in range(len(beans)): + # if 'java.lang:type=Memory' in beans[i]['objectName']: + # self.setup_trino_coor_labels('Memory') + # if 'java.lang:type=Threading' in beans[i]['objectName']: + # self.setup_trino_coor_labels('Threading') + # if 'trino.execution:name=QueryManager' in beans[i]['objectName']: + # self.setup_trino_coor_labels('QueryManager') + # if 'trino.execution:name=SqlTaskManager' in beans[i]['objectName']: + # self.setup_trino_coor_labels('SqlTaskManager') + # if 'trino.failuredetector:name=HeartbeatFailureDetector' in beans[i]['objectName']: + # self.setup_trino_coor_labels('HeartbeatFailureDetector') + # if 'trino.memory:name=ClusterMemoryManager' in beans[i]['objectName']: + # self.setup_trino_coor_labels('ClusterMemoryManager') + # if 'trino.memory:type=ClusterMemoryPool,name=general' in beans[i]['objectName']: + # self.setup_trino_coor_labels('ClusterMemoryPool') + # if 'io.airlift.stats:name=GcMonitor' in beans[i]['objectName']: + # self.setup_trino_coor_labels('GcMonitor') + # if 'java.lang:name=G1 Young Generation,type=GarbageCollector' in beans[i]['objectName']: + # # self.setup_trino_coor_labels('G1YoungGarbageCollector') + # self.setup_young_gc_labels() + # if 'java.lang:name=G1 Old Generation,type=GarbageCollector' in beans[i]['objectName']: + # # self.setup_trino_coor_labels('G1OldGarbageCollector') + # self.setup_old_gc_labels() + # if 'io.trino.plugin.jdbc:type=CachingJdbcClient,name=ck' in beans[i]['objectName']: + # self.setup_trino_coor_labels('CkCache') + if 'io.trino.filesystem.alluxio:type=AlluxioCacheStats,name=hive' in 
+
+    def setup_trino_worker_labels(self, kind):
+        label = ["cluster", "method", "_target"]
+        name = "_".join([self.trino_worker_prefix, kind])
+        description = "Trino Worker {0} metric.".format(kind)
+        # No finer breakdown for now; useful if keys are later subdivided within a kind.
+        key = kind
+        self.trino_worker_metrics[kind][key] = GaugeMetricFamily(name, description, labels=label)
+
+    def get_metrics(self, beans):
+        for i in range(len(beans)):
+            if 'io.trino.filesystem.alluxio:type=AlluxioCacheStats,name=hive' in beans[i]['objectName']:
+                self.get_trino_worker_labels(beans[i], 'AlluxioCacheStats')
+            if 'trino.execution.executor:name=TaskExecutor' in beans[i]['objectName']:
+                self.get_trino_worker_labels(beans[i], 'TaskExecutor')
+
+    def get_trino_worker_labels(self, bean, kind):
+        # type(bean) = dict
+        for metric in self.metrics[kind]:
+            key = kind
+            label = [self.cluster, '', self.target]
+            value = 0
+            for attr in bean['attributes']:
+                # type(attr) = dict
+                method = metric.replace('.', '_').replace(':', '_').replace('-', '_')
+                label = [self.cluster, method, self.target]
+                if attr['name'] == metric:
+                    value = attr['value']
+                    break
+            # Lazily create the gauge if this kind was missed during setup.
+            if key not in self.trino_worker_metrics[kind]:
+                self.setup_trino_worker_labels(kind)
+            self.trino_worker_metrics[kind][key].add_metric(label, value)
diff --git a/cmd/utils.py b/cmd/utils.py
index 8726b1f..1b8675d 100644
--- a/cmd/utils.py
+++ b/cmd/utils.py
@@ -69,6 +69,7 @@ def parse_args():
     parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*")
     parser.add_argument('-hss', required=False, metavar='hiveserver_jmx_url', help='Hadoop hiveserver jmx metrics URL.', nargs="*")
     parser.add_argument('-tcs', required=False, metavar='trino_coordinator_jmx_url', help='Trino coordinator jmx metrics URL.', nargs="*")
+    parser.add_argument('-tws', required=False, metavar='trino_worker_jmx_url', help='Trino worker jmx metrics URL.', nargs="*")
     parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0')
     parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. default: 6688', default=6688)
     return parser.parse_args()
diff --git a/hadoop_jmx_exporter.py b/hadoop_jmx_exporter.py
index 01c75ca..1242926 100755
--- a/hadoop_jmx_exporter.py
+++ b/hadoop_jmx_exporter.py
@@ -14,6 +14,7 @@
 from cmd.yarn_resourcemanager import ResourceManagerMetricCollector
 from cmd.hive_server import HiveServerMetricCollector
 from cmd.yarn_nodemanager import NodeManagerMetricCollector
+from cmd.trino_worker import TrinoWorkerMetricCollector
 
 logger = get_module_logger(__name__)
 
@@ -35,6 +36,8 @@ def register_prometheus(cluster, args):
         REGISTRY.register(HiveServerMetricCollector(cluster, args.hss))
     if args.tcs is not None and len(args.tcs) > 0:
         REGISTRY.register(TrinoCoordinatorMetricCollector(cluster, args.tcs))
+    if args.tws is not None and len(args.tws) > 0:
+        REGISTRY.register(TrinoWorkerMetricCollector(cluster, args.tws))
 
 
 def main():
diff --git a/metrics/coordinator/CkCache.json b/metrics/coordinator/CkCache.json
new file mode 100644
index 0000000..0f2c41f
--- /dev/null
+++ b/metrics/coordinator/CkCache.json
@@ -0,0 +1,8 @@
+{
+  "ColumnsCache.HitRate": "",
+  "SchemaNamesStats.HitRate": "",
+  "StatisticsCache.HitRate": "",
+  "TableHandlesByNameCache.HitRate": "",
+  "TableHandlesByQueryCache.HitRate": "",
+  "TableNamesCache.HitRate": ""
+}
\ No newline at end of file
diff --git a/metrics/coordinator/G1OldGarbageCollector.json b/metrics/coordinator/G1OldGarbageCollector.json
index 7290747..ebc062d 100644
--- a/metrics/coordinator/G1OldGarbageCollector.json
+++ b/metrics/coordinator/G1OldGarbageCollector.json
@@ -4,11 +4,17 @@
   "LastGcInfo.memoryUsageBeforeGc.Compressed Class Space": "",
   "LastGcInfo.memoryUsageBeforeGc.Metaspace": "",
   "LastGcInfo.memoryUsageBeforeGc.G1 Eden Space": "",
+  "LastGcInfo.memoryUsageBeforeGc.CodeHeap 'profiled nmethods'": "",
'non-profiled nmethods'": "", + "LastGcInfo.memoryUsageBeforeGc.CodeHeap 'non-nmethods'": "", "LastGcInfo.memoryUsageAfterGc.G1 Old Gen": "", "LastGcInfo.memoryUsageAfterGc.G1 Survivor Space": "", "LastGcInfo.memoryUsageAfterGc.Compressed Class Space": "", "LastGcInfo.memoryUsageAfterGc.Metaspace": "", "LastGcInfo.memoryUsageAfterGc.G1 Eden Space": "", + "LastGcInfo.memoryUsageAfterGc.CodeHeap 'profiled nmethods'": "", + "LastGcInfo.memoryUsageAfterGc.CodeHeap 'non-profiled nmethods'": "", + "LastGcInfo.memoryUsageAfterGc.CodeHeap 'non-nmethods'": "", "CollectionCount": "FGC", "CollectionTime": "FGCT" } \ No newline at end of file diff --git a/metrics/coordinator/G1YoungGarbageCollector.json b/metrics/coordinator/G1YoungGarbageCollector.json index b108d50..72e5028 100644 --- a/metrics/coordinator/G1YoungGarbageCollector.json +++ b/metrics/coordinator/G1YoungGarbageCollector.json @@ -4,11 +4,17 @@ "LastGcInfo.memoryUsageBeforeGc.Compressed Class Space": "", "LastGcInfo.memoryUsageBeforeGc.Metaspace": "", "LastGcInfo.memoryUsageBeforeGc.G1 Eden Space": "", + "LastGcInfo.memoryUsageBeforeGc.CodeHeap 'profiled nmethods'": "", + "LastGcInfo.memoryUsageBeforeGc.CodeHeap 'non-profiled nmethods'": "", + "LastGcInfo.memoryUsageBeforeGc.CodeHeap 'non-nmethods'": "", "LastGcInfo.memoryUsageAfterGc.G1 Old Gen": "", "LastGcInfo.memoryUsageAfterGc.G1 Survivor Space": "", "LastGcInfo.memoryUsageAfterGc.Compressed Class Space": "", "LastGcInfo.memoryUsageAfterGc.Metaspace": "", "LastGcInfo.memoryUsageAfterGc.G1 Eden Space": "", + "LastGcInfo.memoryUsageAfterGc.CodeHeap 'profiled nmethods'": "", + "LastGcInfo.memoryUsageAfterGc.CodeHeap 'non-profiled nmethods'": "", + "LastGcInfo.memoryUsageAfterGc.CodeHeap 'non-nmethods'": "", "CollectionCount": "YGC", "CollectionTime": "YGCT" } \ No newline at end of file diff --git a/metrics/hivemetastore/api.json b/metrics/hivemetastore/api.json new file mode 100644 index 0000000..c9e37eb --- /dev/null +++ b/metrics/hivemetastore/api.json @@ -0,0 +1,8 @@ +{ + "CacheReads.AllTime.Avg": "", + "CacheReads.AllTime.Count": "", + "CacheReads.AllTime.Total": "", + "ExternalReads.AllTime.Avg": "", + "ExternalReads.AllTime.Count": "", + "ExternalReads.AllTime.Total": "" +} \ No newline at end of file diff --git a/metrics/worker/AlluxioCacheStats.json b/metrics/worker/AlluxioCacheStats.json new file mode 100644 index 0000000..c9e37eb --- /dev/null +++ b/metrics/worker/AlluxioCacheStats.json @@ -0,0 +1,8 @@ +{ + "CacheReads.AllTime.Avg": "", + "CacheReads.AllTime.Count": "", + "CacheReads.AllTime.Total": "", + "ExternalReads.AllTime.Avg": "", + "ExternalReads.AllTime.Count": "", + "ExternalReads.AllTime.Total": "" +} \ No newline at end of file diff --git a/metrics/worker/TaskExecutor.json b/metrics/worker/TaskExecutor.json new file mode 100644 index 0000000..1a9ae18 --- /dev/null +++ b/metrics/worker/TaskExecutor.json @@ -0,0 +1,4 @@ +{ + "RunnerThreads": "", + "RunningSplits": "" +} \ No newline at end of file