diff --git a/ldms/python/ldmsd/ldmsd_controller b/ldms/python/ldmsd/ldmsd_controller index 5f3bc63af..a50a3632f 100755 --- a/ldms/python/ldmsd/ldmsd_controller +++ b/ldms/python/ldmsd/ldmsd_controller @@ -1079,12 +1079,13 @@ class LdmsdCmdParser(cmd.Cmd): rc, msg = self.comm.strgp_status(arg['name']) if rc == 0: policies = fmt_status(msg) - print("Name Container Schema Plugin flush(sec) State") - print("---------------- ---------------- ---------------- ---------------- ------------ ----------------") + print(f"{'Name':16} {'Container':16} {'Schema':16} {'Regex':16} {'Plugin':16} {'Flush':16} {'State':10} {'Decomposition':20}") + print(f"{'-'*16} {'-'*16} {'-'*16} {'-'*16} {'-'*16} {'-'*16} {'-'*10} {'-'*20}") for strgp in policies: - print("{0:16} {1:16} {2:16} {3:16} {4:16} {5}".format( - strgp['name'], strgp['container'], strgp['schema'], - strgp['plugin'], strgp['flush'], strgp['state'])) + print(f"{strgp['name']:16} {strgp['container']:16} " + f"{strgp['schema']:16} {strgp['regex']:16} " + f"{strgp['plugin']:16} {strgp['flush']:16} {strgp['state']:10} " + f"{strgp['decomp']}") print(" producers: ", end='') for prdcr in strgp['producers']: print("{0} ".format(prdcr), end='') diff --git a/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics.cpp b/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics.cpp index 21626d71d..9d95462f7 100644 --- a/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics.cpp +++ b/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics.cpp @@ -62,31 +62,31 @@ void printGpuMetrics(ze_device_handle_t device, uint32_t devNumber) { uint32_t metricNumber = 0; cout << fixed << setprecision(2); cout << " " << devNumber << " " << metricNumber++ << " " - << getGpuUtilization(device) << " gpu_util (%)" << endl; + << getGpuUtilization(device) << " gpu_util" << endl; cout << " " << devNumber << " " << metricNumber++ << " " - << getMemoryUtilization(device) << " mem_util (%)" << endl; + << getMemoryUtilization(device) << " mem_util" << endl; cout << " " << devNumber << " " << metricNumber++ << " " << getMemVRAMUsed(device) << " mem_vram_used" << endl; cout << " " << devNumber << " " << metricNumber++ << " " - << getSysClockFreq(device) << " sys_clock_freq (MHz)" << endl; + << getSysClockFreq(device) << " sys_clock_freq" << endl; cout << " " << devNumber << " " << metricNumber++ << " " - << getMemoryReadBandwidth(device) << " mem_read_bandwidth (kilobaud)" << endl; + << getMemoryReadBandwidth(device) << " mem_read_bandwidth" << endl; cout << " " << devNumber << " " << metricNumber++ << " " - << getMemoryWriteBandwidth(device) << " mem_write_bandwidth (kilobaud)" << endl; + << getMemoryWriteBandwidth(device) << " mem_write_bandwidth" << endl; cout << " " << devNumber << " " << metricNumber++ << " " << getPerfLevel(device) << " perf_level" << endl; cout << " " << devNumber << " " << metricNumber++ << " " - << getPowerUsage(device) << " power_usage (mW)" << endl; + << getPowerUsage(device) << " power_usage" << endl; cout << " " << devNumber << " " << metricNumber++ << " " - << getGpuTemp(device) << " gpu_temp (Celsius)" << endl; + << getGpuTemp(device) << " gpu_temp" << endl; cout << " " << devNumber << " " << metricNumber++ << " " << getRasFatalAcceleratorResetsError(device) << " ue_accelerator_eng_err" << endl; diff --git a/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics_from_one_api.h b/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics_from_one_api.h index 6350b608e..84e7943ce 100644 --- a/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics_from_one_api.h +++ b/ldms/src/contrib/sampler/gpu_metrics_sampler/gather_gpu_metrics_from_one_api.h @@ -56,7 +56,7 @@ #include -#define SAMP "gpu_metrics" +#define SAMP "gpumetrics" #define MAX_METRIC_NAME_LENGTH 256 #define MAX_NUMBER_DEVICE_INDEX 255 diff --git a/ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_ldms_util.c b/ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_ldms_util.c index a6bd7ceab..30c10c95e 100644 --- a/ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_ldms_util.c +++ b/ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_ldms_util.c @@ -58,8 +58,8 @@ const metric_t metricsDefinitions[] = { {.name = "device_name", .type = LDMS_V_CHAR_ARRAY, .pf = (funcPtr_t) getGpuDeviceName, .count = ZE_MAX_DEVICE_NAME}, {.name = "device_uuid", .type = LDMS_V_U8_ARRAY, .pf = (funcPtr_t) getGpuUuid, .count = ZE_MAX_DEVICE_UUID_SIZE}, {.name = "serial_number", .type = LDMS_V_CHAR_ARRAY, .pf = (funcPtr_t) getGpuSerialNumber, .count = ZES_STRING_PROPERTY_SIZE}, - {.name = "gpu_util (%)", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuUtilization}, - {.name = "mem_util (%)", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryUtilization}, + {.name = "gpu_util", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuUtilization}, + {.name = "mem_util", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryUtilization}, {.name = "mem_vram_used", .type = LDMS_V_U64, .pf = (funcPtr_t) getMemVRAMUsed}, {.name = "ue_accelerator_eng_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasFatalAcceleratorResetsError}, {.name = "ue_cache_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasFatalCachesError}, @@ -75,14 +75,14 @@ const metric_t metricsDefinitions[] = { {.name = "ce_compute_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasCorrectableComputeError}, {.name = "ce_non_compute_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasCorrectableNonComputeError}, {.name = "ce_display_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasCorrectableDisplayError}, - {.name = "sys_clock_freq (MHz)", .type = LDMS_V_S32, .pf = (funcPtr_t) getSysClockFreq}, - {.name = "mem_read_bandwidth (kilobaud)", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryReadBandwidth}, - {.name = "mem_write_bandwidth (kilobaud)", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryWriteBandwidth}, + {.name = "sys_clock_freq", .type = LDMS_V_S32, .pf = (funcPtr_t) getSysClockFreq}, + {.name = "mem_read_bandwidth", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryReadBandwidth}, + {.name = "mem_write_bandwidth", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryWriteBandwidth}, {.name = "perf_level", .type = LDMS_V_D64, .pf = (funcPtr_t) getPerfLevel}, - {.name = "power_usage (mW)", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerUsage}, -// {.name = "power_cap (mW)", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerCap}, // no longer supported - {.name = "gpu_temp (Celsius)", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuTemp}, -// {.name = "pci_max_bandwidth (baud)", .type = LDMS_V_U64, .pf = (funcPtr_t) getPciMaxSpeed} // currently OneAPI does not support this + {.name = "power_usage", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerUsage}, +// {.name = "power_cap", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerCap}, // no longer supported + {.name = "gpu_temp", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuTemp} +// {.name = "pci_max_bandwidth", .type = LDMS_V_U64, .pf = (funcPtr_t) getPciMaxSpeed} // currently OneAPI does not support this }; const size_t c_numMetrics = sizeof(metricsDefinitions) / sizeof(metricsDefinitions[0]); @@ -92,7 +92,7 @@ const size_t c_numMetrics = sizeof(metricsDefinitions) / sizeof(metricsDefinitio */ void constructMetricName(const char *szBaseMetricName, uint8_t deviceId, char *szMetricName) { - snprintf(szMetricName, MAX_METRIC_NAME_LENGTH, "gpu%02x.", deviceId); + snprintf(szMetricName, MAX_METRIC_NAME_LENGTH, "gpu%02x_", deviceId); strncpy(szMetricName + 6, szBaseMetricName, MAX_METRIC_NAME_LENGTH - 6); GMGLOG(LDMSD_LDEBUG, "metricName = %s\n", szMetricName); } diff --git a/ldms/src/ldmsd/ldmsctl.c b/ldms/src/ldmsd/ldmsctl.c index 1e5632bdc..79d384af8 100644 --- a/ldms/src/ldmsd/ldmsctl.c +++ b/ldms/src/ldmsd/ldmsctl.c @@ -1336,25 +1336,29 @@ void __print_strgp_status(json_entity_t strgp) if (strgp->type != JSON_DICT_VALUE) goto invalid_result_format; - json_entity_t name, container, schema, plugin, state, flush; + json_entity_t name, container, schema, regex, plugin, state, flush, decomp; name = json_value_find(strgp, "name"); container = json_value_find(strgp, "container"); schema = json_value_find(strgp, "schema"); + regex = json_value_find(strgp, "regex"); plugin = json_value_find(strgp, "plugin"); state = json_value_find(strgp, "state"); flush = json_value_find(strgp, "flush"); + decomp = json_value_find(strgp, "decomp"); - if (!name || !container || !schema || !plugin || !state || !flush) + if (!name || !container || !plugin || !state || !flush || !regex || !decomp) goto invalid_result_format; - printf("%-16s %-16s %-16s %-16s %-16s %s\n", + printf("%-16s %-16s %-16s %-16s %-16s %-12s %-10s %s\n", json_value_str(name)->str, json_value_str(container)->str, json_value_str(schema)->str, + json_value_str(regex)->str, json_value_str(plugin)->str, json_value_str(flush)->str, - json_value_str(state)->str); + json_value_str(state)->str, + json_value_str(decomp)->str); json_entity_t prdcrs, metrics; prdcrs = json_value_find(strgp, "producers"); @@ -1421,8 +1425,8 @@ static void resp_strgp_status(ldmsd_req_hdr_t resp, size_t len, uint32_t rsp_err printf("Unrecognized producer status format\n"); goto out; } - printf("Name Container Schema Plugin Flush(sec) State\n"); - printf("---------------- ---------------- ---------------- ---------------- ------------ ------------\n"); + printf("Name Container Schema Regex Plugin Flush(sec) State Decomposition\n"); + printf("---------------- ---------------- ---------------- ---------------- ---------------- ------------ ---------- --------------------- \n"); for (strgp = json_item_first(json); strgp; strgp = json_item_next(strgp)) { __print_strgp_status(strgp); @@ -2866,7 +2870,7 @@ int main(int argc, char *argv[]) host = port = sockname = xprt = NULL; char *source, *script; source = script = NULL; - int rc, is_inband = 1; + int is_inband = 1; struct attr_value_list *auth_opt = NULL; const int AUTH_OPT_MAX = 128; ssize_t cnt; @@ -3008,9 +3012,7 @@ int main(int argc, char *argv[]) add_history(linebuf); #endif /* HAVE_READLINE_HISTORY */ - rc = __handle_cmd(ctrl, linebuf); - if (rc) - break; + (void) __handle_cmd(ctrl, linebuf); } while (linebuf); ctrl->close(ctrl); diff --git a/ldms/src/ldmsd/ldmsd.h b/ldms/src/ldmsd/ldmsd.h index be3fa5b89..b337adb23 100644 --- a/ldms/src/ldmsd/ldmsd.h +++ b/ldms/src/ldmsd/ldmsd.h @@ -475,6 +475,7 @@ struct ldmsd_strgp { /** Regular expression for the schema */ regex_t schema_regex; + char *regex_s; struct ldmsd_stat stat; int prdset_cnt; /* Number of producer sets strgp stores */ diff --git a/ldms/src/ldmsd/ldmsd_request.c b/ldms/src/ldmsd/ldmsd_request.c index ce16a7e69..deadb0efd 100644 --- a/ldms/src/ldmsd/ldmsd_request.c +++ b/ldms/src/ldmsd/ldmsd_request.c @@ -2530,6 +2530,9 @@ static int strgp_add_handler(ldmsd_req_ctxt_t reqc) char regex_err[512] = ""; if (regex) { + strgp->regex_s = strdup(regex); + if (!strgp->regex_s) + goto enomem; rc = ldmsd_compile_regex(&strgp->schema_regex, regex, regex_err, sizeof(regex_err)); if (rc) goto eregex; @@ -3025,17 +3028,21 @@ int __strgp_status_json_obj(ldmsd_req_ctxt_t reqc, ldmsd_strgp_t strgp, "{\"name\":\"%s\"," "\"container\":\"%s\"," "\"schema\":\"%s\"," + "\"regex\":\"%s\"," "\"plugin\":\"%s\"," "\"flush\":\"%ld.%06ld\"," "\"state\":\"%s\"," + "\"decomp\":\"%s\"," "\"producers\":[", strgp->obj.name, strgp->container, - strgp->schema, + ((strgp->schema)?strgp->schema:"-"), + ((strgp->regex_s)?strgp->regex_s:"-"), strgp->plugin_name, strgp->flush_interval.tv_sec, (strgp->flush_interval.tv_nsec/1000), - ldmsd_strgp_state_str(strgp->state)); + ldmsd_strgp_state_str(strgp->state), + ((strgp->decomp_name)?strgp->decomp_name:"-")); if (rc) goto out; diff --git a/ldms/src/ldmsd/ldmsd_strgp.c b/ldms/src/ldmsd/ldmsd_strgp.c index 819420e35..10bb17821 100644 --- a/ldms/src/ldmsd/ldmsd_strgp.c +++ b/ldms/src/ldmsd/ldmsd_strgp.c @@ -75,6 +75,7 @@ void ldmsd_strgp___del(ldmsd_cfgobj_t obj) free(strgp->container); if (strgp->metric_arry) free(strgp->metric_arry); + free(strgp->regex_s); struct ldmsd_strgp_metric *metric; while (!TAILQ_EMPTY(&strgp->metric_list) ) {