Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] change schema name to aggregate data #1188

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions ldms/python/ldmsd/ldmsd_controller
Original file line number Diff line number Diff line change
Expand Up @@ -1079,12 +1079,13 @@ class LdmsdCmdParser(cmd.Cmd):
rc, msg = self.comm.strgp_status(arg['name'])
if rc == 0:
policies = fmt_status(msg)
print("Name Container Schema Plugin flush(sec) State")
print("---------------- ---------------- ---------------- ---------------- ------------ ----------------")
print(f"{'Name':16} {'Container':16} {'Schema':16} {'Regex':16} {'Plugin':16} {'Flush':16} {'State':10} {'Decomposition':20}")
print(f"{'-'*16} {'-'*16} {'-'*16} {'-'*16} {'-'*16} {'-'*16} {'-'*10} {'-'*20}")
for strgp in policies:
print("{0:16} {1:16} {2:16} {3:16} {4:16} {5}".format(
strgp['name'], strgp['container'], strgp['schema'],
strgp['plugin'], strgp['flush'], strgp['state']))
print(f"{strgp['name']:16} {strgp['container']:16} "
f"{strgp['schema']:16} {strgp['regex']:16} "
f"{strgp['plugin']:16} {strgp['flush']:16} {strgp['state']:10} "
f"{strgp['decomp']}")
print(" producers: ", end='')
for prdcr in strgp['producers']:
print("{0} ".format(prdcr), end='')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,31 +62,31 @@ void printGpuMetrics(ze_device_handle_t device, uint32_t devNumber) {
uint32_t metricNumber = 0;
cout << fixed << setprecision(2);
cout << " " << devNumber << " " << metricNumber++ << " "
<< getGpuUtilization(device) << " gpu_util (%)" << endl;
<< getGpuUtilization(device) << " gpu_util" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getMemoryUtilization(device) << " mem_util (%)" << endl;
<< getMemoryUtilization(device) << " mem_util" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getMemVRAMUsed(device) << " mem_vram_used" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getSysClockFreq(device) << " sys_clock_freq (MHz)" << endl;
<< getSysClockFreq(device) << " sys_clock_freq" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getMemoryReadBandwidth(device) << " mem_read_bandwidth (kilobaud)" << endl;
<< getMemoryReadBandwidth(device) << " mem_read_bandwidth" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getMemoryWriteBandwidth(device) << " mem_write_bandwidth (kilobaud)" << endl;
<< getMemoryWriteBandwidth(device) << " mem_write_bandwidth" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getPerfLevel(device) << " perf_level" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getPowerUsage(device) << " power_usage (mW)" << endl;
<< getPowerUsage(device) << " power_usage" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getGpuTemp(device) << " gpu_temp (Celsius)" << endl;
<< getGpuTemp(device) << " gpu_temp" << endl;

cout << " " << devNumber << " " << metricNumber++ << " "
<< getRasFatalAcceleratorResetsError(device) << " ue_accelerator_eng_err" << endl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
#include <level_zero/zes_api.h>


#define SAMP "gpu_metrics"
#define SAMP "gpumetrics"
#define MAX_METRIC_NAME_LENGTH 256
#define MAX_NUMBER_DEVICE_INDEX 255

Expand Down
20 changes: 10 additions & 10 deletions ldms/src/contrib/sampler/gpu_metrics_sampler/gmg_ldms_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ const metric_t metricsDefinitions[] = {
{.name = "device_name", .type = LDMS_V_CHAR_ARRAY, .pf = (funcPtr_t) getGpuDeviceName, .count = ZE_MAX_DEVICE_NAME},
{.name = "device_uuid", .type = LDMS_V_U8_ARRAY, .pf = (funcPtr_t) getGpuUuid, .count = ZE_MAX_DEVICE_UUID_SIZE},
{.name = "serial_number", .type = LDMS_V_CHAR_ARRAY, .pf = (funcPtr_t) getGpuSerialNumber, .count = ZES_STRING_PROPERTY_SIZE},
{.name = "gpu_util (%)", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuUtilization},
{.name = "mem_util (%)", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryUtilization},
{.name = "gpu_util", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuUtilization},
{.name = "mem_util", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryUtilization},
{.name = "mem_vram_used", .type = LDMS_V_U64, .pf = (funcPtr_t) getMemVRAMUsed},
{.name = "ue_accelerator_eng_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasFatalAcceleratorResetsError},
{.name = "ue_cache_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasFatalCachesError},
Expand All @@ -75,14 +75,14 @@ const metric_t metricsDefinitions[] = {
{.name = "ce_compute_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasCorrectableComputeError},
{.name = "ce_non_compute_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasCorrectableNonComputeError},
{.name = "ce_display_err", .type = LDMS_V_S32, .pf = (funcPtr_t) getRasCorrectableDisplayError},
{.name = "sys_clock_freq (MHz)", .type = LDMS_V_S32, .pf = (funcPtr_t) getSysClockFreq},
{.name = "mem_read_bandwidth (kilobaud)", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryReadBandwidth},
{.name = "mem_write_bandwidth (kilobaud)", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryWriteBandwidth},
{.name = "sys_clock_freq", .type = LDMS_V_S32, .pf = (funcPtr_t) getSysClockFreq},
{.name = "mem_read_bandwidth", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryReadBandwidth},
{.name = "mem_write_bandwidth", .type = LDMS_V_D64, .pf = (funcPtr_t) getMemoryWriteBandwidth},
{.name = "perf_level", .type = LDMS_V_D64, .pf = (funcPtr_t) getPerfLevel},
{.name = "power_usage (mW)", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerUsage},
// {.name = "power_cap (mW)", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerCap}, // no longer supported
{.name = "gpu_temp (Celsius)", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuTemp},
// {.name = "pci_max_bandwidth (baud)", .type = LDMS_V_U64, .pf = (funcPtr_t) getPciMaxSpeed} // currently OneAPI does not support this
{.name = "power_usage", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerUsage},
// {.name = "power_cap", .type = LDMS_V_S32, .pf = (funcPtr_t) getPowerCap}, // no longer supported
{.name = "gpu_temp", .type = LDMS_V_D64, .pf = (funcPtr_t) getGpuTemp}
// {.name = "pci_max_bandwidth", .type = LDMS_V_U64, .pf = (funcPtr_t) getPciMaxSpeed} // currently OneAPI does not support this
};

const size_t c_numMetrics = sizeof(metricsDefinitions) / sizeof(metricsDefinitions[0]);
Expand All @@ -92,7 +92,7 @@ const size_t c_numMetrics = sizeof(metricsDefinitions) / sizeof(metricsDefinitio
*/

/**
 * Build the per-device metric name "gpuXX_<base>" into szMetricName, where
 * XX is the device id as two lowercase hex digits.
 *
 * @param szBaseMetricName NUL-terminated base metric name (e.g. "gpu_util").
 * @param deviceId         Device index used for the "gpuXX_" prefix.
 * @param szMetricName     Output buffer; must hold at least
 *                         MAX_METRIC_NAME_LENGTH bytes.
 *
 * The result is always NUL-terminated; a name that does not fit is truncated
 * to MAX_METRIC_NAME_LENGTH - 1 characters.
 */
void constructMetricName(const char *szBaseMetricName, uint8_t deviceId, char *szMetricName) {
    /*
     * Format prefix and base name in one bounded call. The previous
     * snprintf + strncpy(szMetricName + 6, ...) sequence left the buffer
     * unterminated when the base name filled the remaining space, and
     * depended on the prefix being exactly 6 characters long.
     */
    snprintf(szMetricName, MAX_METRIC_NAME_LENGTH, "gpu%02x_%s",
             (unsigned int) deviceId, szBaseMetricName);
    GMGLOG(LDMSD_LDEBUG, "metricName = %s\n", szMetricName);
}
Expand Down
22 changes: 12 additions & 10 deletions ldms/src/ldmsd/ldmsctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1336,25 +1336,29 @@ void __print_strgp_status(json_entity_t strgp)
if (strgp->type != JSON_DICT_VALUE)
goto invalid_result_format;

json_entity_t name, container, schema, plugin, state, flush;
json_entity_t name, container, schema, regex, plugin, state, flush, decomp;

name = json_value_find(strgp, "name");
container = json_value_find(strgp, "container");
schema = json_value_find(strgp, "schema");
regex = json_value_find(strgp, "regex");
plugin = json_value_find(strgp, "plugin");
state = json_value_find(strgp, "state");
flush = json_value_find(strgp, "flush");
decomp = json_value_find(strgp, "decomp");

if (!name || !container || !schema || !plugin || !state || !flush)
if (!name || !container || !plugin || !state || !flush || !regex || !decomp)
goto invalid_result_format;

printf("%-16s %-16s %-16s %-16s %-16s %s\n",
printf("%-16s %-16s %-16s %-16s %-16s %-12s %-10s %s\n",
json_value_str(name)->str,
json_value_str(container)->str,
json_value_str(schema)->str,
json_value_str(regex)->str,
json_value_str(plugin)->str,
json_value_str(flush)->str,
json_value_str(state)->str);
json_value_str(state)->str,
json_value_str(decomp)->str);

json_entity_t prdcrs, metrics;
prdcrs = json_value_find(strgp, "producers");
Expand Down Expand Up @@ -1421,8 +1425,8 @@ static void resp_strgp_status(ldmsd_req_hdr_t resp, size_t len, uint32_t rsp_err
printf("Unrecognized producer status format\n");
goto out;
}
printf("Name Container Schema Plugin Flush(sec) State\n");
printf("---------------- ---------------- ---------------- ---------------- ------------ ------------\n");
printf("Name Container Schema Regex Plugin Flush(sec) State Decomposition\n");
printf("---------------- ---------------- ---------------- ---------------- ---------------- ------------ ---------- --------------------- \n");

for (strgp = json_item_first(json); strgp; strgp = json_item_next(strgp)) {
__print_strgp_status(strgp);
Expand Down Expand Up @@ -2866,7 +2870,7 @@ int main(int argc, char *argv[])
host = port = sockname = xprt = NULL;
char *source, *script;
source = script = NULL;
int rc, is_inband = 1;
int is_inband = 1;
struct attr_value_list *auth_opt = NULL;
const int AUTH_OPT_MAX = 128;
ssize_t cnt;
Expand Down Expand Up @@ -3008,9 +3012,7 @@ int main(int argc, char *argv[])
add_history(linebuf);
#endif /* HAVE_READLINE_HISTORY */

rc = __handle_cmd(ctrl, linebuf);
if (rc)
break;
(void) __handle_cmd(ctrl, linebuf);
} while (linebuf);

ctrl->close(ctrl);
Expand Down
1 change: 1 addition & 0 deletions ldms/src/ldmsd/ldmsd.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ struct ldmsd_strgp {

/** Regular expression for the schema */
regex_t schema_regex;
char *regex_s;

struct ldmsd_stat stat;
int prdset_cnt; /* Number of producer sets strgp stores */
Expand Down
11 changes: 9 additions & 2 deletions ldms/src/ldmsd/ldmsd_request.c
Original file line number Diff line number Diff line change
Expand Up @@ -2530,6 +2530,9 @@ static int strgp_add_handler(ldmsd_req_ctxt_t reqc)

char regex_err[512] = "";
if (regex) {
strgp->regex_s = strdup(regex);
if (!strgp->regex_s)
goto enomem;
rc = ldmsd_compile_regex(&strgp->schema_regex, regex, regex_err, sizeof(regex_err));
if (rc)
goto eregex;
Expand Down Expand Up @@ -3025,17 +3028,21 @@ int __strgp_status_json_obj(ldmsd_req_ctxt_t reqc, ldmsd_strgp_t strgp,
"{\"name\":\"%s\","
"\"container\":\"%s\","
"\"schema\":\"%s\","
"\"regex\":\"%s\","
"\"plugin\":\"%s\","
"\"flush\":\"%ld.%06ld\","
"\"state\":\"%s\","
"\"decomp\":\"%s\","
"\"producers\":[",
strgp->obj.name,
strgp->container,
strgp->schema,
((strgp->schema)?strgp->schema:"-"),
((strgp->regex_s)?strgp->regex_s:"-"),
strgp->plugin_name,
strgp->flush_interval.tv_sec,
(strgp->flush_interval.tv_nsec/1000),
ldmsd_strgp_state_str(strgp->state));
ldmsd_strgp_state_str(strgp->state),
((strgp->decomp_name)?strgp->decomp_name:"-"));
if (rc)
goto out;

Expand Down
1 change: 1 addition & 0 deletions ldms/src/ldmsd/ldmsd_strgp.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ void ldmsd_strgp___del(ldmsd_cfgobj_t obj)
free(strgp->container);
if (strgp->metric_arry)
free(strgp->metric_arry);
free(strgp->regex_s);

struct ldmsd_strgp_metric *metric;
while (!TAILQ_EMPTY(&strgp->metric_list) ) {
Expand Down