diff --git a/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man b/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man index 217e4739e..c695ba87c 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man +++ b/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man @@ -19,7 +19,7 @@ GPU power consumption in Watts (aggregated across all GPUs on the socket, and reported as -1 on unsupported platforms); and memory power consumption in Watts. .PP -The variorum sampler depends on Variorum 0.6.0 or higher and Jansson. The sampler cannot be built without these libraries. If either library is installed in a non-standard location, paths to the respective install directories should be provided to Autoconf using +The variorum sampler depends on Variorum 0.8.0 or higher and Jansson. The sampler cannot be built without these libraries. If either library is installed in a non-standard location, paths to the respective install directories should be provided to Autoconf using the --with-libjansson-prefix and/or --with-libvariorum-prefix flag. .SH CONFIGURATION ATTRIBUTE SYNTAX diff --git a/ldms/src/contrib/sampler/variorum_sampler/README.md b/ldms/src/contrib/sampler/variorum_sampler/README.md index abc2983f0..2f46fe469 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/README.md +++ b/ldms/src/contrib/sampler/variorum_sampler/README.md @@ -9,7 +9,7 @@ architecture and implementation. Build Requirements ------------------ -The Variorum LDMS sampler currently requires version 0.6.0 or higher +The Variorum LDMS sampler currently requires version 0.8.0 or higher of the Variorum library (``libvariorum.so``). This library must be built from source. The sampler also requires jansson, which is a Variorum dependency. 
If both libraries are installed in standard locations, @@ -52,7 +52,7 @@ Using the Variorum LDMS Sampler The sampler, when configured, automatically detects the number of sockets on the host machine and then provides, for each socket, an LDMS record -containing power data. The sampler calls ``variorum_get_node_power_json`` +containing power data. The sampler calls ``variorum_get_power_json`` internally, for which documentation can be found here: [Variorum JSON-Support Functions](https://variorum.readthedocs.io/en/latest/api/json_support_functions.html) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index dfeb7196d..c882ac386 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -25,16 +25,14 @@ static ldms_set_t set = NULL; static ovis_log_t mylog; static base_data_t base; static int nsockets; -static const char *SOCKET_METRICS[] = {"power_cpu_watts_socket_", "power_gpu_watts_socket_", "power_mem_watts_socket_"}; -static char** metric_names = NULL; static int i_node; static int i_sock; static int i_cpu; static int i_gpu; static int i_mem; static int lh_idx; -static ldms_mval_t* rec_idxs; -static char* result_string; +static ldms_mval_t *rec_idxs; +static char *result_string; static int create_metric_set(base_data_t base) { @@ -44,24 +42,18 @@ static int create_metric_set(base_data_t base) ldms_schema_t schema; ldms_mval_t rec_inst; - // allocate space for metric names - if (!metric_names) { - metric_names = malloc(3 * nsockets * sizeof(char*)); - } - for (metric = 0; metric < (3 * nsockets); metric++) { - metric_names[metric] = malloc(39); - } - // allocate space for record pointers - if (!rec_idxs) { + if (!rec_idxs) + { rec_idxs = malloc(nsockets * sizeof(ldms_mval_t)); } schema = base_schema_new(base); - if (!schema) { + if (!schema) + { ovis_log(mylog, OVIS_LERROR, - "%s: The schema 
'%s' could not be created, errno=%d.\n", - __FILE__, base->schema_name, errno); + "%s: The schema '%s' could not be created, errno=%d.\n", + __FILE__, base->schema_name, errno); rc = errno; goto err; } @@ -84,14 +76,16 @@ static int create_metric_set(base_data_t base) int lh_idx = ldms_schema_metric_list_add(schema, "power", NULL, heap_sz); set = base_set_new(base); - if (!set) { + if (!set) + { rc = errno; goto err; } ldms_mval_t lh = ldms_metric_get(set, lh_idx); - for(socket = 0; socket < nsockets; socket++) { + for (socket = 0; socket < nsockets; socket++) + { // create a new record rec_inst = ldms_record_alloc(set, rec_def_idx); rec_idxs[socket] = rec_inst; @@ -99,23 +93,15 @@ static int create_metric_set(base_data_t base) ldms_record_set_u64(rec_inst, i_sock, socket); // put the record into the list ldms_list_append_record(set, lh, rec_inst); - // create metric name list (for querying json object later on) - for(metric = 0; metric < 3; metric++) { - strcpy(metric_name,SOCKET_METRICS[metric]); - sprintf(socket_num,"%d",socket); - strcat(metric_name,socket_num); - strcpy(metric_names[(metric*nsockets)+socket], metric_name); - } } // allocate space for sampling JSON data depending on number of sockets - result_string = (char *) malloc((nsockets * 150 + 500) * sizeof(char)); + result_string = (char *)malloc((nsockets * 150 + 500) * sizeof(char)); return 0; - err: +err: return rc; - } static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct attr_value_list *avl) @@ -124,7 +110,8 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct int rc; int depth; - if (set) { + if (set) + { ovis_log(mylog, OVIS_LERROR, "Set already created.\n"); return EINVAL; } @@ -132,21 +119,29 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct // determine number of sockets nsockets = variorum_get_num_sockets(); + // 9/12/2024: TP Note: We need to know number of GPUs per socket here + // so we can create 
a metric set that also gives per GPU power. + // We don't have a good solution for this. We could call the JSON API + // once and parse that value out, but the overhead for that can be high. + // To be addressed after the first pass works. + // prepare the base for metric collection base = base_config(avl, SAMP, SAMP, mylog); - if (!base) { + if (!base) + { rc = errno; goto err; } rc = create_metric_set(base); - if (rc) { + if (rc) + { ovis_log(mylog, OVIS_LERROR, "failed to create a metric set.\n"); goto err; } return 0; - err: +err: base_del(base); return rc; } @@ -154,9 +149,11 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct static int sample(struct ldmsd_sampler *self) { json_t *power_obj = NULL; + json_t *node_obj = NULL; int ret, socket; - if (!set) { + if (!set) + { ovis_log(mylog, OVIS_LERROR, "plugin not initialized\n"); return EINVAL; } @@ -164,57 +161,124 @@ static int sample(struct ldmsd_sampler *self) base_sample_begin(base); // get variorum data - ret = variorum_get_node_power_json(&result_string); - if (ret != 0) { + ret = variorum_get_power_json(&result_string); + if (ret != 0) + { ovis_log(mylog, OVIS_LERROR, "unable to obtain JSON object data\n"); return EINVAL; } power_obj = json_loads(result_string, JSON_DECODE_ANY, NULL); + void *iter = json_object_iter(power_obj); + while (iter) + { + node_obj = json_object_iter_value(iter); + if (node_obj == NULL) + { + printf("JSON object not found"); + exit(0); + } + /* The following should return NULL after the first call per our object. */ + iter = json_object_iter_next(power_obj, iter); + } + + double power_node = -1.0; + double power_cpu = -1.0; + double power_gpu = -1.0; + double power_mem = -1.0; + int num_gpus_per_socket = -1; + char socketID[20]; + + // If we're on a GPU-only build, we don't have power_node_watts. 
+ if (json_object_get(node_obj, "power_node_watts") != NULL) + { + power_node = json_real_value(json_object_get(node_obj, "power_node_watts")); + } - double power_node = json_real_value(json_object_get(power_obj, "power_node_watts")); - double power_cpu, power_gpu, power_mem; + // If we're on a CPU-only build, we don't have num_gpus_per_socket + if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) + { + num_gpus_per_socket = json_integer_value(json_object_get(node_obj, + "num_gpus_per_socket")); + } // update each record - for(socket = 0; socket < nsockets; socket++) { + for (socket = 0; socket < nsockets; socket++) + { + // Node power is the same for every socket. ldms_record_set_double(rec_idxs[socket], i_node, power_node); - power_cpu = json_real_value(json_object_get(power_obj, metric_names[socket])); + + // Obtain Socket Object + snprintf(socketID, 20, "socket_%d", socket); + json_t *socket_obj = json_object_get(node_obj, socketID); + if (socket_obj == NULL) + { + printf("Socket object not found!\n"); + exit(0); + } + + // If we're on a GPU-only build, we don't have power_cpu_watts + if (json_object_get(socket_obj, "power_cpu_watts") != NULL) + { + power_cpu = json_real_value(json_object_get(socket_obj, "power_cpu_watts")); + } + + // If we're on a GPU-only build on an unsupported platform, + // we don't have power_mem_watts. + if (json_object_get(socket_obj, "power_mem_watts") != NULL) + { + power_mem = json_real_value(json_object_get(socket_obj, "power_mem_watts")); + } + + // If we have GPUs, obtain the GPU object + if (num_gpus_per_socket > 0) + { + json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts"); + if (gpu_obj == NULL) + { + printf("GPU object not found! \n"); + exit(0); + } + const char *key; + json_t *value; + power_gpu = 0.0; + + json_object_foreach(gpu_obj, key, value) + { + // We will add power of multiple GPUs as a first cut. See the TP note in config() about per-GPU power.
+ power_gpu += json_real_value(value); + } + } + + // Set the LDMS records for the socket ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu); - power_gpu = json_real_value(json_object_get(power_obj, metric_names[nsockets+socket])); ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); - power_mem = json_real_value(json_object_get(power_obj, metric_names[(2*nsockets)+socket])); ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); } - ldms_metric_modify(set, lh_idx); json_decref(power_obj); base_sample_end(base); return 0; - } static void term(struct ldmsd_plugin *self) { - int metric; - - if (metric_names) { - for (metric = 0; metric < 3 * nsockets; metric++) { - free(metric_names[metric]); - } - free(metric_names); - } - if (result_string) { + if (result_string) + { free(result_string); } - if (rec_idxs) { + if (rec_idxs) + { free(rec_idxs); } - if (base) { + if (base) + { base_del(base); } - if (set) { + if (set) + { ldms_set_delete(set); } set = NULL; @@ -222,25 +286,26 @@ static void term(struct ldmsd_plugin *self) static const char *usage(struct ldmsd_plugin *self) { - return "config name=" SAMP " " BASE_CONFIG_USAGE; + return "config name=" SAMP " " BASE_CONFIG_USAGE; } static struct ldmsd_sampler variorum_sampler_plugin = { - .base = { - .name = SAMP, - .type = LDMSD_PLUGIN_SAMPLER, - .term = term, - .config = config, - .usage= usage, - }, - .sample = sample, + .base = { + .name = SAMP, + .type = LDMSD_PLUGIN_SAMPLER, + .term = term, + .config = config, + .usage = usage, + }, + .sample = sample, }; struct ldmsd_plugin *get_plugin() { - mylog = ovis_log_register("sampler."SAMP, "Messages for the " SAMP " plugin"); - if (!mylog) { - ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's log subsystem"); - } - return &variorum_sampler_plugin.base; + mylog = ovis_log_register("sampler." 
SAMP, "Messages for the " SAMP " plugin"); + if (!mylog) + { + ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's log subsystem"); + } + return &variorum_sampler_plugin.base; }