From 96d63b98dac4bc5dd9af7485818ab76041b29e18 Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Mon, 26 Aug 2024 19:15:39 -0700 Subject: [PATCH 1/7] Work in progress to update the LDMS Variorum Sampler to use Variorum v0.8 --- .../sampler/variorum_sampler/variorum_sampler.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index dfeb7196d..abfc9f7f2 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -154,6 +154,7 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct static int sample(struct ldmsd_sampler *self) { json_t *power_obj = NULL; + json_t *node_obj = NULL; int ret, socket; if (!set) { @@ -164,14 +165,25 @@ static int sample(struct ldmsd_sampler *self) base_sample_begin(base); // get variorum data - ret = variorum_get_node_power_json(&result_string); + ret = variorum_get_power_json(&result_string); if (ret != 0) { ovis_log(mylog, OVIS_LERROR, "unable to obtain JSON object data\n"); return EINVAL; } power_obj = json_loads(result_string, JSON_DECODE_ANY, NULL); + void *iter = json_object_iter(power_obj); + while (iter) { + node_obj = json_object_iter_value(iter); + if (node_obj == NULL) { + printf("JSON object not found"); + exit(0); + } + /* The following should return NULL after the first call per our object. */ + iter = json_object_iter_next(power_obj, iter); + } + // TODO UPDATE FROM HERE, Check for GPU-onnly, CPU-only and BOTH build. double power_node = json_real_value(json_object_get(power_obj, "power_node_watts")); double power_cpu, power_gpu, power_mem; From 942fd5207216c1178eb1d3e5df90e150c0d17f30 Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Thu, 12 Sep 2024 23:26:57 -0700 Subject: [PATCH 2/7] Edited to variorum0.8, untested --- .../variorum_sampler/variorum_sampler.c | 95 +++++++++++++++++-- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index abfc9f7f2..bdee44302 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -25,7 +25,7 @@ static ldms_set_t set = NULL; static ovis_log_t mylog; static base_data_t base; static int nsockets; -static const char *SOCKET_METRICS[] = {"power_cpu_watts_socket_", "power_gpu_watts_socket_", "power_mem_watts_socket_"}; +static const char *SOCKET_METRICS[] = {"power_cpu_watts", "power_gpu_watts", "power_mem_watts"}; static char** metric_names = NULL; static int i_node; static int i_sock; @@ -100,6 +100,8 @@ static int create_metric_set(base_data_t base) // put the record into the list ldms_list_append_record(set, lh, rec_inst); // create metric name list (for querying json object later on) + //TP NOTE: This is not needed anymore, as we don't need to append the ID here. + // Will fix this once the initial build works. for(metric = 0; metric < 3; metric++) { strcpy(metric_name,SOCKET_METRICS[metric]); sprintf(socket_num,"%d",socket); @@ -132,6 +134,12 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct // determine number of sockets nsockets = variorum_get_num_sockets(); + // 9/12/2024: TP Note: We need to know number of GPUs per socket here + // so we can create a metric set that also gives per GPU power. + // We don't have a good solution for this. We could call the JSON API + // once and parse that value out, but the overhead for that can be high. + // To be addressed after the first pass works. + // prepare the base for metric collection base = base_config(avl, SAMP, SAMP, mylog); if (!base) { @@ -183,12 +191,87 @@ static int sample(struct ldmsd_sampler *self) iter = json_object_iter_next(power_obj, iter); } - // TODO UPDATE FROM HERE, Check for GPU-onnly, CPU-only and BOTH build. - double power_node = json_real_value(json_object_get(power_obj, "power_node_watts")); - double power_cpu, power_gpu, power_mem; + double power_node = -1.0; + double power_cpu = -1.0; + // We will add power of multiple GPUs as a first cut. See note on line 135. + double power_gpu = -1.0; + double power_mem = -1.0; + int num_gpus_per_socket = -1; + char socketID[20]; + + // If we're on a GPU-only build, we don't have power_node_watts. + if (json_object_get(node_obj, "power_node_watts") != NULL) + { + power_node = json_real_value(json_object_get(node_obj, "power_node_watts")); + // printf("Node Power: %0.2lf Watts\n", power_node); + } + + // If we're on a CPU-only build, we don't have num_gpus_per_socket + if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) + { + num_gpus_per_socket = json_integer_value(json_object_get(node_obj, + "num_gpus_per_socket")); + // printf("Number of GPUs per socket: %d\n", num_gpus_per_socket); + } // update each record - for(socket = 0; socket < nsockets; socket++) { + for(socket = 0; socket < nsockets; socket++) + { + // Node power is same on both sockets. + ldms_record_set_double(rec_idxs[socket], i_node, power_node); + + // Obtain Socket Object + snprintf(socketID, 20, "socket_%d", i); + json_t *socket_obj = json_object_get(node_obj, socketID); + if (socket_obj == NULL) + { + printf("Socket object not found!\n"); + exit(0); + } + + // If we're on a GPU-only build, we don't have power_cpu_watts + if (json_object_get(socket_obj, "power_cpu_watts") != NULL) + { + power_cpu = json_real_value(json_object_get(socket_obj, "power_cpu_watts")); + // printf("Socket %d, CPU Power: %0.2lf Watts\n", i, power_cpu); + } + + // If we're on a GPU-only build on an unsupported platform, + // we don't have power_mem_watts. + if (json_object_get(socket_obj, "power_mem_watts") != NULL) + { + power_mem = json_real_value(json_object_get(socket_obj, "power_mem_watts")); + // printf("Socket %d, Mem Power: %0.2lf Watts\n", i, power_mem); + } + + // If we have GPUs, obtatin the GPU object + // As a first cut, add up the power of multiple GPUs on that socket. + if (num_gpus_per_socket > 0) + { + json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts"); + if (gpu_obj == NULL) + { + printf("GPU object not found! \n"); + exit(0); + } + const char *key; + json_t *value; + power_gpu = 0.0; + + json_object_foreach(gpu_obj, key, value) + { + power_gpu += json_real_value(value); + // printf("Socket %d, %s Power: %0.2lf Watts\n", i, key, json_real_value(value)); + } + } + + // Set the LDMS records for the socket + ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu); + ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); + ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); + } + } + /* ldms_record_set_double(rec_idxs[socket], i_node, power_node); power_cpu = json_real_value(json_object_get(power_obj, metric_names[socket])); ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu); @@ -196,7 +279,7 @@ static int sample(struct ldmsd_sampler *self) ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); power_mem = json_real_value(json_object_get(power_obj, metric_names[(2*nsockets)+socket])); ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); - } + */ ldms_metric_modify(set, lh_idx); From ec26b627472df64f428742daae08a45ff9d0e58f Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Thu, 12 Sep 2024 23:35:31 -0700 Subject: [PATCH 3/7] Fix minor compilation error --- .../variorum_sampler/variorum_sampler.c | 30 +++++++------------ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index bdee44302..838dcd66d 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -200,57 +200,49 @@ static int sample(struct ldmsd_sampler *self) char socketID[20]; // If we're on a GPU-only build, we don't have power_node_watts. - if (json_object_get(node_obj, "power_node_watts") != NULL) - { + if (json_object_get(node_obj, "power_node_watts") != NULL) { power_node = json_real_value(json_object_get(node_obj, "power_node_watts")); // printf("Node Power: %0.2lf Watts\n", power_node); } // If we're on a CPU-only build, we don't have num_gpus_per_socket - if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) - { + if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) { num_gpus_per_socket = json_integer_value(json_object_get(node_obj, "num_gpus_per_socket")); // printf("Number of GPUs per socket: %d\n", num_gpus_per_socket); } // update each record - for(socket = 0; socket < nsockets; socket++) - { + for(socket = 0; socket < nsockets; socket++) { // Node power is same on both sockets. ldms_record_set_double(rec_idxs[socket], i_node, power_node); // Obtain Socket Object snprintf(socketID, 20, "socket_%d", i); json_t *socket_obj = json_object_get(node_obj, socketID); - if (socket_obj == NULL) - { + if (socket_obj == NULL) { printf("Socket object not found!\n"); exit(0); } // If we're on a GPU-only build, we don't have power_cpu_watts - if (json_object_get(socket_obj, "power_cpu_watts") != NULL) - { + if (json_object_get(socket_obj, "power_cpu_watts") != NULL) { power_cpu = json_real_value(json_object_get(socket_obj, "power_cpu_watts")); // printf("Socket %d, CPU Power: %0.2lf Watts\n", i, power_cpu); } // If we're on a GPU-only build on an unsupported platform, // we don't have power_mem_watts. - if (json_object_get(socket_obj, "power_mem_watts") != NULL) - { + if (json_object_get(socket_obj, "power_mem_watts") != NULL) { power_mem = json_real_value(json_object_get(socket_obj, "power_mem_watts")); // printf("Socket %d, Mem Power: %0.2lf Watts\n", i, power_mem); } // If we have GPUs, obtatin the GPU object // As a first cut, add up the power of multiple GPUs on that socket. - if (num_gpus_per_socket > 0) - { + if (num_gpus_per_socket > 0) { json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts"); - if (gpu_obj == NULL) - { + if (gpu_obj == NULL) { printf("GPU object not found! \n"); exit(0); } @@ -258,8 +250,7 @@ static int sample(struct ldmsd_sampler *self) json_t *value; power_gpu = 0.0; - json_object_foreach(gpu_obj, key, value) - { + json_object_foreach(gpu_obj, key, value) { power_gpu += json_real_value(value); // printf("Socket %d, %s Power: %0.2lf Watts\n", i, key, json_real_value(value)); } @@ -269,8 +260,7 @@ static int sample(struct ldmsd_sampler *self) ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu); ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); - } - } + } /* ldms_record_set_double(rec_idxs[socket], i_node, power_node); power_cpu = json_real_value(json_object_get(power_obj, metric_names[socket])); From 1b26bb94d51a03f1cb4b0bde156dc8e9615d4bee Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Thu, 12 Sep 2024 23:37:30 -0700 Subject: [PATCH 4/7] Fix another minor error --- ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index 838dcd66d..16acd136d 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -218,7 +218,7 @@ static int sample(struct ldmsd_sampler *self) ldms_record_set_double(rec_idxs[socket], i_node, power_node); // Obtain Socket Object - snprintf(socketID, 20, "socket_%d", i); + snprintf(socketID, 20, "socket_%d", socket); json_t *socket_obj = json_object_get(node_obj, socketID); if (socket_obj == NULL) { printf("Socket object not found!\n"); From d480434029eba843ae57208d9c1dfd17938e7faf Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Fri, 13 Sep 2024 10:51:13 -0700 Subject: [PATCH 5/7] Comment metrics-names to test before cleaning up. --- .../variorum_sampler/variorum_sampler.c | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index 16acd136d..4217816f1 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -25,8 +25,8 @@ static ldms_set_t set = NULL; static ovis_log_t mylog; static base_data_t base; static int nsockets; -static const char *SOCKET_METRICS[] = {"power_cpu_watts", "power_gpu_watts", "power_mem_watts"}; -static char** metric_names = NULL; +// static const char *SOCKET_METRICS[] = {"power_cpu_watts", "power_gpu_watts", "power_mem_watts"}; +// static char** metric_names = NULL; static int i_node; static int i_sock; static int i_cpu; @@ -44,13 +44,14 @@ static int create_metric_set(base_data_t base) ldms_schema_t schema; ldms_mval_t rec_inst; - // allocate space for metric names - if (!metric_names) { - metric_names = malloc(3 * nsockets * sizeof(char*)); - } - for (metric = 0; metric < (3 * nsockets); metric++) { - metric_names[metric] = malloc(39); - } + // DELETE + // // allocate space for metric names + // if (!metric_names) { + // metric_names = malloc(3 * nsockets * sizeof(char*)); + // } + // for (metric = 0; metric < (3 * nsockets); metric++) { + // metric_names[metric] = malloc(39); + // } // allocate space for record pointers if (!rec_idxs) { @@ -99,15 +100,17 @@ static int create_metric_set(base_data_t base) ldms_record_set_u64(rec_inst, i_sock, socket); // put the record into the list ldms_list_append_record(set, lh, rec_inst); + + // DELETE // create metric name list (for querying json object later on) //TP NOTE: This is not needed anymore, as we don't need to append the ID here. // Will fix this once the initial build works. - for(metric = 0; metric < 3; metric++) { - strcpy(metric_name,SOCKET_METRICS[metric]); - sprintf(socket_num,"%d",socket); - strcat(metric_name,socket_num); - strcpy(metric_names[(metric*nsockets)+socket], metric_name); - } + // for(metric = 0; metric < 3; metric++) { + // strcpy(metric_name,SOCKET_METRICS[metric]); + // sprintf(socket_num,"%d",socket); + // strcat(metric_name,socket_num); + // strcpy(metric_names[(metric*nsockets)+socket], metric_name); + // } } // allocate space for sampling JSON data depending on number of sockets @@ -261,6 +264,7 @@ static int sample(struct ldmsd_sampler *self) ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); } + // DELETE /* ldms_record_set_double(rec_idxs[socket], i_node, power_node); power_cpu = json_real_value(json_object_get(power_obj, metric_names[socket])); @@ -284,12 +288,14 @@ static void term(struct ldmsd_plugin *self) { int metric; - if (metric_names) { - for (metric = 0; metric < 3 * nsockets; metric++) { - free(metric_names[metric]); - } - free(metric_names); - } + // DELETE + // if (metric_names) { + // for (metric = 0; metric < 3 * nsockets; metric++) { + // free(metric_names[metric]); + // } + // free(metric_names); + // } + if (result_string) { free(result_string); } From 879d227cfd1ff5789cb9296da9c98099971e7ce5 Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Wed, 18 Sep 2024 14:30:46 -0700 Subject: [PATCH 6/7] Code cleanup, update Readme and man file. --- .../Plugin_variorum_sampler.man | 2 +- .../sampler/variorum_sampler/README.md | 4 +- .../variorum_sampler/variorum_sampler.c | 59 ++----------------- 3 files changed, 8 insertions(+), 57 deletions(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man b/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man index 217e4739e..c695ba87c 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man +++ b/ldms/src/contrib/sampler/variorum_sampler/Plugin_variorum_sampler.man @@ -19,7 +19,7 @@ GPU power consumption in Watts (aggregated across all GPUs on the socket, and reported as -1 on unsupported platforms); and memory power consumption in Watts. .PP -The variorum sampler depends on Variorum 0.6.0 or higher and Jansson. The sampler cannot be built without these libraries. If either library is installed in a non-standard location, paths to the respective install directories should be provided to Autoconf using +The variorum sampler depends on Variorum 0.8.0 or higher and Jansson. The sampler cannot be built without these libraries. If either library is installed in a non-standard location, paths to the respective install directories should be provided to Autoconf using the --with-libjansson-prefix and/or --with-libvariorum-prefix flag. .SH CONFIGURATION ATTRIBUTE SYNTAX diff --git a/ldms/src/contrib/sampler/variorum_sampler/README.md b/ldms/src/contrib/sampler/variorum_sampler/README.md index abc2983f0..2f46fe469 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/README.md +++ b/ldms/src/contrib/sampler/variorum_sampler/README.md @@ -9,7 +9,7 @@ architecture and implementation. Build Requirements ------------------ -The Variorum LDMS sampler currently requires version 0.6.0 or higher +The Variorum LDMS sampler currently requires version 0.8.0 or higher of the Variorum library (``libvariorum.so``). This library must be built from source. The sampler also requires jansson, which is a Variorum dependency. If both libraries are installed in standard locations, @@ -52,7 +52,7 @@ Using the Variorum LDMS Sampler The sampler, when configured, automatically detects the number of sockets on the host machine and then provides, for each socket, an LDMS record -containing power data. The sampler calls ``variorum_get_node_power_json`` +containing power data. The sampler calls ``variorum_get_power_json`` internally, for which documentation can be found here: [Variorum JSON-Support Functions](https://variorum.readthedocs.io/en/latest/api/json_support_functions.html) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index 4217816f1..7606e4fdc 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -25,8 +25,6 @@ static ldms_set_t set = NULL; static ovis_log_t mylog; static base_data_t base; static int nsockets; -// static const char *SOCKET_METRICS[] = {"power_cpu_watts", "power_gpu_watts", "power_mem_watts"}; -// static char** metric_names = NULL; static int i_node; static int i_sock; static int i_cpu; @@ -44,15 +42,6 @@ static int create_metric_set(base_data_t base) ldms_schema_t schema; ldms_mval_t rec_inst; - // DELETE - // // allocate space for metric names - // if (!metric_names) { - // metric_names = malloc(3 * nsockets * sizeof(char*)); - // } - // for (metric = 0; metric < (3 * nsockets); metric++) { - // metric_names[metric] = malloc(39); - // } - // allocate space for record pointers if (!rec_idxs) { rec_idxs = malloc(nsockets * sizeof(ldms_mval_t)); @@ -99,18 +88,7 @@ static int create_metric_set(base_data_t base) // set the socket number ldms_record_set_u64(rec_inst, i_sock, socket); // put the record into the list - ldms_list_append_record(set, lh, rec_inst); - - // DELETE - // create metric name list (for querying json object later on) - //TP NOTE: This is not needed anymore, as we don't need to append the ID here. - // Will fix this once the initial build works. - // for(metric = 0; metric < 3; metric++) { - // strcpy(metric_name,SOCKET_METRICS[metric]); - // sprintf(socket_num,"%d",socket); - // strcat(metric_name,socket_num); - // strcpy(metric_names[(metric*nsockets)+socket], metric_name); - // } + ldms_list_append_record(set, lh, rec_inst); } // allocate space for sampling JSON data depending on number of sockets @@ -196,7 +174,6 @@ static int sample(struct ldmsd_sampler *self) double power_node = -1.0; double power_cpu = -1.0; - // We will add power of multiple GPUs as a first cut. See note on line 135. double power_gpu = -1.0; double power_mem = -1.0; int num_gpus_per_socket = -1; @@ -205,15 +182,13 @@ static int sample(struct ldmsd_sampler *self) // If we're on a GPU-only build, we don't have power_node_watts. if (json_object_get(node_obj, "power_node_watts") != NULL) { power_node = json_real_value(json_object_get(node_obj, "power_node_watts")); - // printf("Node Power: %0.2lf Watts\n", power_node); } // If we're on a CPU-only build, we don't have num_gpus_per_socket if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) { num_gpus_per_socket = json_integer_value(json_object_get(node_obj, "num_gpus_per_socket")); - // printf("Number of GPUs per socket: %d\n", num_gpus_per_socket); - } + } // update each record for(socket = 0; socket < nsockets; socket++) { @@ -231,18 +206,15 @@ static int sample(struct ldmsd_sampler *self) // If we're on a GPU-only build, we don't have power_cpu_watts if (json_object_get(socket_obj, "power_cpu_watts") != NULL) { power_cpu = json_real_value(json_object_get(socket_obj, "power_cpu_watts")); - // printf("Socket %d, CPU Power: %0.2lf Watts\n", i, power_cpu); } // If we're on a GPU-only build on an unsupported platform, // we don't have power_mem_watts. if (json_object_get(socket_obj, "power_mem_watts") != NULL) { power_mem = json_real_value(json_object_get(socket_obj, "power_mem_watts")); - // printf("Socket %d, Mem Power: %0.2lf Watts\n", i, power_mem); - } + } // If we have GPUs, obtatin the GPU object - // As a first cut, add up the power of multiple GPUs on that socket. if (num_gpus_per_socket > 0) { json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts"); if (gpu_obj == NULL) { @@ -254,9 +226,9 @@ static int sample(struct ldmsd_sampler *self) power_gpu = 0.0; json_object_foreach(gpu_obj, key, value) { + // We will add power of multiple GPUs as a first cut. See note on line 135. power_gpu += json_real_value(value); - // printf("Socket %d, %s Power: %0.2lf Watts\n", i, key, json_real_value(value)); - } + } } // Set the LDMS records for the socket @@ -264,17 +236,6 @@ static int sample(struct ldmsd_sampler *self) ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); } - // DELETE - /* - ldms_record_set_double(rec_idxs[socket], i_node, power_node); - power_cpu = json_real_value(json_object_get(power_obj, metric_names[socket])); - ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu); - power_gpu = json_real_value(json_object_get(power_obj, metric_names[nsockets+socket])); - ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); - power_mem = json_real_value(json_object_get(power_obj, metric_names[(2*nsockets)+socket])); - ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); - */ - ldms_metric_modify(set, lh_idx); json_decref(power_obj); @@ -286,16 +247,6 @@ static int sample(struct ldmsd_sampler *self) static void term(struct ldmsd_plugin *self) { - int metric; - - // DELETE - // if (metric_names) { - // for (metric = 0; metric < 3 * nsockets; metric++) { - // free(metric_names[metric]); - // } - // free(metric_names); - // } - if (result_string) { free(result_string); } From e09036724e19859204fb8ab06050297dabaf3023 Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Wed, 18 Sep 2024 15:52:53 -0700 Subject: [PATCH 7/7] Formatting. --- .../variorum_sampler/variorum_sampler.c | 165 ++++++++++-------- 1 file changed, 94 insertions(+), 71 deletions(-) diff --git a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c index 7606e4fdc..c882ac386 100644 --- a/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c +++ b/ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c @@ -31,8 +31,8 @@ static int i_cpu; static int i_gpu; static int i_mem; static int lh_idx; -static ldms_mval_t* rec_idxs; -static char* result_string; +static ldms_mval_t *rec_idxs; +static char *result_string; static int create_metric_set(base_data_t base) { @@ -43,15 +43,17 @@ static int create_metric_set(base_data_t base) ldms_mval_t rec_inst; // allocate space for record pointers - if (!rec_idxs) { + if (!rec_idxs) + { rec_idxs = malloc(nsockets * sizeof(ldms_mval_t)); } schema = base_schema_new(base); - if (!schema) { + if (!schema) + { ovis_log(mylog, OVIS_LERROR, - "%s: The schema '%s' could not be created, errno=%d.\n", - __FILE__, base->schema_name, errno); + "%s: The schema '%s' could not be created, errno=%d.\n", + __FILE__, base->schema_name, errno); rc = errno; goto err; } @@ -74,31 +76,32 @@ static int create_metric_set(base_data_t base) int lh_idx = ldms_schema_metric_list_add(schema, "power", NULL, heap_sz); set = base_set_new(base); - if (!set) { + if (!set) + { rc = errno; goto err; } ldms_mval_t lh = ldms_metric_get(set, lh_idx); - for(socket = 0; socket < nsockets; socket++) { + for (socket = 0; socket < nsockets; socket++) + { // create a new record rec_inst = ldms_record_alloc(set, rec_def_idx); rec_idxs[socket] = rec_inst; // set the socket number ldms_record_set_u64(rec_inst, i_sock, socket); // put the record into the list - ldms_list_append_record(set, lh, rec_inst); + ldms_list_append_record(set, lh, rec_inst); } // allocate space for sampling JSON data depending on number of sockets - result_string = (char *) malloc((nsockets * 150 + 500) * sizeof(char)); + result_string = (char *)malloc((nsockets * 150 + 500) * sizeof(char)); return 0; - err: +err: return rc; - } static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct attr_value_list *avl) @@ -107,7 +110,8 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct int rc; int depth; - if (set) { + if (set) + { ovis_log(mylog, OVIS_LERROR, "Set already created.\n"); return EINVAL; } @@ -117,25 +121,27 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct // 9/12/2024: TP Note: We need to know number of GPUs per socket here // so we can create a metric set that also gives per GPU power. - // We don't have a good solution for this. We could call the JSON API + // We don't have a good solution for this. We could call the JSON API // once and parse that value out, but the overhead for that can be high. // To be addressed after the first pass works. // prepare the base for metric collection base = base_config(avl, SAMP, SAMP, mylog); - if (!base) { + if (!base) + { rc = errno; goto err; } rc = create_metric_set(base); - if (rc) { + if (rc) + { ovis_log(mylog, OVIS_LERROR, "failed to create a metric set.\n"); goto err; } return 0; - err: +err: base_del(base); return rc; } @@ -143,10 +149,11 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct static int sample(struct ldmsd_sampler *self) { json_t *power_obj = NULL; - json_t *node_obj = NULL; + json_t *node_obj = NULL; int ret, socket; - if (!set) { + if (!set) + { ovis_log(mylog, OVIS_LERROR, "plugin not initialized\n"); return EINVAL; } @@ -155,69 +162,80 @@ static int sample(struct ldmsd_sampler *self) // get variorum data ret = variorum_get_power_json(&result_string); - if (ret != 0) { + if (ret != 0) + { ovis_log(mylog, OVIS_LERROR, "unable to obtain JSON object data\n"); return EINVAL; } power_obj = json_loads(result_string, JSON_DECODE_ANY, NULL); - void *iter = json_object_iter(power_obj); - while (iter) { - node_obj = json_object_iter_value(iter); - if (node_obj == NULL) { - printf("JSON object not found"); - exit(0); - } - /* The following should return NULL after the first call per our object. */ - iter = json_object_iter_next(power_obj, iter); - } - - double power_node = -1.0; + void *iter = json_object_iter(power_obj); + while (iter) + { + node_obj = json_object_iter_value(iter); + if (node_obj == NULL) + { + printf("JSON object not found"); + exit(0); + } + /* The following should return NULL after the first call per our object. */ + iter = json_object_iter_next(power_obj, iter); + } + + double power_node = -1.0; double power_cpu = -1.0; - double power_gpu = -1.0; + double power_gpu = -1.0; double power_mem = -1.0; int num_gpus_per_socket = -1; char socketID[20]; - // If we're on a GPU-only build, we don't have power_node_watts. - if (json_object_get(node_obj, "power_node_watts") != NULL) { + // If we're on a GPU-only build, we don't have power_node_watts. + if (json_object_get(node_obj, "power_node_watts") != NULL) + { power_node = json_real_value(json_object_get(node_obj, "power_node_watts")); } // If we're on a CPU-only build, we don't have num_gpus_per_socket - if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) { + if (json_object_get(node_obj, "num_gpus_per_socket") != NULL) + { num_gpus_per_socket = json_integer_value(json_object_get(node_obj, - "num_gpus_per_socket")); - } + "num_gpus_per_socket")); + } // update each record - for(socket = 0; socket < nsockets; socket++) { + for (socket = 0; socket < nsockets; socket++) + { // Node power is same on both sockets. ldms_record_set_double(rec_idxs[socket], i_node, power_node); // Obtain Socket Object snprintf(socketID, 20, "socket_%d", socket); json_t *socket_obj = json_object_get(node_obj, socketID); - if (socket_obj == NULL) { + if (socket_obj == NULL) + { printf("Socket object not found!\n"); exit(0); } // If we're on a GPU-only build, we don't have power_cpu_watts - if (json_object_get(socket_obj, "power_cpu_watts") != NULL) { + if (json_object_get(socket_obj, "power_cpu_watts") != NULL) + { power_cpu = json_real_value(json_object_get(socket_obj, "power_cpu_watts")); } - // If we're on a GPU-only build on an unsupported platform, + // If we're on a GPU-only build on an unsupported platform, // we don't have power_mem_watts. - if (json_object_get(socket_obj, "power_mem_watts") != NULL) { + if (json_object_get(socket_obj, "power_mem_watts") != NULL) + { power_mem = json_real_value(json_object_get(socket_obj, "power_mem_watts")); - } - + } + // If we have GPUs, obtatin the GPU object - if (num_gpus_per_socket > 0) { - json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts"); - if (gpu_obj == NULL) { + if (num_gpus_per_socket > 0) + { + json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts"); + if (gpu_obj == NULL) + { printf("GPU object not found! \n"); exit(0); } @@ -225,38 +243,42 @@ static int sample(struct ldmsd_sampler *self) json_t *value; power_gpu = 0.0; - json_object_foreach(gpu_obj, key, value) { + json_object_foreach(gpu_obj, key, value) + { // We will add power of multiple GPUs as a first cut. See note on line 135. - power_gpu += json_real_value(value); - } + power_gpu += json_real_value(value); + } } // Set the LDMS records for the socket ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu); ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu); ldms_record_set_double(rec_idxs[socket], i_mem, power_mem); - } + } ldms_metric_modify(set, lh_idx); json_decref(power_obj); base_sample_end(base); return 0; - } static void term(struct ldmsd_plugin *self) { - if (result_string) { + if (result_string) + { free(result_string); } - if (rec_idxs) { + if (rec_idxs) + { free(rec_idxs); } - if (base) { + if (base) + { base_del(base); } - if (set) { + if (set) + { ldms_set_delete(set); } set = NULL; @@ -264,25 +286,26 @@ static void term(struct ldmsd_plugin *self) static const char *usage(struct ldmsd_plugin *self) { - return "config name=" SAMP " " BASE_CONFIG_USAGE; + return "config name=" SAMP " " BASE_CONFIG_USAGE; } static struct ldmsd_sampler variorum_sampler_plugin = { - .base = { - .name = SAMP, - .type = LDMSD_PLUGIN_SAMPLER, - .term = term, - .config = config, - .usage= usage, - }, - .sample = sample, + .base = { + .name = SAMP, + .type = LDMSD_PLUGIN_SAMPLER, + .term = term, + .config = config, + .usage = usage, + }, + .sample = sample, }; struct ldmsd_plugin *get_plugin() { - mylog = ovis_log_register("sampler."SAMP, "Messages for the " SAMP " plugin"); - if (!mylog) { - ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's log subsystem"); - } - return &variorum_sampler_plugin.base; + mylog = ovis_log_register("sampler." SAMP, "Messages for the " SAMP " plugin"); + if (!mylog) + { + ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's log subsystem"); + } + return &variorum_sampler_plugin.base; }