Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update LDMS Variorum sampler to support Variorum v0.8 #1447

Draft
wants to merge 9 commits into
base: OVIS-4
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ GPU power consumption in Watts (aggregated across all GPUs on the socket, and
reported as -1 on unsupported platforms); and memory power consumption in Watts.

.PP
The variorum sampler depends on Variorum 0.6.0 or higher and Jansson. The sampler cannot be built without these libraries. If either library is installed in a non-standard location, paths to the respective install directories should be provided to Autoconf using
The variorum sampler depends on Variorum 0.8.0 or higher and Jansson. The sampler cannot be built without these libraries. If either library is installed in a non-standard location, paths to the respective install directories should be provided to Autoconf using
the --with-libjansson-prefix and/or --with-libvariorum-prefix flags.

.SH CONFIGURATION ATTRIBUTE SYNTAX
Expand Down
4 changes: 2 additions & 2 deletions ldms/src/contrib/sampler/variorum_sampler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ architecture and implementation.
Build Requirements
------------------

The Variorum LDMS sampler currently requires version 0.6.0 or higher
The Variorum LDMS sampler currently requires version 0.8.0 or higher
of the Variorum library (``libvariorum.so``). This library must be built
from source. The sampler also requires jansson, which is a Variorum
dependency. If both libraries are installed in standard locations,
Expand Down Expand Up @@ -52,7 +52,7 @@ Using the Variorum LDMS Sampler

The sampler, when configured, automatically detects the number of sockets
on the host machine and then provides, for each socket, an LDMS record
containing power data. The sampler calls ``variorum_get_node_power_json``
containing power data. The sampler calls ``variorum_get_power_json``
internally, for which documentation can be found here:
[Variorum JSON-Support Functions](https://variorum.readthedocs.io/en/latest/api/json_support_functions.html)

Expand Down
203 changes: 134 additions & 69 deletions ldms/src/contrib/sampler/variorum_sampler/variorum_sampler.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,14 @@ static ldms_set_t set = NULL;
static ovis_log_t mylog;
static base_data_t base;
static int nsockets;
static const char *SOCKET_METRICS[] = {"power_cpu_watts_socket_", "power_gpu_watts_socket_", "power_mem_watts_socket_"};
static char** metric_names = NULL;
static int i_node;
static int i_sock;
static int i_cpu;
static int i_gpu;
static int i_mem;
static int lh_idx;
static ldms_mval_t* rec_idxs;
static char* result_string;
static ldms_mval_t *rec_idxs;
static char *result_string;

static int create_metric_set(base_data_t base)
{
Expand All @@ -44,24 +42,18 @@ static int create_metric_set(base_data_t base)
ldms_schema_t schema;
ldms_mval_t rec_inst;

// allocate space for metric names
if (!metric_names) {
metric_names = malloc(3 * nsockets * sizeof(char*));
}
for (metric = 0; metric < (3 * nsockets); metric++) {
metric_names[metric] = malloc(39);
}

// allocate space for record pointers
if (!rec_idxs) {
if (!rec_idxs)
{
rec_idxs = malloc(nsockets * sizeof(ldms_mval_t));
}

schema = base_schema_new(base);
if (!schema) {
if (!schema)
{
ovis_log(mylog, OVIS_LERROR,
"%s: The schema '%s' could not be created, errno=%d.\n",
__FILE__, base->schema_name, errno);
"%s: The schema '%s' could not be created, errno=%d.\n",
__FILE__, base->schema_name, errno);
rc = errno;
goto err;
}
Expand All @@ -84,38 +76,32 @@ static int create_metric_set(base_data_t base)
int lh_idx = ldms_schema_metric_list_add(schema, "power", NULL, heap_sz);

set = base_set_new(base);
if (!set) {
if (!set)
{
rc = errno;
goto err;
}

ldms_mval_t lh = ldms_metric_get(set, lh_idx);

for(socket = 0; socket < nsockets; socket++) {
for (socket = 0; socket < nsockets; socket++)
{
// create a new record
rec_inst = ldms_record_alloc(set, rec_def_idx);
rec_idxs[socket] = rec_inst;
// set the socket number
ldms_record_set_u64(rec_inst, i_sock, socket);
// put the record into the list
ldms_list_append_record(set, lh, rec_inst);
// create metric name list (for querying json object later on)
for(metric = 0; metric < 3; metric++) {
strcpy(metric_name,SOCKET_METRICS[metric]);
sprintf(socket_num,"%d",socket);
strcat(metric_name,socket_num);
strcpy(metric_names[(metric*nsockets)+socket], metric_name);
}
}

// allocate space for sampling JSON data depending on number of sockets
result_string = (char *) malloc((nsockets * 150 + 500) * sizeof(char));
result_string = (char *)malloc((nsockets * 150 + 500) * sizeof(char));

return 0;

err:
err:
return rc;

}

static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct attr_value_list *avl)
Expand All @@ -124,123 +110,202 @@ static int config(struct ldmsd_plugin *self, struct attr_value_list *kwl, struct
int rc;
int depth;

if (set) {
if (set)
{
ovis_log(mylog, OVIS_LERROR, "Set already created.\n");
return EINVAL;
}

// determine number of sockets
nsockets = variorum_get_num_sockets();

// 9/12/2024: TP Note: We need to know the number of GPUs per socket here
// so we can create a metric set that also reports per-GPU power.
// We don't have a good solution for this yet. We could call the JSON API
// once and parse that value out, but the overhead of doing so can be high.
// To be addressed after the first pass works.

// prepare the base for metric collection
base = base_config(avl, SAMP, SAMP, mylog);
if (!base) {
if (!base)
{
rc = errno;
goto err;
}

rc = create_metric_set(base);
if (rc) {
if (rc)
{
ovis_log(mylog, OVIS_LERROR, "failed to create a metric set.\n");
goto err;
}

return 0;
err:
err:
base_del(base);
return rc;
}

static int sample(struct ldmsd_sampler *self)
{
json_t *power_obj = NULL;
json_t *node_obj = NULL;
int ret, socket;

if (!set) {
if (!set)
{
ovis_log(mylog, OVIS_LERROR, "plugin not initialized\n");
return EINVAL;
}

base_sample_begin(base);

// get variorum data
ret = variorum_get_node_power_json(&result_string);
if (ret != 0) {
ret = variorum_get_power_json(&result_string);
if (ret != 0)
{
ovis_log(mylog, OVIS_LERROR, "unable to obtain JSON object data\n");
return EINVAL;
}

power_obj = json_loads(result_string, JSON_DECODE_ANY, NULL);
void *iter = json_object_iter(power_obj);
while (iter)
{
node_obj = json_object_iter_value(iter);
if (node_obj == NULL)
{
printf("JSON object not found");
exit(0);
}
/* The following should return NULL after the first call per our object. */
iter = json_object_iter_next(power_obj, iter);
}

double power_node = -1.0;
double power_cpu = -1.0;
double power_gpu = -1.0;
double power_mem = -1.0;
int num_gpus_per_socket = -1;
char socketID[20];

// If we're on a GPU-only build, we don't have power_node_watts.
if (json_object_get(node_obj, "power_node_watts") != NULL)
{
power_node = json_real_value(json_object_get(node_obj, "power_node_watts"));
}

double power_node = json_real_value(json_object_get(power_obj, "power_node_watts"));
double power_cpu, power_gpu, power_mem;
// If we're on a CPU-only build, we don't have num_gpus_per_socket
if (json_object_get(node_obj, "num_gpus_per_socket") != NULL)
{
num_gpus_per_socket = json_integer_value(json_object_get(node_obj,
"num_gpus_per_socket"));
}

// update each record
for(socket = 0; socket < nsockets; socket++) {
for (socket = 0; socket < nsockets; socket++)
{
// Node power is the same for every socket.
ldms_record_set_double(rec_idxs[socket], i_node, power_node);
power_cpu = json_real_value(json_object_get(power_obj, metric_names[socket]));

// Obtain Socket Object
snprintf(socketID, 20, "socket_%d", socket);
json_t *socket_obj = json_object_get(node_obj, socketID);
if (socket_obj == NULL)
{
printf("Socket object not found!\n");
exit(0);
}

// If we're on a GPU-only build, we don't have power_cpu_watts
if (json_object_get(socket_obj, "power_cpu_watts") != NULL)
{
power_cpu = json_real_value(json_object_get(socket_obj, "power_cpu_watts"));
}

// If we're on a GPU-only build on an unsupported platform,
// we don't have power_mem_watts.
if (json_object_get(socket_obj, "power_mem_watts") != NULL)
{
power_mem = json_real_value(json_object_get(socket_obj, "power_mem_watts"));
}

// If we have GPUs, obtain the GPU object
if (num_gpus_per_socket > 0)
{
json_t *gpu_obj = json_object_get(socket_obj, "power_gpu_watts");
if (gpu_obj == NULL)
{
printf("GPU object not found! \n");
exit(0);
}
const char *key;
json_t *value;
power_gpu = 0.0;

json_object_foreach(gpu_obj, key, value)
{
// As a first cut, sum the power of all GPUs on this socket. See the TP note in config() about per-GPU power.
power_gpu += json_real_value(value);
}
}

// Set the LDMS records for the socket
ldms_record_set_double(rec_idxs[socket], i_cpu, power_cpu);
power_gpu = json_real_value(json_object_get(power_obj, metric_names[nsockets+socket]));
ldms_record_set_double(rec_idxs[socket], i_gpu, power_gpu);
power_mem = json_real_value(json_object_get(power_obj, metric_names[(2*nsockets)+socket]));
ldms_record_set_double(rec_idxs[socket], i_mem, power_mem);
}

ldms_metric_modify(set, lh_idx);

json_decref(power_obj);
base_sample_end(base);

return 0;

}

static void term(struct ldmsd_plugin *self)
{
int metric;

if (metric_names) {
for (metric = 0; metric < 3 * nsockets; metric++) {
free(metric_names[metric]);
}
free(metric_names);
}
if (result_string) {
if (result_string)
{
free(result_string);
}
if (rec_idxs) {
if (rec_idxs)
{
free(rec_idxs);
}
if (base) {
if (base)
{
base_del(base);
}
if (set) {
if (set)
{
ldms_set_delete(set);
}
set = NULL;
}

static const char *usage(struct ldmsd_plugin *self)
{
return "config name=" SAMP " " BASE_CONFIG_USAGE;
return "config name=" SAMP " " BASE_CONFIG_USAGE;
}

static struct ldmsd_sampler variorum_sampler_plugin = {
.base = {
.name = SAMP,
.type = LDMSD_PLUGIN_SAMPLER,
.term = term,
.config = config,
.usage= usage,
},
.sample = sample,
.base = {
.name = SAMP,
.type = LDMSD_PLUGIN_SAMPLER,
.term = term,
.config = config,
.usage = usage,
},
.sample = sample,
};

struct ldmsd_plugin *get_plugin()
{
mylog = ovis_log_register("sampler."SAMP, "Messages for the " SAMP " plugin");
if (!mylog) {
ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's log subsystem");
}
return &variorum_sampler_plugin.base;
mylog = ovis_log_register("sampler." SAMP, "Messages for the " SAMP " plugin");
if (!mylog)
{
ovis_log(NULL, OVIS_LWARN, "Failed to create the " SAMP " plugin's log subsystem");
}
return &variorum_sampler_plugin.base;
}
Loading