Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update utilization API to build on cpu-only systems, rename API to be generic #525

Merged
merged 11 commits into from
Mar 11, 2024
7 changes: 3 additions & 4 deletions src/docs/sphinx/VariorumAPI.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,10 @@ The API to obtain node utilization has the following format. It takes a string
(``char**``) by reference as input, and populates this string with a JSON object
with total CPU, system CPU, user CPU, total memory, and GPU (when available)
utilizations. It reports the utilization of each available GPU. GPU utilization
is accomplished using the ``int variorum_get_gpu_utilization_json(char
**get_gpu_util_obj_str)`` function. The total memory utilization is computed
is obtained using the NVML and RSMI APIs. The total memory utilization is computed
using ``/proc/meminfo``, and CPU utilization is computed using ``/proc/stat``.

The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function
The ``variorum_get_utilization_json(char **get_util_obj_str)`` function
returns a nested JSON object serialized as a string. An example is provided below:

.. code::
Expand All @@ -150,7 +149,7 @@ returns a string type nested JSON object. An example is provided below:

The ``*`` here refers to socket ID, and the ``#`` refers to GPU ID.

The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function
The ``variorum_get_utilization_json(char **get_util_obj_str)`` function
returns a nested JSON object serialized as a string. An example is provided below:

.. code::
Expand Down
3 changes: 1 addition & 2 deletions src/docs/sphinx/api/json_support_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,5 @@ Defined in ``variorum/variorum.h``.

.. doxygenfunction:: variorum_get_frequency_json

.. doxygenfunction:: variorum_get_node_utilization_json
.. doxygenfunction:: variorum_get_utilization_json

.. doxygenfunction:: variorum_get_gpu_utilization_json
3 changes: 1 addition & 2 deletions src/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@ set(BASIC_EXAMPLES
variorum-disable-turbo-example
variorum-enable-turbo-example
variorum-get-frequency-json-example
variorum-get-gpu-utilization-json-example
variorum-get-node-power-domain-info-json-example
variorum-get-power-json-example
variorum-get-thermals-json-example
variorum-get-node-utilization-json-example
variorum-get-utilization-json-example
variorum-get-topology-info-example
variorum-integration-using-json-example
variorum-monitoring-to-file-example
Expand Down
88 changes: 0 additions & 88 deletions src/examples/variorum-get-node-utilization-json-example.c

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ int main(int argc, char **argv)
return -1;
}
}
ret = variorum_get_gpu_utilization_json(&s);
ret = variorum_get_utilization_json(&s);
if (ret != 0)
{
printf("First run: JSON get node utilization failed!\n");
Expand All @@ -69,7 +69,7 @@ int main(int argc, char **argv)
x += do_work(i);
}
printf("Final result: %f\n", x);
ret = variorum_get_gpu_utilization_json(&s);
ret = variorum_get_utilization_json(&s);
if (ret != 0)
{
printf("Second run: JSON get node utilization failed!\n");
Expand Down
3 changes: 1 addition & 2 deletions src/variorum/AMD_GPU/amd_gpu_power_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,6 @@ void get_gpu_utilization_data_json(int chipid, int total_sockets,
rsmi_status_t ret;
uint32_t num_devices;
int gpus_per_socket;
int d = 0;
char socket_id[12];
char hostname[1024];
char device_id[12];
Expand Down Expand Up @@ -869,7 +868,7 @@ void get_gpu_utilization_data_json(int chipid, int total_sockets,
getenv("HOSTNAME"), __FILE__, __FUNCTION__,
__LINE__);
}
snprintf(device_id, 12, "GPU%d_util%%", d);
snprintf(device_id, 12, "GPU%d_util%%", i);
json_object_set_new(socket_obj, device_id, json_integer(utilpercent));
}

Expand Down
2 changes: 1 addition & 1 deletion src/variorum/AMD_GPU/config_amd_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ int set_amd_gpu_func_ptrs(int idx)
amd_gpu_instinct_get_gpu_utilization;
g_platform[idx].variorum_get_thermals_json = amd_gpu_instinct_get_thermals_json;
g_platform[idx].variorum_get_frequency_json = amd_gpu_instinct_get_clocks_json;
g_platform[idx].variorum_get_gpu_utilization_json =
g_platform[idx].variorum_get_utilization_json =
amd_gpu_instinct_get_gpu_utilization_json;
/* Initialize control interfaces */
g_platform[idx].variorum_cap_each_gpu_power_limit =
Expand Down
2 changes: 1 addition & 1 deletion src/variorum/Nvidia_GPU/config_nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int set_nvidia_func_ptrs(int idx)
g_platform[idx].variorum_print_gpu_utilization = volta_get_gpu_utilization;
g_platform[idx].variorum_get_thermals_json = volta_get_thermals_json;
g_platform[idx].variorum_get_frequency_json = volta_get_clocks_json;
g_platform[idx].variorum_get_gpu_utilization_json =
g_platform[idx].variorum_get_utilization_json =
volta_get_gpu_utilization_json;
/* Initialize control interfaces */
g_platform[idx].variorum_cap_each_gpu_power_limit =
Expand Down
2 changes: 1 addition & 1 deletion src/variorum/config_architecture.c
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ void variorum_init_func_ptrs()
g_platform[i].variorum_print_turbo = NULL;
g_platform[i].variorum_poll_power = NULL;
g_platform[i].variorum_print_gpu_utilization = NULL;
g_platform[i].variorum_get_gpu_utilization_json = NULL;
g_platform[i].variorum_get_utilization_json = NULL;
g_platform[i].variorum_monitoring = NULL;
g_platform[i].variorum_get_power_json = NULL;
g_platform[i].variorum_get_node_power_domain_info_json = NULL;
Expand Down
4 changes: 2 additions & 2 deletions src/variorum/config_architecture.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,10 @@ struct platform
/// @return Error code.
int (*variorum_print_gpu_utilization)(int long_ver);

/// @brief Function pointer to get JSON object for GPU utilization
/// @brief Function pointer to get JSON object for utilization
///
/// @return Error code.
int (*variorum_get_gpu_utilization_json)(char **get_gpu_util_obj_str);
int (*variorum_get_utilization_json)(char **get_util_obj_str);

/// @brief Function pointer to get JSON object for node power data.
///
Expand Down
97 changes: 39 additions & 58 deletions src/variorum/variorum.c
Original file line number Diff line number Diff line change
Expand Up @@ -1090,7 +1090,7 @@ int variorum_get_power_json(char **get_power_obj_str)
return err;
}

int variorum_get_node_utilization_json(char **get_util_obj_str)
int variorum_get_utilization_json(char **get_util_obj_str)
{
int err = 0;
err = variorum_enter(__FILE__, __FUNCTION__, __LINE__);
Expand All @@ -1099,20 +1099,12 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
return -1;
}

err = variorum_exit(__FILE__, __FUNCTION__, __LINE__);
if (err)
{
return -1;
}

char hostname[1024];
struct timeval tv;
uint64_t ts;
char *gpu_util_str = NULL;
gethostname(hostname, 1024);
gettimeofday(&tv, NULL);
ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec;
int ret;
char str[100];
const char d[2] = " ";
char *token, *s, *p;
Expand All @@ -1136,33 +1128,63 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
uint64_t mem_free = 0;
uint64_t sys_time = 0;
int strcp;
int idx = -1;

json_t *get_util_obj = NULL;
json_t *get_cpu_util_obj = NULL;
json_t *get_timestamp_obj = NULL;
json_t *cpu_util_obj = NULL;

// Look for a GPU build and get an ID.
for (idx = 0; idx < P_NUM_PLATFORMS; idx++)
{
#ifdef VARIORUM_WITH_INTEL_GPU
idx = P_INTEL_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_NVIDIA_GPU
idx = P_NVIDIA_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_AMD_GPU
idx = P_AMD_GPU_IDX;
break;
#endif
}

// If we have a GPU build, obtain the GPU object first.
#if defined(VARIORUM_WITH_NVIDIA_GPU) || defined(VARIORUM_WITH_AMD_GPU) || defined(VARIORUM_WITH_INTEL_GPU)
int ret;
char *gpu_util_str = NULL;
// get gpu utilization
ret = variorum_get_gpu_utilization_json(&gpu_util_str);
ret = g_platform[idx].variorum_get_utilization_json(&gpu_util_str);
if (ret != 0)
{
printf("JSON get gpu utilization failed. Exiting.\n");
free(gpu_util_str);
return -1;
}

/* Load the string as a JSON object using Jansson */
json_t *get_util_obj = json_loads(gpu_util_str, JSON_DECODE_ANY, NULL);
/* Load the existing GPU string as a JSON object using Jansson */
get_util_obj = json_loads(gpu_util_str, JSON_DECODE_ANY, NULL);
get_cpu_util_obj = json_object_get(get_util_obj, hostname);
get_timestamp_obj = json_object_get(get_cpu_util_obj, "timestamp");
cpu_util_obj = json_object_get(get_cpu_util_obj, "CPU");
#endif

json_t *get_cpu_util_obj = json_object_get(get_util_obj, hostname);
if (get_cpu_util_obj == NULL)
//CPU-only build will have this object as NULL.
if (get_util_obj == NULL)
{
get_util_obj = json_object();
get_cpu_util_obj = json_object();
json_object_set_new(get_util_obj, hostname, get_cpu_util_obj);
}

json_t *get_timestamp_obj = json_object_get(get_util_obj, "timestamp");
if (get_timestamp_obj == NULL)
{
json_object_set_new(get_cpu_util_obj, "timestamp", json_integer(ts));
}

json_t *cpu_util_obj = json_object_get(get_cpu_util_obj, "CPU");
if (cpu_util_obj == NULL)
{
cpu_util_obj = json_object();
Expand Down Expand Up @@ -1237,6 +1259,7 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
last_sum = sum;
last_sys_time = sys_time;
last_idle = sum_idle;

json_object_set_new(cpu_util_obj, "total_util%", json_real(cpu_util));
json_object_set_new(cpu_util_obj, "user_util%", json_real(user_util));
json_object_set_new(cpu_util_obj, "system_util%", json_real(sys_util));
Expand Down Expand Up @@ -1293,48 +1316,6 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
*get_util_obj_str = json_dumps(get_util_obj, JSON_INDENT(4));
json_decref(get_util_obj);
state = 1;
return 0;
}

int variorum_get_gpu_utilization_json(char **get_gpu_util_obj_str)
{
int err = 0;
int i;
err = variorum_enter(__FILE__, __FUNCTION__, __LINE__);
if (err)
{
return -1;
}

for (i = 0; i < P_NUM_PLATFORMS; i++)
{
#ifdef VARIORUM_WITH_INTEL_GPU
i = P_INTEL_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_NVIDIA_GPU
i = P_NVIDIA_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_AMD_GPU
i = P_AMD_GPU_IDX;
break;
#endif
}

if (g_platform[i].variorum_get_gpu_utilization_json == NULL)
{
variorum_error_handler("Feature not yet implemented or is not supported",
VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED,
getenv("HOSTNAME"), __FILE__,
__FUNCTION__, __LINE__);
return -1;
}
err = g_platform[i].variorum_get_gpu_utilization_json(get_gpu_util_obj_str);
if (err)
{
return -1;
}

err = variorum_exit(__FILE__, __FUNCTION__, __LINE__);
if (err)
Expand Down
Loading
Loading