From 0ae41a2a6e98a7957ff626d5414bccb68c9fee06 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 1 Dec 2022 13:16:56 +0000 Subject: [PATCH] CP-33044 replace gpumon shutdown with NVML detach/attach We want to avoid stopping gpumon while starting VMs with vGPUs because it limits scalability and a daemon should run continuously. Use the recently added capability to detach/attach the NVML library instead. Introduce feature flag nvidia-gpumon-detach; when true, we use the new attach/detach feature to keep gpumon running over the start of a VM. This does not yet work with all GPUs but it would help cross-team development to make this code available. Signed-off-by: Christian Lindig --- ocaml/xapi/xapi_globs.ml | 9 +++++++++ ocaml/xapi/xapi_gpumon.ml | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 62e373590e9..3bfd8af9a1d 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -1159,6 +1159,10 @@ type nvidia_t4_sriov = Nvidia_T4_SRIOV | Nvidia_LEGACY | Nvidia_DEFAULT let nvidia_t4_sriov = ref Nvidia_DEFAULT +(** CP-41126. true - we are detaching the NVML library in gpumon; false - + we stop gpumon. *) +let nvidia_gpumon_detach = ref false + let failed_login_alert_freq = ref 3600 let other_options = @@ -1470,6 +1474,11 @@ let other_options = , (fun () -> string_of_int !max_observer_file_size) , "The maximum size of log files for saving spans" ) + ; ( "nvidia-gpumon-detach" + , Arg.Set nvidia_gpumon_detach + , (fun () -> string_of_bool !nvidia_gpumon_detach) + , "On VM start, detach the NVML library rather than stopping gpumon" + ) ] (* The options can be set with the variable xapiflags in /etc/sysconfig/xapi. 
diff --git a/ocaml/xapi/xapi_gpumon.ml b/ocaml/xapi/xapi_gpumon.ml index 384ab68f822..fa71fce96b5 100644 --- a/ocaml/xapi/xapi_gpumon.ml +++ b/ocaml/xapi/xapi_gpumon.ml @@ -18,6 +18,8 @@ open D let gpumon = "xcp-rrdd-gpumon" +let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute + module Gpumon = Daemon_manager.Make (struct let check = Daemon_manager.Function @@ -40,7 +42,36 @@ module Gpumon = Daemon_manager.Make (struct Xapi_systemctl.stop ~wait_until_success:false gpumon end) -let with_gpumon_stopped = Gpumon.with_daemon_stopped +let gpumon_m = Mutex.create () + +let with_gpumon_stopped ?(timeout = 30.0) f = + match !Xapi_globs.nvidia_gpumon_detach with + | false -> + Gpumon.with_daemon_stopped ~timeout f + | true -> ( + debug "%s: about to acquire lock" __FUNCTION__ ; + with_lock gpumon_m @@ fun () -> + let module GPU = Gpumon_client.Client.Nvidia in + match GPU.nvml_is_attached __FUNCTION__ with + | false -> + (* nothing to do, just execute f *) + debug "%s: NVML is detached; nothing to do" __FUNCTION__ ; + f () + | true -> + (* detach, execute f, re-attach in any case. Be aware + that both xenopsd, xapi call /usr/lib/nvidia/sriov-manage, + which may stop gpumon *) + Fun.protect + (fun () -> + debug "%s: about to detach NVML" __FUNCTION__ ; + GPU.nvml_detach __FUNCTION__ ; + f () + ) + ~finally:(fun () -> + debug "%s: about to attach NVML" __FUNCTION__ ; + GPU.nvml_attach __FUNCTION__ + ) + ) module Nvidia = struct let key = "nvidia"