diff --git a/doc/content/toolstack/features/NUMA/hwloc.svg b/doc/content/toolstack/features/NUMA/hwloc.svg new file mode 100644 index 00000000000..aacb295b93e --- /dev/null +++ b/doc/content/toolstack/features/NUMA/hwloc.svg @@ -0,0 +1,232 @@ + + + + Machine (125GB total) + + Package L#0 + + L3 (19MB) + + Group0 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#0 + + PU L#0 + P#0 + + PU L#1 + P#24 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#1 + + PU L#2 + P#4 + + PU L#3 + P#28 + + + + 6x total + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#5 + + PU L#10 + P#20 + + PU L#11 + P#44 + + NUMANode L#0 P#0 (31GB) + + Group0 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#6 + + PU L#12 + P#2 + + PU L#13 + P#26 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#7 + + PU L#14 + P#6 + + PU L#15 + P#30 + + + + 6x total + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#11 + + PU L#22 + P#22 + + PU L#23 + P#46 + + NUMANode L#1 P#2 (31GB) + + Package L#1 + + L3 (19MB) + + Group0 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#12 + + PU L#24 + P#1 + + PU L#25 + P#25 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#13 + + PU L#26 + P#5 + + PU L#27 + P#29 + + + + 6x total + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#17 + + PU L#34 + P#21 + + PU L#35 + P#45 + + NUMANode L#2 P#1 (31GB) + + Group0 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#18 + + PU L#36 + P#3 + + PU L#37 + P#27 + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#19 + + PU L#38 + P#7 + + PU L#39 + P#31 + + + + 6x total + + L2 (1024KB) + + L1d (32KB) + + L1i (32KB) + + Core L#23 + + PU L#46 + P#23 + + PU L#47 + P#47 + + NUMANode L#3 P#3 (31GB) + + MemoryModule + + MemoryModule + + MemoryModule + + MemoryModule + + MemoryModule + + MemoryModule + + MemoryModule + + MemoryModule + + Host: perfuk-18-06d + Date: Tue 14 Nov 2023 15:26:17 UTC + diff --git a/doc/content/toolstack/features/NUMA/index.md b/doc/content/toolstack/features/NUMA/index.md new file mode 100644 index 00000000000..128dbb759f0 --- /dev/null +++ b/doc/content/toolstack/features/NUMA/index.md @@ -0,0 +1,189 @@ ++++ +title = "NUMA" ++++ + +## NUMA in a nutshell + +Systems that contain more than one CPU socket are typically built on a Non-Uniform Memory Architecture (NUMA) [^xen_numa][^kernel_numa]. +In a NUMA system each node has fast, lower latency access to local memory. + +![hwloc](hwloc.svg) + +In the diagram [^lstopo] above we have 4 NUMA nodes: + * 2 of those are due to 2 separate physical packages (sockets) + * a further 2 is due to Sub-NUMA-Clustering (aka Nodes Per Socket for AMD) where the L3 cache is split + +The L3 cache is shared among multiple cores, but cores `0-5` have lower latency access to one part of it, than cores `6-11`, and this is also reflected by splitting memory addresses into 4 31GiB ranges in total. + +In the diagram the closer the memory is to the core, the lower the access latency: + * per-core caches: L1, L2 + * per-package shared cache: L3 (local part), L3 (remote part) + * local NUMA node (to a group of cores, e.g. 
`L#0 P#0`), node 0
+ * remote NUMA node in the same package (`L#1 P#2`), node 1
+ * remote NUMA nodes in the other package (`L#2 P#1` and `L#3 P#3`), nodes 2 and 3
+
+### The NUMA distance matrix
+
+Accessing a remote NUMA node in the other package has to go through a shared interconnect, which has lower bandwidth than the direct connections and also becomes a bottleneck if cores in both packages have to access remote memory: the bandwidth available to a single core is then effectively at most half.
+
+This is reflected in the NUMA distance/latency matrix.
+The units are arbitrary, and by convention access latency to the local NUMA node is given distance '10'.
+
+Relative latency matrix by logical indexes:
+
+| index | 0 | 2 | 1 | 3 |
+| ----- | --- | --- | --- | ---|
+| 0 | 10 | 21| 11| 21|
+| 2 | 21 | 10| 21| 11|
+| 1 | 11 | 21| 10| 21|
+| 3 | 21 | 11| 21| 10|
+
+This follows the latencies described previously:
+ * fast access to local NUMA node memory (by definition), node 0, cost 10
+ * slightly slower access latency to the other NUMA node in the same package, node 1, cost 11
+ * twice as slow access latency to remote NUMA memory in the other physical package (socket): nodes 2 and 3, cost 21
+
+There is also I/O NUMA, where a similar cost is associated with the NUMA node that a PCIe device is plugged into, but exploring that is future work (it requires exposing NUMA topology to the Dom0 kernel to benefit from it), and for simplicity the diagram above does not show it.
+
+## Advantages of NUMA
+
+NUMA does have advantages though: if each node accesses only its local memory, then each node can independently achieve maximum throughput.
+
+For best performance we should:
+ - minimize the amount of interconnect bandwidth we are using
+ - run code that accesses memory allocated on the closest NUMA node
+ - maximize the number of NUMA nodes that we use in the system as a whole
+
+If a VM's memory and vCPUs can entirely fit within a single NUMA node then we should tell Xen to prefer to allocate memory from, and run the vCPUs on, that single NUMA node.
+
+## Xen vCPU soft-affinity
+
+The Xen scheduler supports 2 kinds of constraints:
+* hard pinning: a vCPU may only run on the specified set of pCPUs and nowhere else
+* soft pinning: a vCPU is *preferably* run on the specified set of pCPUs, but if they are all busy then it may run elsewhere
+
+The former is useful if you want strict separation, but it can potentially leave part of the system idle while another part is bottlenecked with lots of vCPUs all competing for the same limited set of pCPUs.
+
+Xen does not migrate workloads between NUMA nodes on its own (the Linux kernel does), although it is possible to achieve a similar effect with explicit migration.
+However migration introduces additional delays and is best avoided for entire VMs.
+
+The latter (soft pinning) is preferred: running a workload now, even on a potentially suboptimal pCPU (with higher NUMA latency), is still better than not running it at all and waiting until a pCPU is freed up.
+
+Xen will also allocate memory for the VM according to the vCPU (soft) pinning: if the vCPUs are pinned only to NUMA nodes A and B, then it will allocate the VM's memory from NUMA nodes A and B (in a round-robin way, resulting in interleaving).
+
+By default (no pinning) it will interleave memory from all NUMA nodes, which provides average performance, but individual tasks' performance may be significantly higher or lower depending on which NUMA node the application may have "landed" on.
+Furthermore, restarting processes will speed them up or slow them down as address space randomization picks different memory regions inside a VM.
+
+Note that this is not the worst case: the worst case would be for memory to be allocated on one NUMA node, but the vCPU always running on the furthest away NUMA node.
+
+## Best effort NUMA-aware memory allocation for VMs
+
+By default Xen stripes the VM's memory across all NUMA nodes of the host, which means that every VM has to go through all the interconnects.
+The goal here is to find a better allocation than the default, not necessarily an optimal allocation.
+An optimal allocation would require knowing what VMs you would start/create in the future, and planning across hosts too.
+
+Overall we want to balance the VMs across NUMA nodes, such that we use all NUMA nodes to take advantage of the maximum memory bandwidth available on the system.
+For now this proposed balancing is done only by balancing memory usage: always heuristically allocating VMs on the NUMA node that has the most available memory.
+Note that this allocation currently has a race condition when multiple VMs are booted in parallel, because we don't wait until Xen has constructed the domain for each one (that would serialize domain construction, which is currently parallel).
+This may be improved in the future by having an API to query Xen about where it has allocated the memory, and to explicitly ask it to place memory on a given NUMA node (instead of best-effort).
+
+If a VM doesn't fit into a single node then it is not so clear what the best approach is.
+One criterion to consider is minimizing the NUMA distance between the nodes chosen for the VM.
+Large NUMA systems may not be fully connected in a mesh, requiring multiple hops to reach a node, or may even have asymmetric links, or links with different widths.
+These trade-offs should be approximately reflected in the ACPI SLIT tables, as a matrix of distances between nodes.
+It is possible that 3 NUMA nodes have a smaller average/maximum distance than 2, so we need to consider all possibilities.
+
+For N nodes there would be 2^N possibilities, so [Topology.NUMA.candidates] limits the number of choices to 65520+N (the full set of 2^N possibilities for up to 16 NUMA nodes, and a reduced set of choices for larger systems).
+
+[Topology.NUMA.candidates] is a sorted sequence of node sets, in ascending order of maximum/average distances.
+Once we've eliminated the candidates not suitable for this VM (those that do not have enough total memory/pCPUs) we are left with a sequence of node sets of monotonically increasing distance.
+There are still multiple possibilities with the same average distance.
+This is where we consider our second criterion, balancing, and pick the candidate with the most available free memory.
+
+Once a suitable set of NUMA nodes is picked we compute the CPU soft affinity as the union of the CPUs from all these NUMA nodes.
+If we didn't find a solution then we let Xen use its default allocation.
+
+The "distances" between NUMA nodes may not all be equal, e.g. some nodes may have shorter links to some remote NUMA nodes, while others may have to go through multiple hops to reach them.
+See page 13 in [^AMD_numa] for a diagram of an AMD Opteron 6272 system.
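+
+As an illustration, the selection logic described above can be sketched as follows. This is a simplified, self-contained model only: the `node` record, the `plan` function and the distance handling are illustrative and do not match the real [Topology]/[Softaffinity] API in xenopsd.
+
+```ocaml
+(* Minimal sketch of the candidate-selection idea: filter out node sets that
+   cannot satisfy the VM, order the rest by average distance and then by free
+   memory, and return the union of the chosen nodes' CPUs as the soft-affinity
+   set. Names and types are illustrative, not the real xenopsd interfaces. *)
+module IntSet = Set.Make (Int)
+
+type node = {cpus: IntSet.t; free_mem: int64}
+
+let avg_distance distances idxs =
+  (* average of the SLIT-style distances over all (i, j) pairs in the set *)
+  let ds =
+    List.concat_map (fun i -> List.map (fun j -> distances.(i).(j)) idxs) idxs
+  in
+  float_of_int (List.fold_left ( + ) 0 ds) /. float_of_int (List.length ds)
+
+let plan nodes distances ~vcpus ~memory candidates =
+  let free idxs =
+    List.fold_left (fun acc i -> Int64.add acc nodes.(i).free_mem) 0L idxs
+  in
+  let cpus idxs =
+    List.fold_left (fun acc i -> IntSet.union acc nodes.(i).cpus) IntSet.empty idxs
+  in
+  candidates
+  (* drop candidates that don't have enough memory or pCPUs for this VM *)
+  |> List.filter (fun idxs ->
+         free idxs >= memory && IntSet.cardinal (cpus idxs) >= vcpus
+     )
+  (* prefer smaller average distance, then more free memory (balancing) *)
+  |> List.map (fun idxs ->
+         (avg_distance distances idxs, Int64.neg (free idxs), idxs)
+     )
+  |> List.sort compare
+  |> function
+  | [] ->
+      None (* no suitable candidate: let Xen use its default allocation *)
+  | (_, _, best) :: _ ->
+      Some (cpus best) (* soft affinity = union of the chosen nodes' CPUs *)
+```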
+
+## Limitations and tradeoffs
+
+* Booting multiple VMs in parallel may result in allocating them on the same NUMA node (race condition)
+* When we're about to run out of host memory we'll fall back to striping memory again, but the soft affinity mask won't reflect that (this needs an API to query Xen on where it has actually placed the VM, so we can fix up the mask accordingly)
+* XAPI is not aware of NUMA balancing across a pool, and chooses hosts purely based on the total amount of free memory, even if a better NUMA placement could be found on another host
+* Very large (>16 NUMA nodes) systems may only explore a limited number of choices (fit into a single node vs falling back to full interleaving)
+* The exact VM placement is not yet controllable
+* Microbenchmarks with a single VM on a host show both performance improvements and regressions in memory bandwidth usage: previously a single VM may have been able to take advantage of the bandwidth of both NUMA nodes if it happened to allocate memory from the right places, whereas now it'll be forced to use just a single node.
+  As soon as more than 1 VM is busy on the system, however, enabling NUMA balancing should almost always be an improvement.
+* Combining hard vCPU masks with soft affinity is not supported: if hard affinities are used then no NUMA scheduling is done by the toolstack and we obey exactly what the user has asked for with hard affinities.
+  This shouldn't affect other VMs, since the memory used by hard-pinned VMs will still be reflected in less memory being available overall on individual NUMA nodes.
+* Corner case: the ACPI standard allows certain NUMA nodes to be unreachable (distance `0xFF` = `-1` in the Xen bindings).
+  This is not supported and will cause an exception to be raised.
+  If this becomes an issue in practice, the NUMA matrix could be pre-filtered to contain only reachable nodes.
+  NUMA nodes with 0 CPUs *are* accepted (they can result from hard affinity pinning)
+* NUMA balancing is not considered during HA planning
+* Dom0 is a single VM that needs to communicate with all other VMs, so NUMA balancing is not applied to it (we'd need to expose NUMA topology to the Dom0 kernel so it can better allocate processes)
+* IO NUMA is out of scope for now
+
+## XAPI datamodel design
+
+* New API field: `Host.numa_affinity_policy`.
+* Choices: `default`, `any`, `best-effort`.
+* On upgrade the field is set to `default`.
+* Changes to the field only affect newly (re)booted VMs; for changes to take effect on existing VMs a host evacuation or reboot is needed.
+
+There may be more choices in the future (e.g. `strict`, which requires both Xen and toolstack changes).
+
+Meaning of the policy:
+* `any`: the Xen default, where it allocates memory by striping across NUMA nodes
+* `best-effort`: the algorithm described in this document, where soft pinning is used to achieve better balancing and lower latency
+* `default`: when the admin hasn't expressed a preference
+
+* Currently `default` is treated as `any` (see the sketch after this list), but the admin can change it, and then the system will remember that change across upgrades.
+  If we didn't have a `default` then changing the "default" policy on an upgrade would be tricky: we would either risk overriding an explicit choice of the admin, or existing installs could not take advantage of the improved performance from `best-effort`.
+* Future XAPI versions may change `default` to mean `best-effort`.
+  Admins can still override it to `any` if they wish on a host by host basis.
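+
+For illustration, the translation from the XAPI enum to the xenopsd policy introduced in this change (see `Xapi_xenops.set_numa_affinity_policy` later in this diff) boils down to the following; treating `default` as `Any` is the part that may change in a future release:
+
+```ocaml
+(* Sketch of how the datamodel enum is translated for xenopsd: only an
+   explicit `best-effort` enables the new placement logic for now.
+   [to_xenops_policy] is an illustrative name, not an actual function. *)
+let to_xenops_policy = function
+  | `any -> Xenops_interface.Host.Any
+  | `besteffort -> Xenops_interface.Host.Best_effort
+  | `default -> Xenops_interface.Host.Any (* may become Best_effort later *)
+```
+
+With the corresponding CLI field added in `records.ml`, an admin would typically change the policy with something like `xe host-param-set uuid=<host-uuid> numa-affinity-policy=best-effort` (following the usual `xe` host parameter conventions).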
+
+It is not expected that users would have to change `best-effort`, unless they run very specific workloads, so a pool-level control is not provided at this moment.
+
+There is also no separate feature flag: this host field acts as the feature flag, and it can be set through the API without restarting the toolstack, although only newly started VMs will benefit.
+
+Debugging the allocator is done by running `xl vcpu-list` and investigating the soft pinning masks, and by analyzing `xensource.log`.
+
+### Xenopsd implementation
+
+See the documentation in [softaffinity.mli] and [topology.mli].
+
+* [Softaffinity.plan] returns a [CPUSet] given a host's NUMA allocation state and a VM's NUMA allocation request.
+* [Topology.CPUSet] provides helpers for operating on a set of CPU indexes.
+* [Topology.NUMAResource] is a [CPUSet] and the free memory available on a NUMA node.
+* [Topology.NUMARequest] is a request for a given number of vCPUs and memory in bytes.
+* [Topology.NUMA] represents a host's NUMA allocation state.
+* [Topology.NUMA.candidates] are groups of nodes ordered by minimum average distance.
+  The sequence is limited to [N+65520], where [N] is the number of NUMA nodes.
+  This avoids exponential state space explosion on very large systems (>16 NUMA nodes).
+* [Topology.NUMA.choose] will choose one NUMA node deterministically, while trying to keep overall NUMA node usage balanced.
+* [Domain.numa_placement] builds a [NUMARequest] and uses the above [Topology] and [Softaffinity] functions to compute and apply a plan.
+
+We used to have a `xenopsd.conf` configuration option to enable NUMA placement; for backwards compatibility this is still supported, but only if the admin hasn't set an explicit policy on the Host.
+It is best to remove the experimental `xenopsd.conf` entry though, as a future version may drop it completely.
+
+Tests are in [test_topology.ml], which checks balancing properties and whether the plan has improved best/worst/average-case access times in a simulated test based on 2 predefined NUMA distance matrices (one from an Intel and one from an AMD system).
+
+## Future work
+
+* enable `best-effort` mode by default once more testing has been done
+* an API to query Xen about where it has actually allocated the VM's memory.
+  Currently only an `xl debug-keys` interface exists, which is not supported in production as it can result in killing the host via the watchdog; it is not a proper API, but a textual debug output with no stability guarantees.
+* more host policies (e.g. `strict`).
+  This requires the XAPI pool scheduler to be NUMA-aware and to consider NUMA placement as part of choosing hosts.
+* VM-level policy that can set a NUMA affinity index, mapped to a NUMA node modulo the number of NUMA nodes available on the system (this is needed so that after migration we don't end up trying to allocate vCPUs to a non-existent NUMA node)
+* VM-level anti-affinity rules for NUMA placement (can be achieved by setting unique NUMA affinity indexes)
+
+[^xen_numa]: [Xen on NUMA Machines](https://wiki.xenproject.org/wiki/Xen_on_NUMA_Machines)
+[^kernel_numa]: [What is NUMA?](https://www.kernel.org/doc/html/v6.6/mm/numa.html)
+[^kernel_numa_perf]: [NUMA memory performance](https://www.kernel.org/doc/html/v6.6/admin-guide/mm/numaperf.html)
+[^kernel_numa_policy]: [NUMA memory policy](https://www.kernel.org/doc/html/v6.6/admin-guide/mm/numa_memory_policy.html)
+[^AMD_numa]: Lepers, Baptiste. ["Improving performance on NUMA systems."](https://theses.hal.science/tel-01549294/document) PhD diss., Université de Grenoble, 2014.
+[^lstopo]: created with `lstopo-no-graphics --no-io --of svg --vert=L3 >hwloc.svg` on a bare metal Linux diff --git a/ocaml/idl/datamodel_common.ml b/ocaml/idl/datamodel_common.ml index ba4f1728219..ec7f759d632 100644 --- a/ocaml/idl/datamodel_common.ml +++ b/ocaml/idl/datamodel_common.ml @@ -10,7 +10,7 @@ open Datamodel_roles to leave a gap for potential hotfixes needing to increment the schema version.*) let schema_major_vsn = 5 -let schema_minor_vsn = 769 +let schema_minor_vsn = 770 (* Historical schema versions just in case this is useful later *) let rio_schema_major_vsn = 5 diff --git a/ocaml/idl/datamodel_host.ml b/ocaml/idl/datamodel_host.ml index 672f34ea8c4..3d1f11677a1 100644 --- a/ocaml/idl/datamodel_host.ml +++ b/ocaml/idl/datamodel_host.ml @@ -1668,6 +1668,36 @@ let cleanup_pool_secret = ] ~allowed_roles:_R_LOCAL_ROOT_ONLY ~hide_from_docs:true () +let host_numa_affinity_policy = + Enum + ( "host_numa_affinity_policy" + , [ + ("any", "VMs are spread across all available NUMA nodes") + ; ( "best-effort" + , "VMs are placed on the smallest number of NUMA nodes that they fit \ + using soft-pinning, but the policy doesn't guarantee a balanced \ + placement, falling back to the 'any' policy." + ) + ; ( "default" + , "Use the NUMA affinity policy that is the default for the current \ + version" + ) + ] + ) + +let set_numa_affinity_policy = + call ~name:"set_numa_affinity_policy" ~lifecycle:[] + ~doc:"Set VM placement NUMA affinity policy" + ~params: + [ + (Ref _host, "self", "The host") + ; ( host_numa_affinity_policy + , "value" + , "The NUMA affinity policy to apply to a host" + ) + ] + ~allowed_roles:_R_POOL_ADMIN () + let host_sched_gran = Enum ( "host_sched_gran" @@ -1925,6 +1955,7 @@ let t = ; cleanup_pool_secret ; set_sched_gran ; get_sched_gran + ; set_numa_affinity_policy ; emergency_disable_tls_verification ; emergency_reenable_tls_verification ; cert_distrib_atom @@ -2164,6 +2195,9 @@ let t = ~default_value:(Some (VEnum "unknown")) "Default as 'unknown', 'yes' if the host is up to date with \ updates synced from remote CDN, otherwise 'no'" + ; field ~qualifier:DynamicRO ~lifecycle:[] ~ty:host_numa_affinity_policy + "numa_affinity_policy" ~default_value:(Some (VEnum "default")) + "NUMA-aware VM memory and vCPU placement policy" ] ) () diff --git a/ocaml/idl/schematest.ml b/ocaml/idl/schematest.ml index 617247d519a..aa95fc01eaa 100644 --- a/ocaml/idl/schematest.ml +++ b/ocaml/idl/schematest.ml @@ -3,7 +3,7 @@ let hash x = Digest.string x |> Digest.to_hex (* BEWARE: if this changes, check that schema has been bumped accordingly in ocaml/idl/datamodel_common.ml, usually schema_minor_vsn *) -let last_known_schema_hash = "96da2c136aa13d4f8e71dd40fe2b84af" +let last_known_schema_hash = "2ec2aeef2405d6bd73ab6beb44185532" let current_schema_hash : string = let open Datamodel_types in diff --git a/ocaml/tests/common/test_common.ml b/ocaml/tests/common/test_common.ml index 1194ed6dbae..510db04352f 100644 --- a/ocaml/tests/common/test_common.ml +++ b/ocaml/tests/common/test_common.ml @@ -208,7 +208,7 @@ let make_host2 ~__context ?(ref = Ref.make ()) ?(uuid = make_uuid ()) ~display:`enabled ~virtual_hardware_platform_versions:[] ~control_domain:Ref.null ~updates_requiring_reboot:[] ~iscsi_iqn:"" ~multipathing:false ~uefi_certificates:"" ~editions:[] ~pending_guidances:[] - ~tls_verification_enabled + ~tls_verification_enabled ~numa_affinity_policy:`default ~last_software_update:(Xapi_host.get_servertime ~__context ~host:ref) ~recommended_guidances:[] 
~latest_synced_updates_applied:`unknown ; ref diff --git a/ocaml/xapi-cli-server/record_util.ml b/ocaml/xapi-cli-server/record_util.ml index 9b681ea6772..8c2bad7e338 100644 --- a/ocaml/xapi-cli-server/record_util.ml +++ b/ocaml/xapi-cli-server/record_util.ml @@ -706,6 +706,25 @@ let host_sched_gran_to_string = function | `socket -> "socket" +let host_numa_affinity_policy_to_string = function + | `any -> + "any" + | `besteffort -> + "best-effort" + | `default -> + "default" + +let host_numa_affinity_policy_of_string = function + | "any" -> + `any + | "best-effort" -> + `besteffort + | "default" -> + `default + | s -> + raise + (Record_failure ("Expected 'any', 'best-effort' or 'default', got " ^ s)) + let pgpu_dom0_access_to_string x = host_display_to_string x let string_to_vdi_onboot s = diff --git a/ocaml/xapi-cli-server/records.ml b/ocaml/xapi-cli-server/records.ml index c40c3e083cd..311f5923987 100644 --- a/ocaml/xapi-cli-server/records.ml +++ b/ocaml/xapi-cli-server/records.ml @@ -3219,6 +3219,16 @@ let host_record rpc session_id host = Client.Host.set_uefi_certificates ~rpc ~session_id ~host ~value ) () + ; make_field ~name:"numa-affinity-policy" + ~get:(fun () -> + (x ()).API.host_numa_affinity_policy + |> Record_util.host_numa_affinity_policy_to_string + ) + ~set:(fun value -> + Client.Host.set_numa_affinity_policy ~rpc ~session_id ~self:host + ~value:(Record_util.host_numa_affinity_policy_of_string value) + ) + () ; make_field ~name:"pending-guidances" ~get:(fun () -> map_and_concat Record_util.update_guidance_to_string diff --git a/ocaml/xapi-idl/xen/xenops_interface.ml b/ocaml/xapi-idl/xen/xenops_interface.ml index f472bafefea..ab240a53cb5 100644 --- a/ocaml/xapi-idl/xen/xenops_interface.ml +++ b/ocaml/xapi-idl/xen/xenops_interface.ml @@ -494,6 +494,12 @@ module Host = struct } [@@deriving rpcty] + type numa_affinity_policy = + | Any (** VMs may run on any NUMA nodes. 
This is the default in 8.2CU1 *) + | Best_effort + (** best effort placement on the smallest number of NUMA nodes where possible *) + [@@deriving rpcty] + type guest_agent_feature_list = guest_agent_feature list [@@deriving rpcty] end @@ -645,6 +651,16 @@ module XenopsAPI (R : RPC) = struct @-> host_policy_p @-> returning (Param.mk Types.bool) err ) + + let set_numa_affinity_policy = + let numa_affinity_policy_p = + Param.mk + ~description:["Host NUMA affinity policy"] + ~name:"numa_affinity_policy" Host.numa_affinity_policy + in + declare "HOST.set_numa_affinity_policy" + ["Sets the host's NUMA aware VM scheduling policy"] + (debug_info_p @-> numa_affinity_policy_p @-> returning unit_p err) end module VM = struct diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index 2c34b2667e5..f76775f3460 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -4017,6 +4017,15 @@ functor Client.Host.get_sched_gran ~rpc ~session_id ~self ) + let set_numa_affinity_policy ~__context ~self ~value = + info "Host.set_numa_affinity_policy: host='%s' policy='%s'" + (host_uuid ~__context self) + (Record_util.host_numa_affinity_policy_to_string value) ; + let local_fn = Local.Host.set_numa_affinity_policy ~self ~value in + do_op_on ~local_fn ~__context ~host:self (fun session_id rpc -> + Client.Host.set_numa_affinity_policy ~rpc ~session_id ~self ~value + ) + let emergency_disable_tls_verification ~__context = info "Host.emergency_disable_tls_verification" ; Local.Host.emergency_disable_tls_verification ~__context diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index e2579b497de..caeed6208e5 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -1039,6 +1039,7 @@ let create ~__context ~uuid ~name_label ~name_description:_ ~hostname ~address ~cpu_configuration:[] (* !!! FIXME hard coding *) ~cpu_info:[] ~chipset_info ~memory_overhead:0L ~sched_policy:"credit" (* !!! 
FIXME hard coding *) + ~numa_affinity_policy:`default ~supported_bootloaders:(List.map fst Xapi_globs.supported_bootloaders) ~suspend_image_sr:Ref.null ~crash_dump_sr:Ref.null ~logging:[] ~hostname ~address ~metrics ~license_params ~boot_free_mem:0L ~ha_statefiles:[] @@ -2856,6 +2857,10 @@ let notify_send_new_pool_secret ~__context ~host:_ ~old_ps ~new_ps = let cleanup_pool_secret ~__context ~host:_ ~old_ps ~new_ps = Xapi_psr.cleanup ~__context ~old_ps ~new_ps +let set_numa_affinity_policy ~__context ~self ~value = + Db.Host.set_numa_affinity_policy ~__context ~self ~value ; + Xapi_xenops.set_numa_affinity_policy ~__context ~value + let set_sched_gran ~__context ~self ~value = if Helpers.get_localhost ~__context <> self then failwith "Forwarded to the wrong host" ; diff --git a/ocaml/xapi/xapi_host.mli b/ocaml/xapi/xapi_host.mli index 15d79072765..952dbed0f7e 100644 --- a/ocaml/xapi/xapi_host.mli +++ b/ocaml/xapi/xapi_host.mli @@ -532,6 +532,12 @@ val set_sched_gran : val get_sched_gran : __context:Context.t -> self:API.ref_host -> API.host_sched_gran +val set_numa_affinity_policy : + __context:Context.t + -> self:API.ref_host + -> value:API.host_numa_affinity_policy + -> unit + val emergency_disable_tls_verification : __context:Context.t -> unit val alert_if_tls_verification_was_emergency_disabled : diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 73cb08de2cb..0c44113087f 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -3116,7 +3116,30 @@ let resync_all_vms ~__context = in List.iter (fun vm -> refresh_vm ~__context ~self:vm) resident_vms_in_db +let set_numa_affinity_policy ~__context ~value = + let dbg = Context.string_of_task __context in + let open Xapi_xenops_queue in + let module Client = (val make_client (default_xenopsd ()) : XENOPS) in + let value = + let open Xenops_interface.Host in + match value with + | `any -> + Any + | `besteffort -> + Best_effort + | `default -> + Any + in + Client.HOST.set_numa_affinity_policy dbg value + let on_xapi_restart ~__context = + let host = Helpers.get_localhost ~__context in + let value = Db.Host.get_numa_affinity_policy ~__context ~self:host in + info "Setting NUMA affinity policy in xenopsd on startup to %s" + (Record_util.host_numa_affinity_policy_to_string value) ; + set_numa_affinity_policy ~__context ~value ; + + info "Resynchronizing VM state with xenopsd" ; resync_resident_on ~__context ; (* For all available xenopsds, start the event thread. This will cause events on everything xenopsd knows about, hence a refresh of all VMs. *) diff --git a/ocaml/xenopsd/lib/softaffinity.ml b/ocaml/xenopsd/lib/softaffinity.ml index 34274ef45bf..4e38640dcd1 100644 --- a/ocaml/xenopsd/lib/softaffinity.ml +++ b/ocaml/xenopsd/lib/softaffinity.ml @@ -17,64 +17,8 @@ module D = Debug.Make (struct let name = "softaffinity" end) open D -(* On a NUMA system each node has fast, lower latency access to local memory. It - can access memory of other NUMA nodes, but this requires going through the - interconnect (and possible multiple hops), which is higher latency and has - less bandwidth than the link to the local memory. NUMA does have advantages - though: if each node accesses only its local memory, then each node can - independently achieve maximum throughput. 
For best performance we should: +(* See ../../../doc/toolstack/features/NUMA/index.md *) - - minimize the amount of interconnect bandwidth we are using - - - maximize the number of NUMA nodes that we use in the system as a whole - - If a VM's memory and vCPUs can entirely fit within a single NUMA node then we - should tell Xen to prefer to allocate memory from and run the vCPUs on 1 NUMA - node. - - This can be achieved by using the VM's soft affinity CPU mask: Xen would - allocate memory in a round-robin way only from the NUMA nodes corresponding - to the vCPUs, and it would prefer to schedule the vCPUs on the pCPUs in the - soft affinity mask. If it cannot (e.g. all those pCPUs are busy) then it - would still run the vCPU elsewhere. This is better than hard affinity where - the vCPU would not run at all (running the vCPU, even with slower access to - memory is better than not running it at all). - - By default Xen stripes the VM's memory accross all NUMA nodes of the host, - which means that every VM has to go through all the interconnects. The goal - here is to find a better allocation than the default, not necessarily an - optimal allocation. An optimal allocation would require knowing what VMs you - would start/create in the future, and planning across hosts too. - - Overall we want to balance the VMs across NUMA nodes, such that we use all - NUMA nodes to take advantage of the maximum memory bandwidth available on the - system. For now this balancing is done only by balancing memory usage: always - heuristically allocating VMs on the NUMA node that has the most available - memory. - - If a VM doesn't fit into a single node then it is not so clear what the best - approach is. One criteria to consider is minimizing the NUMA distance between - the nodes chosen for the VM. Large NUMA systems may not be fully connected in - a mesh requiring multiple hops to each a node, or even have assymetric links, - or links with different bitwidth. These tradeoff should be approximatively - reflected in the ACPI SLIT tables, as a matrix of distances between nodes. It - is possible that 3 NUMA nodes have a smaller average/maximum distance than 2, - so we need to consider all possibilities. For N nodes there would be 2^N - possibilities, so [NUMA.candidates] limits the number of choices to 65520+N - (full set of 2^N possibilities for 16 NUMA nodes, and a reduced set of - choices for larger systems). - - [NUMA.candidates] is a sorted sequence of node sets, in ascending order of - maximum/average distances. Once we've eliminated the candidates not suitable - for this VM (that do not have enough total memory/pCPUs) we are left with a - monotonically increasing sequence of nodes. There are still multiple - possibilities with same average distance. This is where we consider our - second criteria - balancing - and pick the node with most available free - memory. - - Once a suitable set of NUMA nodes are picked we compute the CPU soft affinity - as the union of the CPUs from all these NUMA nodes. If we didn't find a - solution then we let Xen use its default allocation. 
*) let plan host nodes ~vm = (* let host = NUMA.apply_mask host vm.NUMAResource.affinity in *) let pick_node (allocated, picked, requested) (NUMA.Node nodeidx as node) = diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index dec4839bab6..b2a822b659c 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ b/ocaml/xenopsd/lib/xenops_server.ml @@ -3404,6 +3404,8 @@ module VIF = struct () end +let numa_placement = ref Xenops_interface.Host.Any + module HOST = struct let stat _ dbg = Debug.with_thread_associated dbg @@ -3414,6 +3416,11 @@ module HOST = struct ) () + let set_numa_affinity_policy _ dbg = + Debug.with_thread_associated dbg @@ fun policy -> + debug "HOST.set_numa_affinity_policy" ; + numa_placement := policy + let get_console_data _ dbg = Debug.with_thread_associated dbg (fun () -> @@ -4103,6 +4110,7 @@ let _ = Server.TASK.destroy (TASK.destroy ()) ; Server.TASK.destroy_on_finish (TASK.destroy_on_finish ()) ; Server.HOST.stat (HOST.stat ()) ; + Server.HOST.set_numa_affinity_policy (HOST.set_numa_affinity_policy ()) ; Server.HOST.get_console_data (HOST.get_console_data ()) ; Server.HOST.get_total_memory_mib (HOST.get_total_memory_mib ()) ; Server.HOST.send_debug_keys (HOST.send_debug_keys ()) ; diff --git a/ocaml/xenopsd/lib/xenopsd.ml b/ocaml/xenopsd/lib/xenopsd.ml index 09b936d6b1c..c029c8d3862 100644 --- a/ocaml/xenopsd/lib/xenopsd.ml +++ b/ocaml/xenopsd/lib/xenopsd.ml @@ -59,10 +59,7 @@ let feature_flags_path = ref "/etc/xenserver/features.d" let pvinpvh_xen_cmdline = ref "pv-shim console=xen" -let numa_placement = ref false - -(* This is for debugging only *) -let numa_placement_strict = ref false +let numa_placement_compat = ref false (* O(N^2) operations, until we get a xenstore cache, so use a small number here *) let vm_guest_agent_xenstore_quota = ref 128 @@ -243,14 +240,9 @@ let options = , "Command line for the inner-xen for PV-in-PVH guests" ) ; ( "numa-placement" - , Arg.Bool (fun x -> numa_placement := x) - , (fun () -> string_of_bool !numa_placement) - , "NUMA-aware placement of VMs" - ) - ; ( "numa-placement-strict" - , Arg.Bool (fun x -> numa_placement_strict := x) - , (fun () -> string_of_bool !numa_placement) - , "Fail if NUMA-aware placement is not possible" + , Arg.Bool (fun x -> numa_placement_compat := x) + , (fun () -> string_of_bool !numa_placement_compat) + , "NUMA-aware placement of VMs (deprecated, use XAPI setting)" ) ; ( "pci-quarantine" , Arg.Bool (fun b -> pci_quarantine := b) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index ca1e7bcc421..96aa1037166 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -809,19 +809,17 @@ let numa_mutex = Mutex.create () let numa_resources = ref None let numa_init () = - if !Xenopsd.numa_placement then ( - let xcext = Xenctrlext.get_handle () in - let host = Lazy.force numa_hierarchy in - let mem = (Xenctrlext.numainfo xcext).memory in - D.debug "Host NUMA information: %s" - (Fmt.to_to_string Topology.NUMA.pp_dump host) ; - Array.iteri - (fun i m -> - let open Xenctrlext in - D.debug "NUMA node %d: %Ld/%Ld memory free" i m.memfree m.memsize - ) - mem - ) + let xcext = Xenctrlext.get_handle () in + let host = Lazy.force numa_hierarchy in + let mem = (Xenctrlext.numainfo xcext).memory in + D.debug "Host NUMA information: %s" + (Fmt.to_to_string Topology.NUMA.pp_dump host) ; + Array.iteri + (fun i m -> + let open Xenctrlext in + D.debug "NUMA node %d: %Ld/%Ld memory free" i m.memfree m.memsize + ) + mem let numa_placement domid ~vcpus ~memory 
= let open Xenctrlext in @@ -911,20 +909,19 @@ let build_pre ~xc ~xs ~vcpus ~memory ~has_hard_affinity domid = log_reraise (Printf.sprintf "shadow_allocation_set %d MiB" shadow_mib) (fun () -> Xenctrl.shadow_allocation_set xc domid shadow_mib ) ; - if !Xenopsd.numa_placement then - log_reraise (Printf.sprintf "NUMA placement") (fun () -> - if has_hard_affinity then - D.debug "VM has hard affinity set, skipping NUMA optimization" - else - let do_numa_placement () = - numa_placement domid ~vcpus - ~memory:(Int64.mul memory.xen_max_mib 1048576L) - in - if !Xenopsd.numa_placement_strict then - do_numa_placement () - else - Xenops_utils.best_effort "NUMA placement" do_numa_placement - ) ; + let () = + match !Xenops_server.numa_placement with + | Any -> + () + | Best_effort -> + log_reraise (Printf.sprintf "NUMA placement") (fun () -> + if has_hard_affinity then + D.debug "VM has hard affinity set, skipping NUMA optimization" + else + numa_placement domid ~vcpus + ~memory:(Int64.mul memory.xen_max_mib 1048576L) + ) + in create_channels ~xc uuid domid let xenguest_args_base ~domid ~store_port ~store_domid ~console_port diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index a4cdb77686d..bc1c1c77d30 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -5171,6 +5171,8 @@ let init () = {Xs_protocol.ACL.owner= 0; other= Xs_protocol.ACL.READ; acl= []} ) ; Device.Backend.init () ; + Xenops_server.numa_placement := + if !Xenopsd.numa_placement_compat then Best_effort else Any ; Domain.numa_init () ; debug "xenstore is responding to requests" ; let () = Watcher.create_watcher_thread () in