From 59755626888b8c3bc64b6b873dee196913348944 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Tue, 18 Jun 2024 16:43:14 +0100 Subject: [PATCH] CA-392887: set_tls_config immediately after enabling clustering Previously xapi called `set_tls_config` regardless of whether a host had joined or was enabled, which restarts the remote server of xapi-clusterd. In the meantime, another xapi-clusterd might also be joining, which causes `distribute_state` to be called while the remote server is restarting. Now remove the unconditional `set_tls_config` call. This is because `join_internal` already creates a tls_config and passes it to xapi-clusterd, but xapi-clusterd does not store that tls_config in its db; it just starts the http server with that tls config. Modifying xapi-clusterd to store that config will be done in a separate PR. Moreover, `cluster_host.enable` also calls `set_tls_config`, which means there is no need to call `set_tls_config` if the cluster host is joined but not enabled. Also move the observer and watcher creation into the not-joined case, since `cluster_host.enable` already creates them and there is no need to create them if the host is not enabled. This does not, however, solve the whole problem. For that, we need to make sure that `distribute_state` and `set_tls_config` cannot happen at the same time. More generally, no remote calls may happen while `set_tls_config` is running. Hence they need to hold the same lock. This will be done in xapi-clusterd. 
Signed-off-by: Vincent Liu --- ocaml/xapi/xapi_cluster_host.ml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/ocaml/xapi/xapi_cluster_host.ml b/ocaml/xapi/xapi_cluster_host.ml index de14b266a96..782d5a240f5 100644 --- a/ocaml/xapi/xapi_cluster_host.ml +++ b/ocaml/xapi/xapi_cluster_host.ml @@ -230,9 +230,12 @@ let resync_host ~__context ~host = ~msg:Api_messages.cluster_host_enable_failed ~cls:`Host ~obj_uuid ~body ~api_func:(fun rpc session_id -> (* If we have just joined, enable will prevent concurrent clustering ops *) - if not (Db.Cluster_host.get_joined ~__context ~self) then - join_internal ~__context ~self - else if Db.Cluster_host.get_enabled ~__context ~self then ( + if not (Db.Cluster_host.get_joined ~__context ~self) then ( + join_internal ~__context ~self ; + create_cluster_watcher_on_master ~__context ~host ; + Xapi_observer.initialise_observer ~__context + Xapi_observer_components.Xapi_clusterd + ) else if Db.Cluster_host.get_enabled ~__context ~self then ( (* [enable] unconditionally invokes low-level enable operations and is idempotent. RPU reformats partition, losing service status, never re-enables clusterd *) debug "Cluster_host %s is enabled, starting up xapi-clusterd" @@ -241,13 +244,7 @@ let resync_host ~__context ~host = maybe_switch_cluster_stack_version ~__context ~self ~cluster_stack ; (* Note that join_internal and enable both use the clustering lock *) Client.Client.Cluster_host.enable ~rpc ~session_id ~self - ) ; - (* create the watcher here so that the watcher exists after toolstack restart *) - create_cluster_watcher_on_master ~__context ~host ; - Xapi_observer.initialise_observer ~__context - Xapi_observer_components.Xapi_clusterd ; - let verify = Stunnel_client.get_verify_by_default () in - set_tls_config ~__context ~self ~verify + ) ) (* API call split into separate functions to create in db and enable in client layer *)