diff --git a/ocaml/xapi/xapi_guest_agent.ml b/ocaml/xapi/xapi_guest_agent.ml index 32dc31b20d0..02054cac30e 100644 --- a/ocaml/xapi/xapi_guest_agent.ml +++ b/ocaml/xapi/xapi_guest_agent.ml @@ -128,10 +128,10 @@ let memory_targets : (int, int64) Hashtbl.t = Hashtbl.create 20 let dead_domains : IntSet.t ref = ref IntSet.empty let mutex = Mutex.create () -(** Reset all the guest metrics for a particular VM. 'lookup' reads a key from xenstore - and 'list' reads a directory from xenstore. Both are relative to the guest's - domainpath. *) -let all (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid = + +(* In the following functions, 'lookup' reads a key from xenstore and 'list' reads + a directory from xenstore. Both are relative to the guest's domainpath. *) +let get_initial_guest_metrics (lookup: string -> string option) (list: string -> string list) = let all_control = list "control" in let to_map kvpairs = List.concat (List.map (fun (xskey, mapkey) -> match lookup xskey with | Some xsval -> [ mapkey, xsval ] @@ -156,15 +156,58 @@ let all (lookup: string -> string option) (list: string -> string list) ~__conte and last_updated = Unix.gettimeofday () in let can_use_hotplug_vbd = get_tristate "feature/hotplug/vbd" in let can_use_hotplug_vif = get_tristate "feature/hotplug/vif" in - - (* let num = Mutex.execute mutex (fun () -> Hashtbl.fold (fun _ _ c -> 1 + c) cache 0) in - debug "Number of entries in hashtbl: %d" num; *) - (* to avoid breakage whilst 'micro' is added to linux and windows agents, default this field to -1 if it's not present in xenstore *) let pv_drivers_version = if List.mem_assoc "micro" pv_drivers_version then pv_drivers_version (* already there; do nothing *) - else ("micro","-1")::pv_drivers_version in + else ("micro","-1")::pv_drivers_version + in + {pv_drivers_version; os_version; networks; other; memory; device_id; last_updated; can_use_hotplug_vbd; can_use_hotplug_vif;} + + +let create_and_set_guest_metrics (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid = + let initial_gm = get_initial_guest_metrics lookup list + in + let self = Db.VM.get_by_uuid ~__context ~uuid in + let new_gm_uuid = (Uuid.to_string (Uuid.make_uuid ())) + and new_gm_ref = Ref.make () in + Db.VM_guest_metrics.create ~__context + ~ref:new_gm_ref + ~uuid:new_gm_uuid + ~os_version:initial_gm.os_version + ~pV_drivers_version:initial_gm.pv_drivers_version + ~pV_drivers_up_to_date:false + ~memory:[] ~disks:[] + ~networks:initial_gm.networks + ~pV_drivers_detected:false + ~other:initial_gm.other + ~last_updated:(Date.of_float initial_gm.last_updated) + ~other_config:[] + ~live:true + ~can_use_hotplug_vbd:initial_gm.can_use_hotplug_vbd + ~can_use_hotplug_vif:initial_gm.can_use_hotplug_vif + ; + Db.VM.set_guest_metrics ~__context ~self ~value:new_gm_ref; + + (* Update the cache with the new values *) + Mutex.execute mutex (fun () -> Hashtbl.replace cache domid initial_gm); + (* We've just set the thing to live, let's make sure it's not in the dead list *) + Mutex.execute mutex (fun () -> dead_domains := IntSet.remove domid !dead_domains); + + let sl xs = String.concat "; " (List.map (fun (k, v) -> k ^ ": " ^ v) xs) in + info "Received initial update from guest agent in VM %s; os_version = [ %s ]; pv_drivers_version = [ %s ]; networks = [ %s ]" + (Ref.string_of self) (sl initial_gm.os_version) (sl initial_gm.pv_drivers_version) (sl initial_gm.networks); + new_gm_ref + + +(** Reset all the guest metrics for a particular VM *) +let all (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid = + let {pv_drivers_version; os_version; networks; other; memory; device_id; + last_updated; can_use_hotplug_vbd; can_use_hotplug_vif;} = get_initial_guest_metrics lookup list + in + + (* let num = Mutex.execute mutex (fun () -> Hashtbl.fold (fun _ _ c -> 1 + c) cache 0) in + debug "Number of entries in hashtbl: %d" num; *) let self = Db.VM.get_by_uuid ~__context ~uuid in @@ -228,16 +271,8 @@ let all (lookup: string -> string option) (list: string -> string list) ~__conte then existing else (* if it doesn't exist, make a fresh one *) - let new_ref = Ref.make () and new_uuid = Uuid.to_string (Uuid.make_uuid ()) in - Db.VM_guest_metrics.create ~__context ~ref:new_ref ~uuid:new_uuid - ~os_version:os_version ~pV_drivers_version:pv_drivers_version ~pV_drivers_up_to_date:false ~memory:[] ~disks:[] ~networks:networks ~other:other - ~pV_drivers_detected:false ~last_updated:(Date.of_float last_updated) ~other_config:[] ~live:true ~can_use_hotplug_vbd:`unspecified ~can_use_hotplug_vif:`unspecified; - Db.VM.set_guest_metrics ~__context ~self ~value:new_ref; - (* We've just set the thing to live, let's make sure it's not in the dead list *) - let sl xs = String.concat "; " (List.map (fun (k, v) -> k ^ ": " ^ v) xs) in - info "Received initial update from guest agent in VM %s; os_version = [ %s]; pv_drivers_version = [ %s ]; networks = [ %s ]" (Ref.string_of self) (sl os_version) (sl pv_drivers_version) (sl networks); - Mutex.execute mutex (fun () -> dead_domains := IntSet.remove domid !dead_domains); - new_ref in + create_and_set_guest_metrics lookup list ~__context ~domid ~uuid + in (* We unconditionally reset the database values but observe that the database checks whether a value has actually changed before doing anything *) diff --git a/ocaml/xapi/xapi_vm.ml b/ocaml/xapi/xapi_vm.ml index 362bac4bd2c..2c972ef2b24 100644 --- a/ocaml/xapi/xapi_vm.ml +++ b/ocaml/xapi/xapi_vm.ml @@ -241,9 +241,7 @@ let start ~__context ~vm ~start_paused ~force = (* Clear out any VM guest metrics record. Guest metrics will be updated by * the running VM and for now they might be wrong, especially network * addresses inherited by a cloned VM. *) - let vm_gm = Db.VM.get_guest_metrics ~__context ~self:vm in - Db.VM.set_guest_metrics ~__context ~self:vm ~value:Ref.null; - (try Db.VM_guest_metrics.destroy ~__context ~self:vm_gm with _ -> ()); + Xapi_vm_helpers.delete_guest_metrics ~__context ~self:vm; (* This makes sense here while the available versions are 0, 1 and 2. * If/when we introduce another version, we must reassess this. *) @@ -253,7 +251,7 @@ let start ~__context ~vm ~start_paused ~force = Cpuid_helpers.reset_cpu_flags ~__context ~vm; (* If the VM has any vGPUs, gpumon must remain stopped until the - * VM has started. *) + * VM has started. *) begin match vmr.API.vM_VGPUs with | [] -> Xapi_xenops.start ~__context ~self:vm start_paused force diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index c7fd296421f..7d652f64db2 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -1276,11 +1276,68 @@ let update_vm ~__context id = let a = Opt.map (fun x -> f (snd x)) info in let b = Opt.map f previous in a <> b in + (* Helpers to create and update guest metrics when needed *) + let lookup state key = + if List.mem_assoc key state.guest_agent then Some (List.assoc key state.guest_agent) else None in + let list state dir = + let dir = if dir.[0] = '/' then String.sub dir 1 (String.length dir - 1) else dir in + let results = Listext.List.filter_map (fun (path, value) -> + if String.startswith dir path then begin + let rest = String.sub path (String.length dir) (String.length path - (String.length dir)) in + match List.filter (fun x -> x <> "") (String.split '/' rest) with + | x :: _ -> Some x + | _ -> None + end else None + ) state.guest_agent |> Listext.List.setify in + results in + let create_guest_metrics_if_needed () = + let gm = Db.VM.get_guest_metrics ~__context ~self in + if gm = Ref.null then + Opt.iter + (fun (_, state) -> + List.iter + (fun domid -> + try + let new_gm_ref = Xapi_guest_agent.create_and_set_guest_metrics (lookup state) (list state) ~__context ~domid ~uuid:id in + debug "xenopsd event: created guest metrics %s for VM %s" (Ref.string_of new_gm_ref) id + with e -> + error "Caught %s: while creating VM %s guest metrics" (Printexc.to_string e) id + ) state.domids + ) info in + let check_guest_agent () = + Opt.iter + (fun (_, state) -> + Opt.iter (fun oldstate -> + let old_ga = oldstate.guest_agent in + let new_ga = state.guest_agent in + + (* Remove memory keys *) + let ignored_keys = [ "data/meminfo_free"; "data/updated"; "data/update_cnt" ] in + let remove_ignored ga = + List.fold_left (fun acc k -> List.filter (fun x -> fst x <> k) acc) ga ignored_keys in + let old_ga = remove_ignored old_ga in + let new_ga = remove_ignored new_ga in + if new_ga <> old_ga then begin + debug "Will update VM.allowed_operations because guest_agent has changed."; + should_update_allowed_operations := true + end else begin + debug "Supressing VM.allowed_operations update because guest_agent data is largely the same" + end + ) previous; + List.iter + (fun domid -> + try + debug "xenopsd event: Updating VM %s domid %d guest_agent" id domid; + Xapi_guest_agent.all (lookup state) (list state) ~__context ~domid ~uuid:id + with e -> + error "Caught %s: while updating VM %s guest_agent" (Printexc.to_string e) id + ) state.domids + ) info in (* Notes on error handling: if something fails we log and continue, to - maximise the amount of state which is correctly synced. If something - does fail then we may end up permanently out-of-sync until either a - process restart or an event is generated. We may wish to periodically - inject artificial events IF there has been an event sync failure? *) + maximise the amount of state which is correctly synced. If something + does fail then we may end up permanently out-of-sync until either a + process restart or an event is generated. We may wish to periodically + inject artificial events IF there has been an event sync failure? *) if different (fun x -> x.power_state) then begin try debug "Will update VM.allowed_operations because power_state has changed."; @@ -1288,10 +1345,11 @@ let update_vm ~__context id = let power_state = xenapi_of_xenops_power_state (Opt.map (fun x -> (snd x).power_state) info) in debug "xenopsd event: Updating VM %s power_state <- %s" id (Record_util.power_state_to_string power_state); (* This will mark VBDs, VIFs as detached and clear resident_on - if the VM has permanently shutdown. current-operations - should not be reset as there maybe a checkpoint is ongoing*) + if the VM has permanently shutdown. current-operations + should not be reset as there maybe a checkpoint is ongoing*) Xapi_vm_lifecycle.force_state_reset_keep_current_operations ~__context ~self ~value:power_state; + if power_state = `Running then create_guest_metrics_if_needed (); if power_state = `Suspended || power_state = `Halted then begin Xapi_network.detach_for_vm ~__context ~host:localhost ~vm:self; Storage_access.reset ~__context ~vm:self; @@ -1406,48 +1464,6 @@ let update_vm ~__context id = with e -> error "Caught %s: while updating VM %s rtc/timeoffset" (Printexc.to_string e) id end; - let check_guest_agent () = - Opt.iter - (fun (_, state) -> - Opt.iter (fun oldstate -> - let old_ga = oldstate.guest_agent in - let new_ga = state.guest_agent in - - (* Remove memory keys *) - let ignored_keys = [ "data/meminfo_free"; "data/updated"; "data/update_cnt" ] in - let remove_ignored ga = - List.fold_left (fun acc k -> List.filter (fun x -> fst x <> k) acc) ga ignored_keys in - let old_ga = remove_ignored old_ga in - let new_ga = remove_ignored new_ga in - if new_ga <> old_ga then begin - debug "Will update VM.allowed_operations because guest_agent has changed."; - should_update_allowed_operations := true - end else begin - debug "Supressing VM.allowed_operations update because guest_agent data is largely the same" - end - ) previous; - List.iter - (fun domid -> - let lookup key = - if List.mem_assoc key state.guest_agent then Some (List.assoc key state.guest_agent) else None in - let list dir = - let dir = if dir.[0] = '/' then String.sub dir 1 (String.length dir - 1) else dir in - let results = Listext.List.filter_map (fun (path, value) -> - if String.startswith dir path then begin - let rest = String.sub path (String.length dir) (String.length path - (String.length dir)) in - match List.filter (fun x -> x <> "") (String.split '/' rest) with - | x :: _ -> Some x - | _ -> None - end else None - ) state.guest_agent |> Listext.List.setify in - results in - try - debug "xenopsd event: Updating VM %s domid %d guest_agent" id domid; - Xapi_guest_agent.all lookup list ~__context ~domid ~uuid:id - with e -> - error "Caught %s: while updating VM %s guest_agent" (Printexc.to_string e) id - ) state.domids - ) info in if different (fun x -> x.hvm) then begin Opt.iter (fun (_, state) -> @@ -1484,15 +1500,44 @@ let update_vm ~__context id = let update_pv_drivers_detected () = Opt.iter (fun (_, state) -> - let gm = Db.VM.get_guest_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s PV drivers detected %b" id state.pv_drivers_detected; - Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm ~value:state.pv_drivers_detected; - Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm ~value:state.pv_drivers_detected + try + let gm = Db.VM.get_guest_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s PV drivers detected %b" id state.pv_drivers_detected; + Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm ~value:state.pv_drivers_detected; + Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm ~value:state.pv_drivers_detected + with e -> + debug "Caught %s: while updating VM %s PV drivers" (Printexc.to_string e) id ) info in + (* Chack last_start_time before updating anything in the guest metrics *) + if different (fun x -> x.last_start_time) then begin + try + Opt.iter + (fun (_, state) -> + debug "xenopsd event: Updating VM %s last_start_time <- %s" id (Date.to_string (Date.of_float state.last_start_time)); + let metrics = Db.VM.get_metrics ~__context ~self in + let start_time = Date.of_float state.last_start_time in + Db.VM_metrics.set_start_time ~__context ~self:metrics ~value:start_time; + + create_guest_metrics_if_needed (); + let gm = Db.VM.get_guest_metrics ~__context ~self in + let update_time = Db.VM_guest_metrics.get_last_updated ~__context ~self:gm in + if update_time < start_time then begin + debug "VM %s guest metrics update time (%s) < VM start time (%s): deleting" + id (Date.to_string update_time) (Date.to_string start_time); + Xapi_vm_helpers.delete_guest_metrics ~__context ~self; + check_guest_agent (); + end + ) info + with e -> + error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id + end; Opt.iter (fun (_, state) -> List.iter (fun domid -> + (* Guest metrics could have been destroyed during the last_start_time check + by recreating them, we avoid CA-223387 *) + create_guest_metrics_if_needed (); if different (fun x -> x.uncooperative_balloon_driver) then begin debug "xenopsd event: VM %s domid %d uncooperative_balloon_driver = %b" id domid state.uncooperative_balloon_driver; end; @@ -1526,30 +1571,6 @@ let update_vm ~__context id = error "Caught %s: while updating VM %s VCPUs_number" (Printexc.to_string e) id ) info end; - if different (fun x -> x.last_start_time) then begin - try - Opt.iter - (fun (_, state) -> - debug "xenopsd event: Updating VM %s last_start_time <- %s" id (Date.to_string (Date.of_float state.last_start_time)); - let metrics = Db.VM.get_metrics ~__context ~self in - let start_time = Date.of_float state.last_start_time in - Db.VM_metrics.set_start_time ~__context ~self:metrics ~value:start_time; - begin - try - let gm = Db.VM.get_guest_metrics ~__context ~self in - let update_time = Db.VM_guest_metrics.get_last_updated ~__context ~self:gm in - if update_time < start_time then begin - debug "VM %s guest metrics update time (%s) < VM start time (%s): deleting" - id (Date.to_string update_time) (Date.to_string start_time); - Xapi_vm_helpers.delete_guest_metrics ~__context ~self; - check_guest_agent (); - end - with _ -> () (* The guest metrics didn't exist *) - end - ) info - with e -> - error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id - end; if different (fun x -> x.shadow_multiplier_target) then begin try Opt.iter