Skip to content

Commit

Permalink
Merge pull request #2788 from mseri/create-gm-on-start-refactored
Browse files Browse the repository at this point in the history
Create VM guest metrics on VM start
  • Loading branch information
lindig authored Oct 7, 2016
2 parents cad6ac6 + 9ca1f62 commit 8e3d097
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 99 deletions.
73 changes: 54 additions & 19 deletions ocaml/xapi/xapi_guest_agent.ml
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ let memory_targets : (int, int64) Hashtbl.t = Hashtbl.create 20
let dead_domains : IntSet.t ref = ref IntSet.empty
let mutex = Mutex.create ()

(** Reset all the guest metrics for a particular VM. 'lookup' reads a key from xenstore
and 'list' reads a directory from xenstore. Both are relative to the guest's
domainpath. *)
let all (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid =

(* In the following functions, 'lookup' reads a key from xenstore and 'list' reads
a directory from xenstore. Both are relative to the guest's domainpath. *)
let get_initial_guest_metrics (lookup: string -> string option) (list: string -> string list) =
let all_control = list "control" in
let to_map kvpairs = List.concat (List.map (fun (xskey, mapkey) -> match lookup xskey with
| Some xsval -> [ mapkey, xsval ]
Expand All @@ -156,15 +156,58 @@ let all (lookup: string -> string option) (list: string -> string list) ~__conte
and last_updated = Unix.gettimeofday () in
let can_use_hotplug_vbd = get_tristate "feature/hotplug/vbd" in
let can_use_hotplug_vif = get_tristate "feature/hotplug/vif" in

(* let num = Mutex.execute mutex (fun () -> Hashtbl.fold (fun _ _ c -> 1 + c) cache 0) in
debug "Number of entries in hashtbl: %d" num; *)

(* to avoid breakage whilst 'micro' is added to linux and windows agents, default this field
to -1 if it's not present in xenstore *)
let pv_drivers_version =
if List.mem_assoc "micro" pv_drivers_version then pv_drivers_version (* already there; do nothing *)
else ("micro","-1")::pv_drivers_version in
else ("micro","-1")::pv_drivers_version
in
{pv_drivers_version; os_version; networks; other; memory; device_id; last_updated; can_use_hotplug_vbd; can_use_hotplug_vif;}


let create_and_set_guest_metrics (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid =
let initial_gm = get_initial_guest_metrics lookup list
in
let self = Db.VM.get_by_uuid ~__context ~uuid in
let new_gm_uuid = (Uuid.to_string (Uuid.make_uuid ()))
and new_gm_ref = Ref.make () in
Db.VM_guest_metrics.create ~__context
~ref:new_gm_ref
~uuid:new_gm_uuid
~os_version:initial_gm.os_version
~pV_drivers_version:initial_gm.pv_drivers_version
~pV_drivers_up_to_date:false
~memory:[] ~disks:[]
~networks:initial_gm.networks
~pV_drivers_detected:false
~other:initial_gm.other
~last_updated:(Date.of_float initial_gm.last_updated)
~other_config:[]
~live:true
~can_use_hotplug_vbd:initial_gm.can_use_hotplug_vbd
~can_use_hotplug_vif:initial_gm.can_use_hotplug_vif
;
Db.VM.set_guest_metrics ~__context ~self ~value:new_gm_ref;

(* Update the cache with the new values *)
Mutex.execute mutex (fun () -> Hashtbl.replace cache domid initial_gm);
(* We've just set the thing to live, let's make sure it's not in the dead list *)
Mutex.execute mutex (fun () -> dead_domains := IntSet.remove domid !dead_domains);

let sl xs = String.concat "; " (List.map (fun (k, v) -> k ^ ": " ^ v) xs) in
info "Received initial update from guest agent in VM %s; os_version = [ %s ]; pv_drivers_version = [ %s ]; networks = [ %s ]"
(Ref.string_of self) (sl initial_gm.os_version) (sl initial_gm.pv_drivers_version) (sl initial_gm.networks);
new_gm_ref


(** Reset all the guest metrics for a particular VM *)
let all (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid =
let {pv_drivers_version; os_version; networks; other; memory; device_id;
last_updated; can_use_hotplug_vbd; can_use_hotplug_vif;} = get_initial_guest_metrics lookup list
in

(* let num = Mutex.execute mutex (fun () -> Hashtbl.fold (fun _ _ c -> 1 + c) cache 0) in
debug "Number of entries in hashtbl: %d" num; *)

let self = Db.VM.get_by_uuid ~__context ~uuid in

Expand Down Expand Up @@ -228,16 +271,8 @@ let all (lookup: string -> string option) (list: string -> string list) ~__conte
then existing
else
(* if it doesn't exist, make a fresh one *)
let new_ref = Ref.make () and new_uuid = Uuid.to_string (Uuid.make_uuid ()) in
Db.VM_guest_metrics.create ~__context ~ref:new_ref ~uuid:new_uuid
~os_version:os_version ~pV_drivers_version:pv_drivers_version ~pV_drivers_up_to_date:false ~memory:[] ~disks:[] ~networks:networks ~other:other
~pV_drivers_detected:false ~last_updated:(Date.of_float last_updated) ~other_config:[] ~live:true ~can_use_hotplug_vbd:`unspecified ~can_use_hotplug_vif:`unspecified;
Db.VM.set_guest_metrics ~__context ~self ~value:new_ref;
(* We've just set the thing to live, let's make sure it's not in the dead list *)
let sl xs = String.concat "; " (List.map (fun (k, v) -> k ^ ": " ^ v) xs) in
info "Received initial update from guest agent in VM %s; os_version = [ %s]; pv_drivers_version = [ %s ]; networks = [ %s ]" (Ref.string_of self) (sl os_version) (sl pv_drivers_version) (sl networks);
Mutex.execute mutex (fun () -> dead_domains := IntSet.remove domid !dead_domains);
new_ref in
create_and_set_guest_metrics lookup list ~__context ~domid ~uuid
in

(* We unconditionally reset the database values but observe that the database
checks whether a value has actually changed before doing anything *)
Expand Down
6 changes: 2 additions & 4 deletions ocaml/xapi/xapi_vm.ml
Original file line number Diff line number Diff line change
Expand Up @@ -241,9 +241,7 @@ let start ~__context ~vm ~start_paused ~force =
(* Clear out any VM guest metrics record. Guest metrics will be updated by
* the running VM and for now they might be wrong, especially network
* addresses inherited by a cloned VM. *)
let vm_gm = Db.VM.get_guest_metrics ~__context ~self:vm in
Db.VM.set_guest_metrics ~__context ~self:vm ~value:Ref.null;
(try Db.VM_guest_metrics.destroy ~__context ~self:vm_gm with _ -> ());
Xapi_vm_helpers.delete_guest_metrics ~__context ~self:vm;

(* This makes sense here while the available versions are 0, 1 and 2.
* If/when we introduce another version, we must reassess this. *)
Expand All @@ -253,7 +251,7 @@ let start ~__context ~vm ~start_paused ~force =
Cpuid_helpers.reset_cpu_flags ~__context ~vm;

(* If the VM has any vGPUs, gpumon must remain stopped until the
* VM has started. *)
* VM has started. *)
begin
match vmr.API.vM_VGPUs with
| [] -> Xapi_xenops.start ~__context ~self:vm start_paused force
Expand Down
173 changes: 97 additions & 76 deletions ocaml/xapi/xapi_xenops.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1276,22 +1276,80 @@ let update_vm ~__context id =
let a = Opt.map (fun x -> f (snd x)) info in
let b = Opt.map f previous in
a <> b in
(* Helpers to create and update guest metrics when needed *)
let lookup state key =
if List.mem_assoc key state.guest_agent then Some (List.assoc key state.guest_agent) else None in
let list state dir =
let dir = if dir.[0] = '/' then String.sub dir 1 (String.length dir - 1) else dir in
let results = Listext.List.filter_map (fun (path, value) ->
if String.startswith dir path then begin
let rest = String.sub path (String.length dir) (String.length path - (String.length dir)) in
match List.filter (fun x -> x <> "") (String.split '/' rest) with
| x :: _ -> Some x
| _ -> None
end else None
) state.guest_agent |> Listext.List.setify in
results in
let create_guest_metrics_if_needed () =
let gm = Db.VM.get_guest_metrics ~__context ~self in
if gm = Ref.null then
Opt.iter
(fun (_, state) ->
List.iter
(fun domid ->
try
let new_gm_ref = Xapi_guest_agent.create_and_set_guest_metrics (lookup state) (list state) ~__context ~domid ~uuid:id in
debug "xenopsd event: created guest metrics %s for VM %s" (Ref.string_of new_gm_ref) id
with e ->
error "Caught %s: while creating VM %s guest metrics" (Printexc.to_string e) id
) state.domids
) info in
let check_guest_agent () =
Opt.iter
(fun (_, state) ->
Opt.iter (fun oldstate ->
let old_ga = oldstate.guest_agent in
let new_ga = state.guest_agent in

(* Remove memory keys *)
let ignored_keys = [ "data/meminfo_free"; "data/updated"; "data/update_cnt" ] in
let remove_ignored ga =
List.fold_left (fun acc k -> List.filter (fun x -> fst x <> k) acc) ga ignored_keys in
let old_ga = remove_ignored old_ga in
let new_ga = remove_ignored new_ga in
if new_ga <> old_ga then begin
debug "Will update VM.allowed_operations because guest_agent has changed.";
should_update_allowed_operations := true
end else begin
debug "Supressing VM.allowed_operations update because guest_agent data is largely the same"
end
) previous;
List.iter
(fun domid ->
try
debug "xenopsd event: Updating VM %s domid %d guest_agent" id domid;
Xapi_guest_agent.all (lookup state) (list state) ~__context ~domid ~uuid:id
with e ->
error "Caught %s: while updating VM %s guest_agent" (Printexc.to_string e) id
) state.domids
) info in
(* Notes on error handling: if something fails we log and continue, to
maximise the amount of state which is correctly synced. If something
does fail then we may end up permanently out-of-sync until either a
process restart or an event is generated. We may wish to periodically
inject artificial events IF there has been an event sync failure? *)
maximise the amount of state which is correctly synced. If something
does fail then we may end up permanently out-of-sync until either a
process restart or an event is generated. We may wish to periodically
inject artificial events IF there has been an event sync failure? *)
if different (fun x -> x.power_state) then begin
try
debug "Will update VM.allowed_operations because power_state has changed.";
should_update_allowed_operations := true;
let power_state = xenapi_of_xenops_power_state (Opt.map (fun x -> (snd x).power_state) info) in
debug "xenopsd event: Updating VM %s power_state <- %s" id (Record_util.power_state_to_string power_state);
(* This will mark VBDs, VIFs as detached and clear resident_on
if the VM has permanently shutdown. current-operations
should not be reset as there maybe a checkpoint is ongoing*)
if the VM has permanently shutdown. current-operations
should not be reset as there maybe a checkpoint is ongoing*)
Xapi_vm_lifecycle.force_state_reset_keep_current_operations ~__context ~self ~value:power_state;

if power_state = `Running then create_guest_metrics_if_needed ();
if power_state = `Suspended || power_state = `Halted then begin
Xapi_network.detach_for_vm ~__context ~host:localhost ~vm:self;
Storage_access.reset ~__context ~vm:self;
Expand Down Expand Up @@ -1406,48 +1464,6 @@ let update_vm ~__context id =
with e ->
error "Caught %s: while updating VM %s rtc/timeoffset" (Printexc.to_string e) id
end;
let check_guest_agent () =
Opt.iter
(fun (_, state) ->
Opt.iter (fun oldstate ->
let old_ga = oldstate.guest_agent in
let new_ga = state.guest_agent in

(* Remove memory keys *)
let ignored_keys = [ "data/meminfo_free"; "data/updated"; "data/update_cnt" ] in
let remove_ignored ga =
List.fold_left (fun acc k -> List.filter (fun x -> fst x <> k) acc) ga ignored_keys in
let old_ga = remove_ignored old_ga in
let new_ga = remove_ignored new_ga in
if new_ga <> old_ga then begin
debug "Will update VM.allowed_operations because guest_agent has changed.";
should_update_allowed_operations := true
end else begin
debug "Supressing VM.allowed_operations update because guest_agent data is largely the same"
end
) previous;
List.iter
(fun domid ->
let lookup key =
if List.mem_assoc key state.guest_agent then Some (List.assoc key state.guest_agent) else None in
let list dir =
let dir = if dir.[0] = '/' then String.sub dir 1 (String.length dir - 1) else dir in
let results = Listext.List.filter_map (fun (path, value) ->
if String.startswith dir path then begin
let rest = String.sub path (String.length dir) (String.length path - (String.length dir)) in
match List.filter (fun x -> x <> "") (String.split '/' rest) with
| x :: _ -> Some x
| _ -> None
end else None
) state.guest_agent |> Listext.List.setify in
results in
try
debug "xenopsd event: Updating VM %s domid %d guest_agent" id domid;
Xapi_guest_agent.all lookup list ~__context ~domid ~uuid:id
with e ->
error "Caught %s: while updating VM %s guest_agent" (Printexc.to_string e) id
) state.domids
) info in
if different (fun x -> x.hvm) then begin
Opt.iter
(fun (_, state) ->
Expand Down Expand Up @@ -1484,15 +1500,44 @@ let update_vm ~__context id =
let update_pv_drivers_detected () =
Opt.iter
(fun (_, state) ->
let gm = Db.VM.get_guest_metrics ~__context ~self in
debug "xenopsd event: Updating VM %s PV drivers detected %b" id state.pv_drivers_detected;
Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm ~value:state.pv_drivers_detected;
Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm ~value:state.pv_drivers_detected
try
let gm = Db.VM.get_guest_metrics ~__context ~self in
debug "xenopsd event: Updating VM %s PV drivers detected %b" id state.pv_drivers_detected;
Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm ~value:state.pv_drivers_detected;
Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm ~value:state.pv_drivers_detected
with e ->
debug "Caught %s: while updating VM %s PV drivers" (Printexc.to_string e) id
) info in
(* Chack last_start_time before updating anything in the guest metrics *)
if different (fun x -> x.last_start_time) then begin
try
Opt.iter
(fun (_, state) ->
debug "xenopsd event: Updating VM %s last_start_time <- %s" id (Date.to_string (Date.of_float state.last_start_time));
let metrics = Db.VM.get_metrics ~__context ~self in
let start_time = Date.of_float state.last_start_time in
Db.VM_metrics.set_start_time ~__context ~self:metrics ~value:start_time;

create_guest_metrics_if_needed ();
let gm = Db.VM.get_guest_metrics ~__context ~self in
let update_time = Db.VM_guest_metrics.get_last_updated ~__context ~self:gm in
if update_time < start_time then begin
debug "VM %s guest metrics update time (%s) < VM start time (%s): deleting"
id (Date.to_string update_time) (Date.to_string start_time);
Xapi_vm_helpers.delete_guest_metrics ~__context ~self;
check_guest_agent ();
end
) info
with e ->
error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id
end;
Opt.iter
(fun (_, state) ->
List.iter
(fun domid ->
(* Guest metrics could have been destroyed during the last_start_time check
by recreating them, we avoid CA-223387 *)
create_guest_metrics_if_needed ();
if different (fun x -> x.uncooperative_balloon_driver) then begin
debug "xenopsd event: VM %s domid %d uncooperative_balloon_driver = %b" id domid state.uncooperative_balloon_driver;
end;
Expand Down Expand Up @@ -1526,30 +1571,6 @@ let update_vm ~__context id =
error "Caught %s: while updating VM %s VCPUs_number" (Printexc.to_string e) id
) info
end;
if different (fun x -> x.last_start_time) then begin
try
Opt.iter
(fun (_, state) ->
debug "xenopsd event: Updating VM %s last_start_time <- %s" id (Date.to_string (Date.of_float state.last_start_time));
let metrics = Db.VM.get_metrics ~__context ~self in
let start_time = Date.of_float state.last_start_time in
Db.VM_metrics.set_start_time ~__context ~self:metrics ~value:start_time;
begin
try
let gm = Db.VM.get_guest_metrics ~__context ~self in
let update_time = Db.VM_guest_metrics.get_last_updated ~__context ~self:gm in
if update_time < start_time then begin
debug "VM %s guest metrics update time (%s) < VM start time (%s): deleting"
id (Date.to_string update_time) (Date.to_string start_time);
Xapi_vm_helpers.delete_guest_metrics ~__context ~self;
check_guest_agent ();
end
with _ -> () (* The guest metrics didn't exist *)
end
) info
with e ->
error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id
end;
if different (fun x -> x.shadow_multiplier_target) then begin
try
Opt.iter
Expand Down

0 comments on commit 8e3d097

Please sign in to comment.