From 20eb8b93daed46f6399cfe29437cf239ac095f16 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Sat, 1 Feb 2025 09:55:17 +0000 Subject: [PATCH] DAOS-16312 control: Always use --force for dmg system stop Allow-unstable-test: true Features: control Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/system.go | 12 +++++++++++- src/control/cmd/dmg/system_test.go | 26 ++++++++++++++++++++++++++ src/control/lib/control/system.go | 13 ++++++++++++- src/control/lib/control/system_test.go | 21 +++++++++++++++++++++ src/control/server/mgmt_system.go | 12 +++++++++--- src/control/server/mgmt_system_test.go | 26 +++++++++++++++++++------- src/tests/ftest/control/log_entry.py | 3 ++- utils/node_local_test.py | 4 ++-- 8 files changed, 102 insertions(+), 15 deletions(-) diff --git a/src/control/cmd/dmg/system.go b/src/control/cmd/dmg/system.go index 376b1c4e5f8..f1f782d7de9 100644 --- a/src/control/cmd/dmg/system.go +++ b/src/control/cmd/dmg/system.go @@ -172,6 +172,7 @@ type systemStopCmd struct { cmdutil.JSONOutputCmd rankListCmd Force bool `long:"force" description:"Force stop DAOS system members"` + Full bool `long:"full" description:"Attempt a graceful shutdown of DAOS system. Experimental and not for use in production environments"` } // Execute is run when systemStopCmd activates. @@ -181,11 +182,20 @@ func (cmd *systemStopCmd) Execute(_ []string) (errOut error) { defer func() { errOut = errors.Wrap(errOut, "system stop failed") }() + if cmd.Force && cmd.Full { + return errIncompatFlags("force", "full") + } + if cmd.Full && !cmd.Hosts.Empty() { + return errIncompatFlags("full", "rank-hosts") + } + if cmd.Full && !cmd.Ranks.Empty() { + return errIncompatFlags("full", "ranks") + } if err := cmd.validateHostsRanks(); err != nil { return err } - req := &control.SystemStopReq{Force: cmd.Force} + req := &control.SystemStopReq{Force: cmd.Force, Full: cmd.Full} req.Hosts.Replace(&cmd.Hosts.HostSet) req.Ranks.Replace(&cmd.Ranks.RankSet) diff --git a/src/control/cmd/dmg/system_test.go b/src/control/cmd/dmg/system_test.go index be8b3ab6dd2..2064bc0e4d0 100644 --- a/src/control/cmd/dmg/system_test.go +++ b/src/control/cmd/dmg/system_test.go @@ -192,6 +192,32 @@ func TestDmg_SystemCommands(t *testing.T) { "", errors.New("--ranks and --rank-hosts options cannot be set together"), }, + { + "system stop with full option", + "system stop --full", + strings.Join([]string{ + printRequest(t, &control.SystemStopReq{Full: true}), + }, " "), + nil, + }, + { + "system stop with full and force options", + "system stop --full --force", + "", + errors.New(`may not be mixed`), + }, + { + "system stop with full and rank-hosts options", + "system stop --full --rank-hosts foo-[0-2]", + "", + errors.New(`may not be mixed`), + }, + { + "system stop with full and ranks options", + "system stop --full --ranks 0-2", + "", + errors.New(`may not be mixed`), + }, { "system start with no arguments", "system start", diff --git a/src/control/lib/control/system.go b/src/control/lib/control/system.go index 73afa873c9f..f33953f23c7 100644 --- a/src/control/lib/control/system.go +++ b/src/control/lib/control/system.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -406,6 +407,7 @@ type SystemStopReq struct { msRequest sysRequest Force bool + Full bool } // SystemStopResp contains the request response. @@ -451,11 +453,20 @@ func SystemStop(ctx context.Context, rpcClient UnaryInvoker, req *SystemStopReq) if req == nil { return nil, errors.Errorf("nil %T request", req) } + if req.Force && req.Full { + return nil, errors.New("force and full options may not be mixed") + } + if req.Full && req.Hosts.String() != "" { + return nil, errors.New("full and hosts options may not be mixed") + } + if req.Full && req.Ranks.String() != "" { + return nil, errors.New("full and ranks options may not be mixed") + } pbReq := new(mgmtpb.SystemStopReq) pbReq.Hosts = req.Hosts.String() pbReq.Ranks = req.Ranks.String() - pbReq.Force = req.Force + pbReq.Force = !req.Full // Force used unless full graceful shutdown requested. pbReq.Sys = req.getSystem(rpcClient) req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) { diff --git a/src/control/lib/control/system_test.go b/src/control/lib/control/system_test.go index 3a85eaa1d1a..f01d02dbb8a 100644 --- a/src/control/lib/control/system_test.go +++ b/src/control/lib/control/system_test.go @@ -849,6 +849,12 @@ func TestControl_SystemStop(t *testing.T) { testRespRS := new(SystemStopResp) testRespRS.AbsentRanks.Replace(testRS) + withFull := func(stopReq *SystemStopReq) *SystemStopReq { + nr := *stopReq + nr.Full = true + return &nr + } + for name, tc := range map[string]struct { req *SystemStopReq uErr error @@ -886,6 +892,21 @@ func TestControl_SystemStop(t *testing.T) { }), expResp: testRespRS, }, + "request force and full options": { + req: &SystemStopReq{ + Force: true, + Full: true, + }, + expErr: errors.New("may not be mixed"), + }, + "request full and host set options": { + req: withFull(testReqHS), + expErr: errors.New("may not be mixed"), + }, + "request full and rank set options": { + req: withFull(testReqRS), + expErr: errors.New("may not be mixed"), + }, "dual host dual rank": { req: new(SystemStopReq), uResp: MockMSResponse("10.0.0.1:10001", nil, diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index 91d51e17690..76db0629d19 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -45,6 +45,8 @@ import ( const fabricProviderProp = "fabric_providers" const groupUpdatePauseProp = "group_update_paused" +var errSysForceNotFull = errors.New("force must be used if not full system stop") + // GetAttachInfo handles a request to retrieve a map of ranks to fabric URIs, in addition // to client network autoconfiguration hints. // @@ -832,9 +834,13 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) ( return nil, err } - // First phase: Prepare the ranks for shutdown, but only if the request - // is for an unforced full system stop. - if fReq.FullSystem && !fReq.Force { + // First phase: Prepare the ranks for shutdown, but only if the request is for an unforced + // full system stop. + if !fReq.Force { + if !fReq.FullSystem { + return nil, errSysForceNotFull + } + fReq.Method = control.PrepShutdownRanks fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true) if err != nil { diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go index 0cffe7f63d8..b11f47861cb 100644 --- a/src/control/server/mgmt_system_test.go +++ b/src/control/server/mgmt_system_test.go @@ -1563,9 +1563,16 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) { expInvokeCount: 2, }, "stop some ranks": { - req: &mgmtpb.SystemStopReq{Ranks: "0,1"}, - mResps: [][]*control.HostResponse{{hr(1, mockRankSuccess("stop", 0), mockRankSuccess("stop", 1))}}, - expResults: []*sharedpb.RankResult{mockRankSuccess("stop", 0, 1), mockRankSuccess("stop", 1, 1)}, + req: &mgmtpb.SystemStopReq{Ranks: "0,1", Force: true}, + mResps: [][]*control.HostResponse{ + { + hr(1, mockRankSuccess("stop", 0), mockRankSuccess("stop", 1)), + }, + }, + expResults: []*sharedpb.RankResult{ + mockRankSuccess("stop", 0, 1), + mockRankSuccess("stop", 1, 1), + }, expMembers: func() system.Members { return system.Members{ mockMember(t, 0, 1, "stopped"), @@ -1575,9 +1582,9 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) { }, expInvokeCount: 1, // prep should not be called }, - "stop with all ranks (same as full system stop)": { - req: &mgmtpb.SystemStopReq{Ranks: "0,1,3"}, - mResps: hostRespSuccess, + "stop with all ranks": { + req: &mgmtpb.SystemStopReq{Ranks: "0,1,3", Force: true}, + mResps: hostRespStopSuccess, expResults: rankResStopSuccess, expMembers: func() system.Members { return system.Members{ @@ -1586,7 +1593,7 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) { mockMember(t, 3, 2, "stopped"), } }, - expInvokeCount: 2, // prep should be called + expInvokeCount: 1, // prep should not be called }, "full system stop": { req: &mgmtpb.SystemStopReq{}, @@ -1601,6 +1608,11 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) { }, expInvokeCount: 2, // prep should be called }, + "full system stop; partial ranks in req": { + req: &mgmtpb.SystemStopReq{Ranks: "0,1"}, + mResps: hostRespStopSuccess, + expAPIErr: errSysForceNotFull, + }, "full system stop (forced)": { req: &mgmtpb.SystemStopReq{Force: true}, mResps: hostRespStopSuccess, diff --git a/src/tests/ftest/control/log_entry.py b/src/tests/ftest/control/log_entry.py index c231ef750ec..ce4a83a47c8 100644 --- a/src/tests/ftest/control/log_entry.py +++ b/src/tests/ftest/control/log_entry.py @@ -1,5 +1,6 @@ """ (C) Copyright 2023 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -171,7 +172,7 @@ def test_control_log_entry(self): self.log_step('Stop/start 2 random ranks') stop_ranks = self.random.sample(list(self.server_managers[0].ranks), k=2) - expected = [fr'rank {rank}.*exited with 0' for rank in stop_ranks] \ + expected = [fr'rank {rank}.*killed' for rank in stop_ranks] \ + [fr'process.*started on rank {rank}' for rank in stop_ranks] with self.verify_journalctl(expected): self.server_managers[0].stop_ranks(stop_ranks, self.d_log) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 9dbfa385564..d02cc710ed8 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -908,7 +908,7 @@ def _stop(self, wf): self.conf.wf.issues.append(entry) self._add_test_case('server_stop', failure=message) start = time.perf_counter() - rc = self.run_dmg(['system', 'stop']) + rc = self.run_dmg(['system', 'stop', '--full']) if rc.returncode != 0: print(rc) entry = {} @@ -916,7 +916,7 @@ def _stop(self, wf): # pylint: disable=protected-access entry['lineStart'] = sys._getframe().f_lineno entry['severity'] = 'ERROR' - msg = f'dmg system stop failed with {rc.returncode}' + msg = f'dmg system stop --full failed with {rc.returncode}' entry['message'] = msg self.conf.wf.issues.append(entry) if not self.valgrind: