Skip to content

Commit

Permalink
DAOS-16312 control: Always use --force for dmg system stop
Browse files Browse the repository at this point in the history
Allow-unstable-test: true
Features: control
Signed-off-by: Tom Nabarro <[email protected]>
  • Loading branch information
tanabarr committed Feb 1, 2025
1 parent 46e04b1 commit 20eb8b9
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 15 deletions.
12 changes: 11 additions & 1 deletion src/control/cmd/dmg/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ type systemStopCmd struct {
cmdutil.JSONOutputCmd
rankListCmd
Force bool `long:"force" description:"Force stop DAOS system members"`
Full bool `long:"full" description:"Attempt a graceful shutdown of DAOS system. Experimental and not for use in production environments"`
}

// Execute is run when systemStopCmd activates.
Expand All @@ -181,11 +182,20 @@ func (cmd *systemStopCmd) Execute(_ []string) (errOut error) {
defer func() {
errOut = errors.Wrap(errOut, "system stop failed")
}()
if cmd.Force && cmd.Full {
return errIncompatFlags("force", "full")
}
if cmd.Full && !cmd.Hosts.Empty() {
return errIncompatFlags("full", "rank-hosts")
}
if cmd.Full && !cmd.Ranks.Empty() {
return errIncompatFlags("full", "ranks")
}

if err := cmd.validateHostsRanks(); err != nil {
return err
}
req := &control.SystemStopReq{Force: cmd.Force}
req := &control.SystemStopReq{Force: cmd.Force, Full: cmd.Full}
req.Hosts.Replace(&cmd.Hosts.HostSet)
req.Ranks.Replace(&cmd.Ranks.RankSet)

Expand Down
26 changes: 26 additions & 0 deletions src/control/cmd/dmg/system_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,32 @@ func TestDmg_SystemCommands(t *testing.T) {
"",
errors.New("--ranks and --rank-hosts options cannot be set together"),
},
{
"system stop with full option",
"system stop --full",
strings.Join([]string{
printRequest(t, &control.SystemStopReq{Full: true}),
}, " "),
nil,
},
{
"system stop with full and force options",
"system stop --full --force",
"",
errors.New(`may not be mixed`),
},
{
"system stop with full and rank-hosts options",
"system stop --full --rank-hosts foo-[0-2]",
"",
errors.New(`may not be mixed`),
},
{
"system stop with full and ranks options",
"system stop --full --ranks 0-2",
"",
errors.New(`may not be mixed`),
},
{
"system start with no arguments",
"system start",
Expand Down
13 changes: 12 additions & 1 deletion src/control/lib/control/system.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -406,6 +407,7 @@ type SystemStopReq struct {
msRequest
sysRequest
Force bool
Full bool
}

// SystemStopResp contains the request response.
Expand Down Expand Up @@ -451,11 +453,20 @@ func SystemStop(ctx context.Context, rpcClient UnaryInvoker, req *SystemStopReq)
if req == nil {
return nil, errors.Errorf("nil %T request", req)
}
if req.Force && req.Full {
return nil, errors.New("force and full options may not be mixed")
}
if req.Full && req.Hosts.String() != "" {
return nil, errors.New("full and hosts options may not be mixed")
}
if req.Full && req.Ranks.String() != "" {
return nil, errors.New("full and ranks options may not be mixed")
}

pbReq := new(mgmtpb.SystemStopReq)
pbReq.Hosts = req.Hosts.String()
pbReq.Ranks = req.Ranks.String()
pbReq.Force = req.Force
pbReq.Force = !req.Full // Force used unless full graceful shutdown requested.
pbReq.Sys = req.getSystem(rpcClient)

req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) {
Expand Down
21 changes: 21 additions & 0 deletions src/control/lib/control/system_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,12 @@ func TestControl_SystemStop(t *testing.T) {
testRespRS := new(SystemStopResp)
testRespRS.AbsentRanks.Replace(testRS)

withFull := func(stopReq *SystemStopReq) *SystemStopReq {
nr := *stopReq
nr.Full = true
return &nr
}

for name, tc := range map[string]struct {
req *SystemStopReq
uErr error
Expand Down Expand Up @@ -886,6 +892,21 @@ func TestControl_SystemStop(t *testing.T) {
}),
expResp: testRespRS,
},
"request force and full options": {
req: &SystemStopReq{
Force: true,
Full: true,
},
expErr: errors.New("may not be mixed"),
},
"request full and host set options": {
req: withFull(testReqHS),
expErr: errors.New("may not be mixed"),
},
"request full and rank set options": {
req: withFull(testReqRS),
expErr: errors.New("may not be mixed"),
},
"dual host dual rank": {
req: new(SystemStopReq),
uResp: MockMSResponse("10.0.0.1:10001", nil,
Expand Down
12 changes: 9 additions & 3 deletions src/control/server/mgmt_system.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ import (
const fabricProviderProp = "fabric_providers"
const groupUpdatePauseProp = "group_update_paused"

var errSysForceNotFull = errors.New("force must be used if not full system stop")

// GetAttachInfo handles a request to retrieve a map of ranks to fabric URIs, in addition
// to client network autoconfiguration hints.
//
Expand Down Expand Up @@ -832,9 +834,13 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) (
return nil, err
}

// First phase: Prepare the ranks for shutdown, but only if the request
// is for an unforced full system stop.
if fReq.FullSystem && !fReq.Force {
// First phase: Prepare the ranks for shutdown, but only if the request is for an unforced
// full system stop.
if !fReq.Force {
if !fReq.FullSystem {
return nil, errSysForceNotFull
}

fReq.Method = control.PrepShutdownRanks
fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true)
if err != nil {
Expand Down
26 changes: 19 additions & 7 deletions src/control/server/mgmt_system_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1563,9 +1563,16 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) {
expInvokeCount: 2,
},
"stop some ranks": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1"},
mResps: [][]*control.HostResponse{{hr(1, mockRankSuccess("stop", 0), mockRankSuccess("stop", 1))}},
expResults: []*sharedpb.RankResult{mockRankSuccess("stop", 0, 1), mockRankSuccess("stop", 1, 1)},
req: &mgmtpb.SystemStopReq{Ranks: "0,1", Force: true},
mResps: [][]*control.HostResponse{
{
hr(1, mockRankSuccess("stop", 0), mockRankSuccess("stop", 1)),
},
},
expResults: []*sharedpb.RankResult{
mockRankSuccess("stop", 0, 1),
mockRankSuccess("stop", 1, 1),
},
expMembers: func() system.Members {
return system.Members{
mockMember(t, 0, 1, "stopped"),
Expand All @@ -1575,9 +1582,9 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) {
},
expInvokeCount: 1, // prep should not be called
},
"stop with all ranks (same as full system stop)": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1,3"},
mResps: hostRespSuccess,
"stop with all ranks": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1,3", Force: true},
mResps: hostRespStopSuccess,
expResults: rankResStopSuccess,
expMembers: func() system.Members {
return system.Members{
Expand All @@ -1586,7 +1593,7 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) {
mockMember(t, 3, 2, "stopped"),
}
},
expInvokeCount: 2, // prep should be called
expInvokeCount: 1, // prep should not be called
},
"full system stop": {
req: &mgmtpb.SystemStopReq{},
Expand All @@ -1601,6 +1608,11 @@ func TestServer_MgmtSvc_SystemStop(t *testing.T) {
},
expInvokeCount: 2, // prep should be called
},
"full system stop; partial ranks in req": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1"},
mResps: hostRespStopSuccess,
expAPIErr: errSysForceNotFull,
},
"full system stop (forced)": {
req: &mgmtpb.SystemStopReq{Force: true},
mResps: hostRespStopSuccess,
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/control/log_entry.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
(C) Copyright 2023 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -171,7 +172,7 @@ def test_control_log_entry(self):

self.log_step('Stop/start 2 random ranks')
stop_ranks = self.random.sample(list(self.server_managers[0].ranks), k=2)
expected = [fr'rank {rank}.*exited with 0' for rank in stop_ranks] \
expected = [fr'rank {rank}.*killed' for rank in stop_ranks] \
+ [fr'process.*started on rank {rank}' for rank in stop_ranks]
with self.verify_journalctl(expected):
self.server_managers[0].stop_ranks(stop_ranks, self.d_log)
Expand Down
4 changes: 2 additions & 2 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,15 +908,15 @@ def _stop(self, wf):
self.conf.wf.issues.append(entry)
self._add_test_case('server_stop', failure=message)
start = time.perf_counter()
rc = self.run_dmg(['system', 'stop'])
rc = self.run_dmg(['system', 'stop', '--full'])
if rc.returncode != 0:
print(rc)
entry = {}
entry['fileName'] = self._file
# pylint: disable=protected-access
entry['lineStart'] = sys._getframe().f_lineno
entry['severity'] = 'ERROR'
msg = f'dmg system stop failed with {rc.returncode}'
msg = f'dmg system stop --full failed with {rc.returncode}'
entry['message'] = msg
self.conf.wf.issues.append(entry)
if not self.valgrind:
Expand Down

0 comments on commit 20eb8b9

Please sign in to comment.