Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16312 control: Always use --force for dmg system stop #15811

Open
wants to merge 1 commit into
base: release/2.6
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/control/cmd/dmg/system.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2023 Intel Corporation.

Check failure on line 2 in src/control/cmd/dmg/system.go

View workflow job for this annotation

GitHub Actions / Copyright check

Copyright out of date
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -172,6 +172,7 @@
cmdutil.JSONOutputCmd
rankListCmd
Force bool `long:"force" description:"Force stop DAOS system members"`
Full bool `long:"full" description:"Attempt a graceful shutdown of DAOS system. Experimental and not for use in production environments"`
}

// Execute is run when systemStopCmd activates.
Expand All @@ -181,11 +182,20 @@
defer func() {
errOut = errors.Wrap(errOut, "system stop failed")
}()
if cmd.Force && cmd.Full {
return errIncompatFlags("force", "full")
}
if cmd.Full && !cmd.Hosts.Empty() {
return errIncompatFlags("full", "rank-hosts")
}
if cmd.Full && !cmd.Ranks.Empty() {
return errIncompatFlags("full", "ranks")
}

if err := cmd.validateHostsRanks(); err != nil {
return err
}
req := &control.SystemStopReq{Force: cmd.Force}
req := &control.SystemStopReq{Force: cmd.Force, Full: cmd.Full}
req.Hosts.Replace(&cmd.Hosts.HostSet)
req.Ranks.Replace(&cmd.Ranks.RankSet)

Expand Down
26 changes: 26 additions & 0 deletions src/control/cmd/dmg/system_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2023 Intel Corporation.

Check failure on line 2 in src/control/cmd/dmg/system_test.go

View workflow job for this annotation

GitHub Actions / Copyright check

Copyright out of date
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -192,6 +192,32 @@
"",
errors.New("--ranks and --rank-hosts options cannot be set together"),
},
{
"system stop with full option",
"system stop --full",
strings.Join([]string{
printRequest(t, &control.SystemStopReq{Full: true}),
}, " "),
nil,
},
{
"system stop with full and force options",
"system stop --full --force",
"",
errors.New(`may not be mixed`),
},
{
"system stop with full and rank-hosts options",
"system stop --full --rank-hosts foo-[0-2]",
"",
errors.New(`may not be mixed`),
},
{
"system stop with full and ranks options",
"system stop --full --ranks 0-2",
"",
errors.New(`may not be mixed`),
},
{
"system start with no arguments",
"system start",
Expand Down
13 changes: 12 additions & 1 deletion src/control/lib/control/system.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -406,6 +407,7 @@ type SystemStopReq struct {
msRequest
sysRequest
Force bool
Full bool
}

// SystemStopResp contains the request response.
Expand Down Expand Up @@ -451,11 +453,20 @@ func SystemStop(ctx context.Context, rpcClient UnaryInvoker, req *SystemStopReq)
if req == nil {
return nil, errors.Errorf("nil %T request", req)
}
if req.Force && req.Full {
return nil, errors.New("force and full options may not be mixed")
}
if req.Full && req.Hosts.String() != "" {
return nil, errors.New("full and hosts options may not be mixed")
}
if req.Full && req.Ranks.String() != "" {
return nil, errors.New("full and ranks options may not be mixed")
}

pbReq := new(mgmtpb.SystemStopReq)
pbReq.Hosts = req.Hosts.String()
pbReq.Ranks = req.Ranks.String()
pbReq.Force = req.Force
pbReq.Force = !req.Full // Force used unless full graceful shutdown requested.
pbReq.Sys = req.getSystem(rpcClient)

req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) {
Expand Down
21 changes: 21 additions & 0 deletions src/control/lib/control/system_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.

Check failure on line 2 in src/control/lib/control/system_test.go

View workflow job for this annotation

GitHub Actions / Copyright check

Copyright out of date
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -849,6 +849,12 @@
testRespRS := new(SystemStopResp)
testRespRS.AbsentRanks.Replace(testRS)

withFull := func(stopReq *SystemStopReq) *SystemStopReq {
nr := *stopReq
nr.Full = true
return &nr
}

for name, tc := range map[string]struct {
req *SystemStopReq
uErr error
Expand Down Expand Up @@ -886,6 +892,21 @@
}),
expResp: testRespRS,
},
"request force and full options": {
req: &SystemStopReq{
Force: true,
Full: true,
},
expErr: errors.New("may not be mixed"),
},
"request full and host set options": {
req: withFull(testReqHS),
expErr: errors.New("may not be mixed"),
},
"request full and rank set options": {
req: withFull(testReqRS),
expErr: errors.New("may not be mixed"),
},
"dual host dual rank": {
req: new(SystemStopReq),
uResp: MockMSResponse("10.0.0.1:10001", nil,
Expand Down
12 changes: 9 additions & 3 deletions src/control/server/mgmt_system.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ import (
const fabricProviderProp = "fabric_providers"
const groupUpdatePauseProp = "group_update_paused"

var errSysForceNotFull = errors.New("force must be used if not full system stop")

// GetAttachInfo handles a request to retrieve a map of ranks to fabric URIs, in addition
// to client network autoconfiguration hints.
//
Expand Down Expand Up @@ -832,9 +834,13 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) (
return nil, err
}

// First phase: Prepare the ranks for shutdown, but only if the request
// is for an unforced full system stop.
if fReq.FullSystem && !fReq.Force {
// First phase: Prepare the ranks for shutdown, but only if the request is for an unforced
// full system stop.
if !fReq.Force {
if !fReq.FullSystem {
return nil, errSysForceNotFull
}

fReq.Method = control.PrepShutdownRanks
fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true)
if err != nil {
Expand Down
26 changes: 19 additions & 7 deletions src/control/server/mgmt_system_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2024 Intel Corporation.

Check failure on line 2 in src/control/server/mgmt_system_test.go

View workflow job for this annotation

GitHub Actions / Copyright check

Copyright out of date
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -1563,9 +1563,16 @@
expInvokeCount: 2,
},
"stop some ranks": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1"},
mResps: [][]*control.HostResponse{{hr(1, mockRankSuccess("stop", 0), mockRankSuccess("stop", 1))}},
expResults: []*sharedpb.RankResult{mockRankSuccess("stop", 0, 1), mockRankSuccess("stop", 1, 1)},
req: &mgmtpb.SystemStopReq{Ranks: "0,1", Force: true},
mResps: [][]*control.HostResponse{
{
hr(1, mockRankSuccess("stop", 0), mockRankSuccess("stop", 1)),
},
},
expResults: []*sharedpb.RankResult{
mockRankSuccess("stop", 0, 1),
mockRankSuccess("stop", 1, 1),
},
expMembers: func() system.Members {
return system.Members{
mockMember(t, 0, 1, "stopped"),
Expand All @@ -1575,9 +1582,9 @@
},
expInvokeCount: 1, // prep should not be called
},
"stop with all ranks (same as full system stop)": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1,3"},
mResps: hostRespSuccess,
"stop with all ranks": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1,3", Force: true},
mResps: hostRespStopSuccess,
expResults: rankResStopSuccess,
expMembers: func() system.Members {
return system.Members{
Expand All @@ -1586,7 +1593,7 @@
mockMember(t, 3, 2, "stopped"),
}
},
expInvokeCount: 2, // prep should be called
expInvokeCount: 1, // prep should not be called
},
"full system stop": {
req: &mgmtpb.SystemStopReq{},
Expand All @@ -1601,6 +1608,11 @@
},
expInvokeCount: 2, // prep should be called
},
"full system stop; partial ranks in req": {
req: &mgmtpb.SystemStopReq{Ranks: "0,1"},
mResps: hostRespStopSuccess,
expAPIErr: errSysForceNotFull,
},
"full system stop (forced)": {
req: &mgmtpb.SystemStopReq{Force: true},
mResps: hostRespStopSuccess,
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/control/log_entry.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
(C) Copyright 2023 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -171,7 +172,7 @@ def test_control_log_entry(self):

self.log_step('Stop/start 2 random ranks')
stop_ranks = self.random.sample(list(self.server_managers[0].ranks), k=2)
expected = [fr'rank {rank}.*exited with 0' for rank in stop_ranks] \
expected = [fr'rank {rank}.*killed' for rank in stop_ranks] \
+ [fr'process.*started on rank {rank}' for rank in stop_ranks]
with self.verify_journalctl(expected):
self.server_managers[0].stop_ranks(stop_ranks, self.d_log)
Expand Down
4 changes: 2 additions & 2 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,15 +908,15 @@ def _stop(self, wf):
self.conf.wf.issues.append(entry)
self._add_test_case('server_stop', failure=message)
start = time.perf_counter()
rc = self.run_dmg(['system', 'stop'])
rc = self.run_dmg(['system', 'stop', '--full'])
if rc.returncode != 0:
print(rc)
entry = {}
entry['fileName'] = self._file
# pylint: disable=protected-access
entry['lineStart'] = sys._getframe().f_lineno
entry['severity'] = 'ERROR'
msg = f'dmg system stop failed with {rc.returncode}'
msg = f'dmg system stop --full failed with {rc.returncode}'
entry['message'] = msg
self.conf.wf.issues.append(entry)
if not self.valgrind:
Expand Down
Loading