Skip to content

Commit

Permalink
DAOS-16312 control: Always use --force for dmg system stop
Browse files Browse the repository at this point in the history
Allow-unstable-test: true
Features: control
Signed-off-by: Tom Nabarro <[email protected]>
  • Loading branch information
tanabarr committed Jan 29, 2025
1 parent 657ae8a commit 649a314
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 13 deletions.
18 changes: 10 additions & 8 deletions src/control/server/ctl_ranks_rpc.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -8,10 +9,10 @@ package server

import (
"context"
"syscall"
"time"

"github.com/pkg/errors"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"

"github.com/daos-stack/daos/src/control/common/proto/convert"
Expand Down Expand Up @@ -167,10 +168,11 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) (
return nil, errors.New("no ranks specified in request")
}

signal := syscall.SIGINT
if req.Force {
signal = syscall.SIGKILL
}
// DAOS-16312 NOTE: SIGINT or SIGTERM are more traditional signals to use to terminate *nix
// processes as they allow the processes to perform cleanup tasks before
// shutdown. SIGKILL is now being used to avoid potential data loss issues
// related to problems in clean shutdown of engines. The rationale maybe
// similar to how STONITH is used in high-availability systems.

instances, err := svc.harness.FilterInstancesByRankSet(req.GetRanks())
if err != nil {
Expand All @@ -185,8 +187,8 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) (
if !ei.IsStarted() {
continue
}
if err := ei.Stop(signal); err != nil {
return nil, errors.Wrapf(err, "sending %s", signal)
if err := ei.Stop(unix.SIGKILL); err != nil {
return nil, errors.Wrapf(err, "sending kill signal")
}
}

Expand Down
5 changes: 3 additions & 2 deletions src/control/server/ctl_ranks_rpc_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -279,7 +280,7 @@ func TestServer_CtlSvc_StopRanks(t *testing.T) {
},
"instances successfully stopped": {
req: &ctlpb.RanksReq{Ranks: "0-3"},
expSignalsSent: map[uint32]os.Signal{0: syscall.SIGINT, 1: syscall.SIGINT},
expSignalsSent: map[uint32]os.Signal{0: syscall.SIGKILL, 1: syscall.SIGKILL},
expResults: []*sharedpb.RankResult{
{Rank: 1, State: msStopped},
{Rank: 2, State: msStopped},
Expand All @@ -296,7 +297,7 @@ func TestServer_CtlSvc_StopRanks(t *testing.T) {
"instances not stopped in time": {
req: &ctlpb.RanksReq{Ranks: "0-3"},
timeout: time.Second,
expSignalsSent: map[uint32]os.Signal{0: syscall.SIGINT, 1: syscall.SIGINT},
expSignalsSent: map[uint32]os.Signal{0: syscall.SIGKILL, 1: syscall.SIGKILL},
instancesDontStop: true,
expErr: errors.New("deadline exceeded"),
},
Expand Down
4 changes: 2 additions & 2 deletions src/control/server/mgmt_system.go
Original file line number Diff line number Diff line change
Expand Up @@ -832,8 +832,8 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) (
return nil, err
}

// First phase: Prepare the ranks for shutdown, but only if the request
// is for an unforced full system stop.
// First phase: Prepare the ranks for shutdown, but only if the request is for an unforced
// full system stop.
if fReq.FullSystem && !fReq.Force {
fReq.Method = control.PrepShutdownRanks
fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true)
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/control/log_entry.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
(C) Copyright 2023 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -171,7 +172,7 @@ def test_control_log_entry(self):

self.log_step('Stop/start 2 random ranks')
stop_ranks = self.random.sample(list(self.server_managers[0].ranks), k=2)
expected = [fr'rank {rank}.*exited with 0' for rank in stop_ranks] \
expected = [fr'rank {rank}.*killed' for rank in stop_ranks] \
+ [fr'process.*started on rank {rank}' for rank in stop_ranks]
with self.verify_journalctl(expected):
self.server_managers[0].stop_ranks(stop_ranks, self.d_log)
Expand Down

0 comments on commit 649a314

Please sign in to comment.