diff --git a/src/control/server/ctl_ranks_rpc.go b/src/control/server/ctl_ranks_rpc.go index 7006eec4647f..04846ebf11dd 100644 --- a/src/control/server/ctl_ranks_rpc.go +++ b/src/control/server/ctl_ranks_rpc.go @@ -1,5 +1,6 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,10 +9,10 @@ package server import ( "context" - "syscall" "time" "github.com/pkg/errors" + "golang.org/x/sys/unix" "google.golang.org/protobuf/proto" "github.com/daos-stack/daos/src/control/common/proto/convert" @@ -167,10 +168,11 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) ( return nil, errors.New("no ranks specified in request") } - signal := syscall.SIGINT - if req.Force { - signal = syscall.SIGKILL - } + // DAOS-16312 NOTE: SIGINT or SIGTERM are more traditional signals to use to terminate *nix + // processes as they allow the processes to perform cleanup tasks before + // shutdown. SIGKILL is now being used to avoid potential data loss issues + // related to problems in clean shutdown of engines. The rationale may be + // similar to how STONITH is used in high-availability systems. 
instances, err := svc.harness.FilterInstancesByRankSet(req.GetRanks()) if err != nil { @@ -185,8 +187,8 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) ( if !ei.IsStarted() { continue } - if err := ei.Stop(signal); err != nil { - return nil, errors.Wrapf(err, "sending %s", signal) + if err := ei.Stop(unix.SIGKILL); err != nil { + return nil, errors.Wrapf(err, "sending kill signal") } } diff --git a/src/control/server/ctl_ranks_rpc_test.go b/src/control/server/ctl_ranks_rpc_test.go index 9355a04bce8b..bc2f57806fe9 100644 --- a/src/control/server/ctl_ranks_rpc_test.go +++ b/src/control/server/ctl_ranks_rpc_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -279,7 +280,7 @@ func TestServer_CtlSvc_StopRanks(t *testing.T) { }, "instances successfully stopped": { req: &ctlpb.RanksReq{Ranks: "0-3"}, - expSignalsSent: map[uint32]os.Signal{0: syscall.SIGINT, 1: syscall.SIGINT}, + expSignalsSent: map[uint32]os.Signal{0: syscall.SIGKILL, 1: syscall.SIGKILL}, expResults: []*sharedpb.RankResult{ {Rank: 1, State: msStopped}, {Rank: 2, State: msStopped}, @@ -296,7 +297,7 @@ func TestServer_CtlSvc_StopRanks(t *testing.T) { "instances not stopped in time": { req: &ctlpb.RanksReq{Ranks: "0-3"}, timeout: time.Second, - expSignalsSent: map[uint32]os.Signal{0: syscall.SIGINT, 1: syscall.SIGINT}, + expSignalsSent: map[uint32]os.Signal{0: syscall.SIGKILL, 1: syscall.SIGKILL}, instancesDontStop: true, expErr: errors.New("deadline exceeded"), }, diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index 91d51e176900..cc71a1905db6 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -832,8 +832,8 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) ( return nil, err } - // First phase: Prepare the ranks for shutdown, but 
only if the request - // is for an unforced full system stop. + // First phase: Prepare the ranks for shutdown, but only if the request is for an unforced + // full system stop. if fReq.FullSystem && !fReq.Force { fReq.Method = control.PrepShutdownRanks fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true) diff --git a/src/tests/ftest/control/log_entry.py b/src/tests/ftest/control/log_entry.py index c231ef750ec8..ce4a83a47c8d 100644 --- a/src/tests/ftest/control/log_entry.py +++ b/src/tests/ftest/control/log_entry.py @@ -1,5 +1,6 @@ """ (C) Copyright 2023 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -171,7 +172,7 @@ def test_control_log_entry(self): self.log_step('Stop/start 2 random ranks') stop_ranks = self.random.sample(list(self.server_managers[0].ranks), k=2) - expected = [fr'rank {rank}.*exited with 0' for rank in stop_ranks] \ + expected = [fr'rank {rank}.*killed' for rank in stop_ranks] \ + [fr'process.*started on rank {rank}' for rank in stop_ranks] with self.verify_journalctl(expected): self.server_managers[0].stop_ranks(stop_ranks, self.d_log)