Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16464 test: improve online_rebuild_mdtest.py (#15108) #15807

Draft
wants to merge 1 commit into
base: release/2.6
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 5 additions & 16 deletions src/tests/ftest/erasurecode/online_rebuild_mdtest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2020-2023 Intel Corporation.
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand All @@ -14,11 +15,6 @@ class EcodOnlineRebuildMdtest(ErasureCodeMdtest):

:avocado: recursive
"""
def __init__(self, *args, **kwargs):
"""Initialize a EcOnlineRebuild object."""
super().__init__(*args, **kwargs)
self.set_online_rebuild = True

def test_ec_online_rebuild_mdtest(self):
"""Jira ID: DAOS-7320.

Expand All @@ -35,13 +31,6 @@ def test_ec_online_rebuild_mdtest(self):
:avocado: tags=ec,ec_array,mdtest,ec_online_rebuild
:avocado: tags=EcodOnlineRebuildMdtest,test_ec_online_rebuild_mdtest
"""
# Kill last server rank
self.rank_to_kill = self.server_count - 1

# Run only object type which matches the server count and
# remove other objects
for oclass in self.obj_class:
if oclass[1] == self.server_count:
self.obj_class = oclass[0]

self.start_online_mdtest()
# Stop one random rank while mdtest is running
ranks_to_stop = self.random.sample(list(self.server_managers[0].ranks), k=1)
self.start_online_mdtest(ranks_to_stop)
47 changes: 26 additions & 21 deletions src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ hosts:
12_server:
test_servers: server-[1-6]
test_clients: 2
timeout: 1000
timeout: 1500
setup:
start_agents_once: False
start_servers_once: False
Expand All @@ -21,13 +21,15 @@ server_config:
fabric_iface: ib0
fabric_iface_port: 31416
log_file: daos_server0.log
log_mask: ERR
storage: auto
1:
pinned_numa_node: 1
nr_xs_helpers: 1
fabric_iface: ib1
fabric_iface_port: 31517
log_file: daos_server1.log
log_mask: ERR
storage: auto
pool:
control_method: dmg
Expand All @@ -38,26 +40,29 @@ container:
properties: rd_fac:2
mdtest:
client_processes:
np_48:
np: 48
num_of_files_dirs: 200
mdtest_api:
dfs:
api: 'DFS'
test_dir: "/"
iteration: 4
np: 4
api: DFS
test_dir: /
dfs_destroy: True
manager: "MPICH"
flags: "-u"
write_bytes: 4194304
read_bytes: 4194304
manager: MPICH
flags: "-u -F -C"
write_bytes: 524288
read_bytes: 524288
depth: 10
num_of_files_dirs: 10000000
stonewall_timer: 30
# EC is not supported for directories, so for now run with RP
dfs_dir_oclass: "RP_3G1"
objectclass:
dfs_oclass_list:
#- [EC_Object_Class, Exact number of servers]
- ["EC_2P2GX", 6]
- ["EC_4P2GX", 8]
- ["EC_4P3GX", 12]
- ["EC_8P2GX", 12]
dfs_dir_oclass: RP_3G1
dfs_oclass_mux: !mux
6_server_ec2p2gx:
!filter-only : "/run/hosts/servers/6_server" # yamllint disable-line rule:colons
dfs_oclass: EC_2P2GX
8_server_ec4p2gx:
!filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons
dfs_oclass: EC_4P2GX
12_server_ec4p3gx:
!filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons
dfs_oclass: EC_4P3GX
12_server_ec8p2gx:
!filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons
dfs_oclass: EC_8P2GX
60 changes: 29 additions & 31 deletions src/tests/ftest/util/ec_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -413,56 +414,53 @@ def start_online_single_operation(self, operation, parity=1):
class ErasureCodeMdtest(MdtestBase):
"""Class to be used for EC testing with the MDtest benchmark."""

def __init__(self, *args, **kwargs):
"""Initialize a MdtestBase object."""
super().__init__(*args, **kwargs)
self.server_count = None
self.set_online_rebuild = False
self.rank_to_kill = None
self.obj_class = None

def setUp(self):
"""Set up each test case."""
super().setUp()
engine_count = self.server_managers[0].get_config_value("engines_per_host")
self.server_count = len(self.hostlist_servers) * engine_count
self.obj_class = self.params.get("dfs_oclass_list", '/run/mdtest/objectclass/*')
# Create Pool
self.add_pool()
self.container = None
self.out_queue = queue.Queue()

def write_single_mdtest_dataset(self):
"""Run MDtest with EC object type."""
# Update the MDtest obj class
self.mdtest_cmd.dfs_oclass.update(self.obj_class)
def _start_execute_mdtest(self, mdtest_result_queue):
"""Run the execute_mdtest method

# Write the MDtest data
self.execute_mdtest(self.out_queue)
Args:
mdtest_result_queue (Queue): queue for passing errors.
Returns:
object: mdtest run result
"""
try:
result = self.execute_mdtest(mdtest_result_queue)
except Exception: # pylint: disable=broad-except
mdtest_result_queue.put('Mdtest Failed')
return result

def start_online_mdtest(self):
"""Run MDtest operation with thread in background.
def start_online_mdtest(self, ranks_to_stop):
"""Run mdtest and stop ranks while mdtest is running.

Trigger the server failure while MDtest is running
Args:
ranks_to_stop (list): ranks to stop while mdtest is running
"""
# Create the container and check the status
self.container = self.get_mdtest_container(self.pool)
# Create the MDtest run thread
job = threading.Thread(target=self.write_single_mdtest_dataset)
job = threading.Thread(
target=self._start_execute_mdtest,
kwargs={"mdtest_result_queue": self.out_queue})

# Launch the MDtest thread
job.start()

# Kill the server rank while IO operation in progress
if self.set_online_rebuild:
time.sleep(30)
# Kill the server rank
if self.rank_to_kill is not None:
self.server_managers[0].stop_ranks([self.rank_to_kill],
self.d_log,
force=True)
# Stop the server ranks while IO operation in progress
time.sleep(self.mdtest_cmd.stonewall_timer.value / 2)
self.server_managers[0].stop_ranks(ranks_to_stop, self.d_log, force=True)

# Wait to finish the thread
job.join()

# Verify the queue result and make sure test has no failure
while not self.out_queue.empty():
if self.out_queue.get() == "Mdtest Failed":
self.fail("FAIL")
result = self.out_queue.get()
if result == "Mdtest Failed":
self.fail(result)