DAOS-17120 test: replace pcmd with run_remote #15915

Open · wants to merge 12 commits into base: master
26 changes: 12 additions & 14 deletions src/tests/ftest/control/dmg_storage_scan_scm.py
@@ -1,12 +1,13 @@
"""
(C) Copyright 2020-2022 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os

from control_test_base import ControlTestBase
from general_utils import pcmd, run_pcmd
from run_utils import run_remote


class DmgStorageScanSCMTest(ControlTestBase):
@@ -42,21 +43,18 @@ def verify_storage_scan_scm(self, storage_dict):
for scm_namespace in storage_dict["scm_namespaces"]:
# Verify that all namespaces exist under /dev.
pmem_name = scm_namespace["blockdev"]
lscmd = "{} {}".format("ls", os.path.join("/dev", pmem_name))
# rc is a dictionary where return code is the key.
rc = pcmd(hosts=self.hostlist_servers, command=lscmd)

if 0 not in rc:
errors.append("{} didn't exist under /dev!".format(pmem_name))
ls_cmd = f"ls {os.path.join('/dev', pmem_name)}"
if not run_remote(self.log, self.hostlist_servers, ls_cmd).passed:
errors.append(f"{pmem_name} didn't exist under /dev!")

# Verify the Socket ID.
numa_node_path = "/sys/class/block/{}/device/numa_node".format(
pmem_name)
command = "cat {}".format(numa_node_path)
out_list = run_pcmd(hosts=self.hostlist_servers, command=command)

# This one is in str.
expected_numa_node = out_list[0]["stdout"][0]
numa_node_path = os.path.join(
os.sep, "sys", "class", "block", pmem_name, "device", "numa_node")
command = f"cat {numa_node_path}"
result = run_remote(self.log, self.hostlist_servers, command)
if not result.passed:
errors.append(f"{command} failed on {result.failed_hosts}")
expected_numa_node = result.joined_stdout
actual_numa_node = str(scm_namespace["numa_node"])

if expected_numa_node != actual_numa_node:
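For reference, a minimal standalone sketch of the result handling this file migrates to, assuming the DAOS ftest helpers (`run_utils.run_remote`, ClusterShell's `NodeSet`) are importable; the logger, host set, and device name are placeholders:

```python
import logging

from ClusterShell.NodeSet import NodeSet
from run_utils import run_remote

log = logging.getLogger()
hosts = NodeSet("server-[1-2]")  # hypothetical host set

# run_remote returns a result object; .passed is True only when the command
# succeeded on every host, and per-host output is kept on the result.
result = run_remote(log, hosts, "ls /dev/pmem0")
if not result.passed:
    print(f"command failed on {result.failed_hosts}")
print(result.joined_stdout)  # stdout from all hosts, joined
```

Unlike `pcmd`, which returned a dictionary keyed by exit status, the `run_remote` result carries per-host stdout alongside a single `passed` flag, so the return-code bookkeeping in the old code goes away.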
2 changes: 1 addition & 1 deletion src/tests/ftest/control/log_entry.py
@@ -158,7 +158,7 @@ def test_control_log_entry(self):
self.log_step('Restart server')
expected = [r'Starting I/O Engine instance', r'Listening on']
with self.verify_journalctl(expected):
self.server_managers[0].restart(list(kill_host), wait=True)
self.server_managers[0].restart(kill_host, wait=True)

self.log_step('Reintegrate all ranks and wait for rebuild')
expected = [fr'rank {rank}.*start reintegration' for rank in kill_ranks] \
38 changes: 19 additions & 19 deletions src/tests/ftest/control/ssd_socket.py
@@ -1,13 +1,15 @@
"""
(C) Copyright 2020-2022 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os
from textwrap import wrap

from ClusterShell.NodeSet import NodeSet
from control_test_base import ControlTestBase
from general_utils import pcmd, run_pcmd
from run_utils import run_remote


class SSDSocketTest(ControlTestBase):
@@ -36,10 +38,10 @@ def debug_numa_node(self, pci_addr_heads):
for pci_addr_head in pci_addr_heads:
self.log.debug(
"----- Search PCI Addr Head %s in /sys -----", pci_addr_head)
run_pcmd(
hosts=self.hostlist_servers,
command="find /sys -name \"{}\"".format(pci_addr_head),
verbose=True)
run_remote(
self.log,
self.hostlist_servers,
f'find /sys -name "{pci_addr_head}"')

# Another way to obtain the Socket ID is to use hwloc-ls --whole-io
# --verbose. It contains something like:
@@ -55,9 +57,10 @@ def debug_numa_node(self, pci_addr_heads):
# much more cumbersome than reading the numa_node, so it's called here
# mainly for debugging purposes.
self.log.debug("----- Show PCI Address in hwloc-ls -----")
pcmd(
hosts=self.hostlist_servers,
command="hwloc-ls --whole-io --verbose")
run_remote(
self.log,
self.hostlist_servers,
"hwloc-ls --whole-io --verbose")

def verify_ssd_sockets(self, storage_dict):
"""Verify SSD sockets.
@@ -98,17 +101,14 @@ def verify_ssd_sockets(self, storage_dict):
pci_addr_heads.append(pci_addr_head)

# Call cat on the server host, not necessarily the local test host.
results = run_pcmd(
hosts=self.hostlist_servers[0:1], command="cat {}".format(numa_node_path))

# Obtain the numa_node content.
fs_socket_id = ""
for result in results:
# Test that the content is expected.
fs_socket_id = result["stdout"][-1]
if fs_socket_id != str(cmd_socket_id):
errors.append(
"Unexpected socket ID! Cmd: {}; FS: {}".format(cmd_socket_id, fs_socket_id))
command = f"cat {numa_node_path}"
result = run_remote(
self.log, NodeSet(self.hostlist_servers[0]), command)
if not result.passed:
errors.append(f"{command} failed on {result.failed_hosts}")
fs_socket_id = result.joined_stdout
if fs_socket_id != str(cmd_socket_id):
errors.append(f"Unexpected socket ID! Cmd: {cmd_socket_id}; FS: {fs_socket_id}")

if errors:
# Since we're dealing with system files and we don't have access to
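As an aside, the single-host `cat` above relies on wrapping one hostname in a `NodeSet`; a small sketch of that pattern under the same assumptions (placeholder hosts, logger, and device path):

```python
import logging

from ClusterShell.NodeSet import NodeSet
from run_utils import run_remote

log = logging.getLogger()
server_list = ["server-1", "server-2"]  # hypothetical list of server hostnames

# Wrap only the first host in a NodeSet so the command runs on a single node.
result = run_remote(log, NodeSet(server_list[0]),
                    "cat /sys/class/block/pmem0/device/numa_node")
if result.passed:
    print(result.joined_stdout.strip())  # e.g. "0"
```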
17 changes: 8 additions & 9 deletions src/tests/ftest/control/super_block_versioning.py
@@ -1,14 +1,15 @@
"""
(C) Copyright 2020-2023 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""


import os

from apricot import TestWithServers
from general_utils import check_file_exists, pcmd
from command_utils import command_as_user
from general_utils import check_file_exists
from run_utils import run_remote


class SuperBlockVersioning(TestWithServers):
@@ -39,9 +40,7 @@ def test_super_block_version_basic(self):
self.fail("{}: {} not found".format(check_result[1], fname))

# Make sure that 'version' is in the file, run task to check
cmd = "sudo cat {} | grep -F \"version\"".format(fname)
result = pcmd(self.hostlist_servers, cmd, timeout=20)

# Determine if the command completed successfully across all the hosts
if len(result) > 1 or 0 not in result:
self.fail("Was not able to find version in {} file".format(fname))
cmd = command_as_user(f'cat {fname} | grep -F "version"', "root")
result = run_remote(self.log, self.hostlist_servers, cmd, timeout=20)
if not result.passed:
self.fail(f"Was not able to find version in {fname} file")
53 changes: 18 additions & 35 deletions src/tests/ftest/control/version.py
@@ -1,13 +1,16 @@
'''
(C) Copyright 2018-2023 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
'''
import json
import re

from apricot import TestWithServers
from general_utils import append_error, report_errors, run_pcmd
from ClusterShell.NodeSet import NodeSet
from general_utils import append_error, report_errors
from run_utils import run_remote
from server_utils_base import DaosServerCommandRunner


@@ -37,31 +40,17 @@ def test_version(self):
"""
# Get RPM version.
rpm_command = "rpm -qa | grep daos-server"
output = run_pcmd(hosts=self.hostlist_servers, command=rpm_command)
self.log.debug("RPM output = %s", output)
rc = output[0]["exit_status"]
stdout = output[0]["stdout"]
if rc != 0:
report_errors(self, ["DAOS RPMs not properly installed: rc={}".format(rc)])
rpm_version = None
for rpm in stdout:
result = re.findall(r"daos-server-[tests-|tests_openmpi-]*([\d.]+)", rpm)
if result:
rpm_version = result[0]
break
if not result:
report_errors(self, ["RPM version could not be defined"])
result = run_remote(self.log, self.hostlist_servers, rpm_command)
if not result.passed:
self.fail("Failed to list daos-server RPMs")
if not result.homogeneous:
self.fail("Non-homogenous daos-server RPMs")
match = re.findall(r"daos-server-[tests-|tests_openmpi-]*([\d.]+)", result.joined_stdout)
if not match:
self.fail("Failed to get version from daos-server RPMs")
rpm_version = match[0]
self.log.info("RPM version = %s", rpm_version)

# Remove configuration files
cleanup_cmds = [
"sudo find /etc/daos/certs -type f -delete -print",
"sudo rm -fv /etc/daos/daos_server.yml /etc/daos/daos_control.yml"
" /etc/daos/daos_agent.yml",
]
for cmd in cleanup_cmds:
run_pcmd(hosts=self.hostlist_servers, command=cmd)

# Get dmg version.
dmg_version = self.get_dmg_command().version()["response"]["version"]
self.log.info("dmg version = %s", dmg_version)
@@ -75,17 +64,11 @@
# Get daos_agent version.
daos_agent_version = None
daos_agent_cmd = "daos_agent --json version"
output = run_pcmd(hosts=self.hostlist_servers, command=daos_agent_cmd)
self.log.debug("DAOS Agent output = %s", output)
rc = output[0]["exit_status"]
stdout = output[0]["stdout"]
if rc != 0:
msg = "DAOS Agent not properly installed: rc={}".format(rc)
append_error(errors, msg, stdout)
else:
self.log.info("DAOS Agent stdout = %s", "".join(stdout))
daos_agent_version = json.loads("".join(stdout))["response"]["version"]
self.log.info("daos_agent version = %s", daos_agent_version)
result = run_remote(self.log, NodeSet(self.hostlist_servers[0]), daos_agent_cmd)
if not result.passed:
self.fail("Failed to get daos_agent version")
daos_agent_version = json.loads(result.joined_stdout)["response"]["version"]
self.log.info("daos_agent version = %s", daos_agent_version)

# Get daos_server version
daos_server_cmd = DaosServerCommandRunner(path=self.bin)
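For reference, a condensed sketch of the RPM-version parsing shown above, reusing the regex from the diff; hosts and logger are placeholders, and the sketch assumes `result.homogeneous` reports whether all hosts returned identical output:

```python
import logging
import re

from ClusterShell.NodeSet import NodeSet
from run_utils import run_remote

log = logging.getLogger()
hosts = NodeSet("server-[1-2]")

# List installed daos-server RPMs on every server and parse one version string.
result = run_remote(log, hosts, "rpm -qa | grep daos-server")
if not result.passed or not result.homogeneous:
    raise RuntimeError("daos-server RPMs missing or mixed across hosts")
match = re.findall(r"daos-server-[tests-|tests_openmpi-]*([\d.]+)", result.joined_stdout)
rpm_version = match[0] if match else None
```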
6 changes: 4 additions & 2 deletions src/tests/ftest/daos_racer/parallel.py
@@ -1,6 +1,7 @@
#!/usr/bin/python3
"""
(C) Copyright 2021-2022 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
@@ -52,7 +53,8 @@ def test_daos_racer_parallel(self):
job_manager.run()

except CommandFailure as error:
self.log.error("DAOS Racer Failed: %s", str(error))
self.fail("Test was expected to pass but it failed.\n")
msg = f"daos_racer failed: {error}"
self.log.error(msg)
self.fail(msg)

self.log.info("Test passed!")
5 changes: 3 additions & 2 deletions src/tests/ftest/deployment/agent_failure.py
@@ -1,5 +1,6 @@
"""
(C) Copyright 2022-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
@@ -241,7 +242,7 @@ def test_agent_failure_isolation(self):
# 6. On the killed client, verify journalctl shows the log that the agent is
# stopped.
results = get_journalctl(
hosts=[agent_host_kill], since=since, until=until,
hosts=NodeSet(agent_host_kill), since=since, until=until,
journalctl_type="daos_agent")
self.log.info("journalctl results (kill) = %s", results)
if "shutting down" not in results[0]["data"]:
@@ -252,7 +253,7 @@
# 7. On the other client where agent is still running, verify that the journalctl
# in the previous step doesn't show that the agent is stopped.
results = get_journalctl(
hosts=[agent_host_keep], since=since, until=until,
hosts=NodeSet(agent_host_keep), since=since, until=until,
journalctl_type="daos_agent")
self.log.info("journalctl results (keep) = %s", results)
if "shutting down" in results[0]["data"]:
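The only functional change in this file is the type of the host argument passed to `get_journalctl`; a tiny illustration of the conversion, with a placeholder hostname:

```python
from ClusterShell.NodeSet import NodeSet

agent_host_kill = "client-1"      # placeholder hostname string
hosts = NodeSet(agent_host_kill)  # single-host NodeSet instead of a one-element list
assert str(hosts) == "client-1" and len(hosts) == 1
```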
16 changes: 8 additions & 8 deletions src/tests/ftest/deployment/network_failure.py
@@ -11,11 +11,12 @@
from ClusterShell.NodeSet import NodeSet
from command_utils_base import CommandFailure
from dmg_utils import check_system_query_status
from general_utils import report_errors, run_pcmd
from general_utils import report_errors
from ior_test_base import IorTestBase
from ior_utils import IorCommand
from job_manager_utils import get_job_manager
from network_utils import NetworkInterface
from run_utils import run_remote


class NetworkFailureTest(IorTestBase):
@@ -98,16 +99,15 @@ def create_ip_to_host(self):

"""
command = "hostname -i"
results = run_pcmd(hosts=self.hostlist_servers, command=command)
self.log.info("hostname -i results = %s", results)
result = run_remote(self.log, self.hostlist_servers, command)
if not result.passed:
self.fail("Failed to get hostname on servers")

ip_to_host = {}
for result in results:
ips_str = result["stdout"][0]
for hosts, stdout in result.all_stdout.items():
# There may be multiple IP addresses for one host.
ip_addresses = ips_str.split()
for ip_address in ip_addresses:
ip_to_host[ip_address] = NodeSet(str(result["hosts"]))
for ip_address in stdout.split():
ip_to_host[ip_address] = NodeSet(hosts)

return ip_to_host

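A short sketch of the per-host stdout iteration used above to build the IP-to-host map; hosts and logger are placeholders, and `all_stdout` is assumed to map a host-set string to that set's stdout, as the hunk implies:

```python
import logging

from ClusterShell.NodeSet import NodeSet
from run_utils import run_remote

log = logging.getLogger()
servers = NodeSet("server-[1-2]")

result = run_remote(log, servers, "hostname -i")
ip_to_host = {}
if result.passed:
    for hosts, stdout in result.all_stdout.items():
        # A host may report multiple IP addresses on one line.
        for ip_address in stdout.split():
            ip_to_host[ip_address] = NodeSet(hosts)
```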
28 changes: 13 additions & 15 deletions src/tests/ftest/dfuse/posix_stat.py
@@ -1,11 +1,13 @@
"""
(C) Copyright 2018-2023 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""

from general_utils import get_remote_file_size, run_pcmd
from general_utils import get_remote_file_size
from ior_test_base import IorTestBase
from run_utils import run_remote


class POSIXStatTest(IorTestBase):
@@ -55,31 +57,27 @@ def test_stat_parameters(self):
create_cont=False, test_file_suffix=test_file_suffix)

# Get current epoch.
current_epoch = -1
output = run_pcmd(hosts=self.hostlist_clients, command="date +%s")
stdout = output[0]["stdout"]
self.log.info("date stdout = %s", stdout)
current_epoch = stdout[-1]
result = run_remote(self.log, self.hostlist_clients, "date +%s")
if not result.passed:
self.fail("Failed to get date on clients")
current_epoch = int(result.output[0].stdout[-1])

# Get epoch of the created file. (technically %Z is for last status
# change. %W is file birth, but it returns 0.)
creation_epoch = -1
# As in date command, run stat command in the client node.
stat_command = "stat -c%Z {}".format(self.ior_cmd.test_file.value)
output = run_pcmd(hosts=self.hostlist_clients, command=stat_command)
stdout = output[0]["stdout"]
self.log.info("stat stdout = %s", stdout)
creation_epoch = stdout[-1]
result = run_remote(self.log, self.hostlist_clients, stat_command)
if not result.passed:
self.fail(f"{stat_command} failed on clients")
creation_epoch = int(result.output[0].stdout[-1])

# Calculate the epoch difference between the creation time and the
# value in the file metadata. They're usually 2 sec apart.
creation_epoch_int = int(creation_epoch)
current_epoch_int = int(current_epoch)
diff_epoch = creation_epoch_int - current_epoch_int
diff_epoch = creation_epoch - current_epoch
if diff_epoch > 10:
msg = "Unexpected creation time! Expected = {}; Actual = {}"
error_list.append(
msg.format(current_epoch_int, creation_epoch_int))
msg.format(current_epoch, creation_epoch))

# 2. Verify file size.
# Get file size.
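Finally, a sketch of pulling a single value from the first host's stdout, as the epoch comparison above does; the command, hosts, and logger are placeholders:

```python
import logging

from ClusterShell.NodeSet import NodeSet
from run_utils import run_remote

log = logging.getLogger()
clients = NodeSet("client-[1-2]")

result = run_remote(log, clients, "date +%s")
if not result.passed:
    raise RuntimeError("date failed on clients")
# result.output is a list of per-command results; stdout is a list of lines,
# so the last line of the first host's output holds the epoch value.
current_epoch = int(result.output[0].stdout[-1])
```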