daos-stack · phender · Aug 15, 2024 · Aug 15, 2024 · Aug 16, 2024 · Aug 28, 2024
@@ -4,8 +4,6 @@
   SPDX-License-Identifier: BSD-2-Clause-Patent
 '''
 
-import os
-
 import yaml
 from apricot import TestWithServers
 from server_utils import ServerFailed
@@ -49,7 +47,7 @@ def test_config_generate_run(self):
         # path needs to be set in that case.
         control_metadata = None
         if use_tmpfs_scm:
-            control_metadata = os.path.join(self.test_env.log_dir, 'control_metadata')
+            control_metadata = self.test_env.control_metadata
 
         # Call dmg config generate. AP is always the first server host.
         server_host = self.hostlist_servers[0]

@@ -42,11 +42,15 @@ def test_daos_agent_config_basic(self):
             self.agent_managers[-1],
             include_local_host(self.hostlist_clients),
             self.hostfile_clients_slots)
-        self.agent_managers[-1].verify_socket_dir = False
 
         # Get the input to verify
         c_val = self.params.get("config_val", "/run/agent_config_val/*/")
 
+        # Do not create the agent runtime directory if running as root or the test is attempting
+        # to test with an invalid runtime directory value.
+        if self.test_env.agent_user is None or (c_val[0] == "runtime_dir" and c_val[2] == "FAIL"):
+            self.agent_managers[-1].verify_socket_dir = False
+
         # Identify the attribute and modify its value to test value
         self.assertTrue(
             self.agent_managers[-1].set_config_value(c_val[0], c_val[1]),

@@ -1,5 +1,5 @@
 """
-  (C) Copyright 2023 Intel Corporation.
+  (C) Copyright 2023-2024 Intel Corporation.
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -8,7 +8,7 @@
 
 from apricot import TestWithServers
 from ClusterShell.NodeSet import NodeSet
-from general_utils import get_journalctl, journalctl_time, wait_for_result
+from general_utils import journalctl_time, wait_for_result
 from run_utils import run_remote
 
 
@@ -36,16 +36,14 @@ def _verify_journalctl(self, since, expected_messages):
             since (str): start time for journalctl
             expected_messages (list): list of regular expressions to look for
         """
-        self.log_step('Verify journalctl output since {}'.format(since))
+        self.log_step(f'Verify journalctl output since {since}')
 
         not_found = set(expected_messages)
         journalctl_per_hosts = []
 
         def _search():
             """Look for each message in any host's journalctl."""
-            journalctl_results = get_journalctl(
-                hosts=self.hostlist_servers, since=since, until=journalctl_time(),
-                journalctl_type="daos_server")
+            journalctl_results = self.server_managers[0].get_journalctl(since, journalctl_time())
 
             # Convert the journalctl to a dict of hosts : output
             journalctl_per_hosts.append({})
@@ -76,7 +74,7 @@ def _search():
 
         # Fail if any message was not found
         if not_found:
-            fail_msg = '{} messages not found in journalctl'.format(len(not_found))
+            fail_msg = f'{len(not_found)} messages not found in journalctl'
             self.log.error(fail_msg)
             for message in not_found:
                 self.log.error('  %s', message)
@@ -157,7 +155,7 @@ def test_control_log_entry(self):
         self.log_step('Restart server')
         expected = [r'Starting I/O Engine instance', r'Listening on']
         with self.verify_journalctl(expected):
-            self.server_managers[0].restart(list(kill_host), wait=True)
+            self.server_managers[0].restart(kill_host, wait=True)
 
         self.log_step('Reintegrate all ranks and wait for rebuild')
         expected = [fr'rank {rank}.*start reintegration' for rank in kill_ranks] \

@@ -1,5 +1,5 @@
 """
-(C) Copyright 2021-2023 Intel Corporation.
+(C) Copyright 2021-2024 Intel Corporation.
 
 SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -206,7 +206,9 @@ def kill_servers(self, leader, replicas, num_hosts):
             kill_list.remove(kill_list[-1])
             kill_list.add(leader)
         self.log.info("*** stopping leader (%s) + %d others: %s", leader, num_hosts - 1, kill_list)
-        stop_processes(self.log, kill_list, self.server_managers[0].manager.job.command_regex)
+        stop_processes(
+            self.log, kill_list, self.server_managers[0].manager.job.command_regex,
+            user=self.server_managers[0].manager.job.run_user)
 
         kill_ranks = self.server_managers[0].get_host_ranks(kill_list)
         self.assertGreaterEqual(len(kill_ranks), len(kill_list),

@@ -9,7 +9,7 @@
 
 from ClusterShell.NodeSet import NodeSet
 from command_utils_base import CommandFailure
-from general_utils import get_journalctl, journalctl_time, report_errors
+from general_utils import journalctl_time, report_errors
 from ior_test_base import IorTestBase
 from ior_utils import IorCommand
 from job_manager_utils import get_job_manager
@@ -55,7 +55,7 @@ def run_ior_collect_error(self, results, job_num, file_name, clients, namespace)
             # We'll verify the error message.
             results[job_num].append(ior_output.stderr_text)
         except CommandFailure as error:
-            results[job_num] = [False, "IOR failed: {}".format(error)]
+            results[job_num] = [False, f"IOR failed: {error}"]
 
     def test_agent_failure(self):
         """Jira ID: DAOS-9385.
@@ -121,14 +121,10 @@ def test_agent_failure(self):
             errors.append("IOR worked when agent is killed!")
 
         # 5. Verify journalctl shows the log that the agent is stopped.
-        results = get_journalctl(
-            hosts=self.hostlist_clients, since=since, until=until,
-            journalctl_type="daos_agent")
+        results = self.agent_managers[0].get_journalctl(since, until)
         self.log.info("journalctl results = %s", results)
         if "shutting down" not in results[0]["data"]:
-            msg = "Agent shut down message not found in journalctl! Output = {}".format(
-                results)
-            errors.append(msg)
+            errors.append(f"Agent shut down message not found in journalctl! Output = {results}")
 
         # 6. Restart agent.
         self.log.info("Restart agent")
@@ -146,7 +142,7 @@ def test_agent_failure(self):
         self.log.info(ior_results[job_num])
         if not ior_results[job_num][0]:
             ior_error = ior_results[job_num][-1]
-            errors.append("IOR with restarted agent failed! Error: {}".format(ior_error))
+            errors.append(f"IOR with restarted agent failed! Error: {ior_error}")
 
         self.log.info("########## Errors ##########")
         report_errors(test=self, errors=errors)
@@ -211,13 +207,13 @@ def test_agent_failure_isolation(self):
         since = journalctl_time()
         self.log.info("Stopping agent on %s", agent_host_kill)
         pattern = self.agent_managers[0].manager.job.command_regex
-        detected, running = stop_processes(self.log, hosts=agent_host_kill, pattern=pattern)
+        detected, running = stop_processes(
+            self.log, hosts=agent_host_kill, pattern=pattern,
+            user=self.agent_managers[0].manager.job.run_user)
         if not detected:
-            msg = "No daos_agent process killed on {}!".format(agent_host_kill)
-            errors.append(msg)
+            errors.append(f"No daos_agent process killed on {agent_host_kill}!")
         elif running:
-            msg = "Unable to kill daos_agent processes on {}!".format(running)
-            errors.append(msg)
+            errors.append(f"Unable to kill daos_agent processes on {running}!")
         else:
             self.log.info("daos_agent processes on %s killed", detected)
         until = journalctl_time()
@@ -236,29 +232,25 @@ def test_agent_failure_isolation(self):
         self.log.info(ior_results[job_num_keep])
         if not ior_results[job_num_keep][0]:
             ior_error = ior_results[job_num_keep][-1]
-            errors.append("Error found in IOR on keep client! {}".format(ior_error))
+            errors.append(f"Error found in IOR on keep client! {ior_error}")
 
         # 6. On the killed client, verify journalctl shows the log that the agent is
         # stopped.
-        results = get_journalctl(
-            hosts=[agent_host_kill], since=since, until=until,
-            journalctl_type="daos_agent")
+        results = self.agent_managers[0].get_journalctl(since, until, agent_host_kill)
         self.log.info("journalctl results (kill) = %s", results)
         if "shutting down" not in results[0]["data"]:
-            msg = ("Agent shut down message not found in journalctl on killed client! "
-                   "Output = {}".format(results))
-            errors.append(msg)
+            errors.append(
+                "Agent shut down message not found in journalctl on killed client! "
+                f"Output = {results}")
 
         # 7. On the other client where agent is still running, verify that the journalctl
         # in the previous step doesn't show that the agent is stopped.
-        results = get_journalctl(
-            hosts=[agent_host_keep], since=since, until=until,
-            journalctl_type="daos_agent")
+        results = self.agent_managers[0].get_journalctl(since, until, agent_host_keep)
         self.log.info("journalctl results (keep) = %s", results)
         if "shutting down" in results[0]["data"]:
-            msg = ("Agent shut down message found in journalctl on keep client! "
-                   "Output = {}".format(results))
-            errors.append(msg)
+            errors.append(
+                "Agent shut down message found in journalctl on keep client! "
+                f"Output = {results}")
 
         # 8. Restart both daos_agent. (Currently, there's no clean way to restart one.)
         self.start_agent_managers()
@@ -274,7 +266,7 @@ def test_agent_failure_isolation(self):
         self.log.info(ior_results[job_num_keep])
         if not ior_results[job_num_keep][0]:
             ior_error = ior_results[job_num_keep][-1]
-            errors.append("Error found in second IOR run! {}".format(ior_error))
+            errors.append(f"Error found in second IOR run! {ior_error}")
 
         self.log.info("########## Errors ##########")
         report_errors(test=self, errors=errors)

@@ -10,7 +10,7 @@
 from apricot import TestWithoutServers, TestWithServers
 from ClusterShell.NodeSet import NodeSet
 from exception_utils import CommandFailure
-from general_utils import DaosTestError, get_journalctl, journalctl_time, run_command
+from general_utils import DaosTestError, journalctl_time, run_command
 from run_utils import run_remote
 
 # pylint: disable-next=fixme
@@ -167,22 +167,20 @@ def test_ras(self):
             dmg.system_start(ranks=ranks_to_stop)
             check_started_ranks = self.server_managers[0].check_rank_state(sub_list, ["joined"], 5)
             if check_started_ranks:
-                self.fail("Following Ranks {} failed to restart".format(check_started_ranks))
+                self.fail(f"Following Ranks {check_started_ranks} failed to restart")
 
         until = journalctl_time()
 
         # gather journalctl logs for each server host, verify system stop event was sent to logs
-        results = get_journalctl(hosts=self.hostlist_servers, since=since,
-                                 until=until, journalctl_type="daos_server")
+        results = self.server_managers[0].get_journalctl(since, until)
         str_to_match = "daos_engine exited: process exited with 0"
         for count, host in enumerate(self.hostlist_servers):
             occurrence = results[count]["data"].count(str_to_match)
             if occurrence != 2:
-                self.log.info("Occurrence %s for rank stop not as expected for host %s",
-                              occurrence, host)
-                msg = "Rank shut down message not found in journalctl! Output = {}".format(
-                    results[count]["data"])
-                self.fail(msg)
+                self.log.error(
+                    "Occurrence %s for rank stop not as expected for host %s", occurrence, host)
+                self.log.debug("Journalctl output: %s", results[count]["data"])
+                self.fail("Rank shut down message not found in journalctl!")
 
         dmg.storage_scan()
         dmg.network_scan()
@@ -92,7 +92,9 @@ def kill_engine(self, engine_kill_host):
             engine_kill_host (str): Hostname to kill engine.
         """
         pattern = self.server_managers[0].manager.job.command_regex
-        detected, running = stop_processes(self.log, NodeSet(engine_kill_host), pattern)
+        detected, running = stop_processes(
+            self.log, NodeSet(engine_kill_host), pattern,
+            user=self.server_managers[0].manager.job.run_user)
         if not detected:
             self.log.info("No daos_engine process killed on %s!", engine_kill_host)
         elif running:

@@ -282,7 +282,8 @@ def _run(self, args):
             else:
                 set_test_environment(
                     logger, test_env, args.test_servers, args.test_clients, args.provider,
-                    args.insecure_mode, self.details)
+                    args.insecure_mode, self.details, args.agent_user, args.test_log_dir,
+                    args.systemd_path, args.systemd_lib_path)
         except TestEnvironmentException as error:
             message = f"Error setting up test environment: {str(error)}"
             return self.get_exit_status(1, message, "Setup", sys.exc_info())
@@ -320,12 +321,13 @@ def _run(self, args):
             return self.get_exit_status(0, "Listing tests complete")
 
         # Setup the fuse configuration
-        try:
-            setup_fuse_config(logger, args.test_servers | args.test_clients)
-        except LaunchException:
-            # Warn but don't fail
-            message = "Issue detected setting up the fuse configuration"
-            setup_result.warn_test(logger, "Setup", message, sys.exc_info())
+        if args.fuse_setup:
+            try:
+                setup_fuse_config(logger, args.test_servers | args.test_clients)
+            except LaunchException:
+                # Warn but don't fail
+                message = "Issue detected setting up the fuse configuration"
+                setup_result.warn_test(logger, "Setup", message, sys.exc_info())
 
         # Setup override systemctl files
         try:
@@ -358,8 +360,8 @@ def _run(self, args):
             group.update_test_yaml(
                 logger, args.scm_size, args.scm_mount, args.extra_yaml,
                 args.timeout_multiplier, args.override, args.verbose, args.include_localhost)
-        except (RunException, YamlException) as e:
-            message = "Error modifying the test yaml files: {}".format(e)
+        except (RunException, YamlException) as error:
+            message = f"Error modifying the test yaml files: {error}"
             status |= self.get_exit_status(1, message, "Setup", sys.exc_info())
         except StorageException:
             message = "Error detecting storage information for test yaml files"
@@ -540,6 +542,12 @@ def main():
         "-a", "--archive",
         action="store_true",
         help="archive host log files in the avocado job-results directory")
+    parser.add_argument(
+        "-au", "--agent_user",
+        action="store",
+        default=None,
+        type=str,
+        help="user account to use when running the daos_agent")
     parser.add_argument(
         "-c", "--clear_mounts",
         action="append",
@@ -562,6 +570,10 @@ def main():
         "--failfast",
         action="store_true",
         help="stop the test suite after the first failure")
+    parser.add_argument(
+        "-fs", "--fuse_setup",
+        action="store_true",
+        help="enable setting up fuse configuration files")
     parser.add_argument(
         "-i", "--include_localhost",
         action="store_true",
@@ -584,7 +596,7 @@ def main():
         help="modify the test yaml files but do not run the tests")
     parser.add_argument(
         "-mo", "--mode",
-        choices=['normal', 'manual', 'ci'],
+        choices=['normal', 'manual', 'ci', 'custom_a'],
         default='normal',
         help="provide the mode of test to be run under. Default is normal, "
              "in which the final return code of launch.py is still zero if "
@@ -649,6 +661,18 @@ def main():
         "-si", "--slurm_install",
         action="store_true",
         help="enable installing slurm RPMs if required by the tests")
+    parser.add_argument(
+        "-sl", "--systemd_lib_path",
+        action="store",
+        default=None,
+        type=str,
+        help="the daos_server and daos_agent systemd LD_LIBRARY_PATH to define in the config")
+    parser.add_argument(
+        "-sp", "--systemd_path",
+        action="store",
+        default=None,
+        type=str,
+        help="the daos_server and daos_agent systemd PATH to define in the config")
     parser.add_argument(
         "--scm_mount",
         action="store",
@@ -681,6 +705,12 @@ def main():
         default=NodeSet(),
         help="comma-separated list of hosts to use as replacement values for "
              "client placeholders in each test's yaml file")
+    parser.add_argument(
+        "-tld", "--test_log_dir",
+        action="store",
+        default=None,
+        type=str,
+        help="test log directory base path")
     parser.add_argument(
         "-th", "--logs_threshold",
         action="store",
@@ -744,10 +774,22 @@ def main():
         args.slurm_install = True
         args.slurm_setup = True
         args.user_create = True
+        args.fuse_setup = True
         args.clear_mounts.append("/mnt/daos")
         args.clear_mounts.append("/mnt/daos0")
         args.clear_mounts.append("/mnt/daos1")
 
+    elif args.mode == "custom_a":
+        if args.agent_user is None:
+            # Run the agent with the current user by default
+            args.agent_user = getpass.getuser()
+        args.process_cores = False
+        args.logs_threshold = None
+        args.slurm_install = False
+        args.slurm_setup = False
+        args.user_create = False
+        args.fuse_setup = False
+
     # Setup the Launch object
     launch = Launch(args.name, args.mode, args.slurm_install, args.slurm_setup)