From eadf962a6a6f40f957e09393f8290ec1d90a90da Mon Sep 17 00:00:00 2001 From: Lucas Meneghel Rodrigues Date: Thu, 16 Feb 2012 11:29:59 +0100 Subject: [PATCH] scheduler: Rework entry points As part of the effort to make autotest more packaging friendly, refactor some of the scheduler code and rename: scheduler/monitor_db.py -> scheduler/autotest-scheduler scheduler/monitor_db_babysitter -> scheduler/autotest-scheduler-watcher This renaming makes the entry points easier to discover and makes it clearer what they do. The actual functionality of the entry points was split into libraries, to ease unit testing. Signed-off-by: Lucas Meneghel Rodrigues Signed-off-by: Martin Krizek --- scheduler/autotest-scheduler | 13 ++ scheduler/autotest-scheduler-watcher | 13 ++ scheduler/monitor_db.py | 214 +++++++++--------- ...or_db_babysitter => monitor_db_watcher.py} | 134 +++++------ ...ng_config.py => watcher_logging_config.py} | 8 +- utils/autotest-rh.init | 22 +- utils/autotest.init | 10 +- utils/autotestd.service | 2 +- 8 files changed, 219 insertions(+), 197 deletions(-) create mode 100755 scheduler/autotest-scheduler create mode 100755 scheduler/autotest-scheduler-watcher mode change 100755 => 100644 scheduler/monitor_db.py rename scheduler/{monitor_db_babysitter => monitor_db_watcher.py} (58%) mode change 100755 => 100644 rename scheduler/{babysitter_logging_config.py => watcher_logging_config.py} (60%) diff --git a/scheduler/autotest-scheduler b/scheduler/autotest-scheduler new file mode 100755 index 0000000000..ab36eac594 --- /dev/null +++ b/scheduler/autotest-scheduler @@ -0,0 +1,13 @@ +#!/usr/bin/python -u +""" +Autotest scheduler.
+""" +try: + import autotest.common as common +except ImportError: + import common + +from autotest_lib.scheduler import monitor_db + +if __name__ == '__main__': + monitor_db.main() diff --git a/scheduler/autotest-scheduler-watcher b/scheduler/autotest-scheduler-watcher new file mode 100755 index 0000000000..aaaa7c29a3 --- /dev/null +++ b/scheduler/autotest-scheduler-watcher @@ -0,0 +1,13 @@ +#!/usr/bin/python -u +""" +Autotest scheduler watcher. +""" +try: + import autotest.common +except ImportError: + import common + +from autotest_lib.scheduler import monitor_db_watcher + +if __name__ == '__main__': + monitor_db_watcher.main() diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py old mode 100755 new mode 100644 index 7230f3c885..e4d5c208e1 --- a/scheduler/monitor_db.py +++ b/scheduler/monitor_db.py @@ -1,19 +1,12 @@ -#!/usr/bin/python -u - """ -Autotest scheduler +Autotest scheduler main library. """ - - try: import autotest.common as common except ImportError: import common -import datetime, errno, optparse, os, pwd, Queue, re, shutil, signal -import smtplib, socket, stat, subprocess, sys, tempfile, time, traceback, urllib -import itertools, logging, weakref, gc - -import MySQLdb +import datetime, optparse, os, signal, sys, time, traceback, urllib +import logging, gc from autotest_lib.scheduler import scheduler_logging_config from autotest_lib.frontend import setup_django_environment @@ -29,8 +22,9 @@ from autotest_lib.scheduler import gc_stats, host_scheduler, monitor_db_cleanup from autotest_lib.scheduler import status_server, scheduler_config from autotest_lib.scheduler import scheduler_models -BABYSITTER_PID_FILE_PREFIX = 'monitor_db_babysitter' -PID_FILE_PREFIX = 'monitor_db' + +WATCHER_PID_FILE_PREFIX = 'autotest-scheduler-watcher' +PID_FILE_PREFIX = 'autotest-scheduler' RESULTS_DIR = '.' 
AUTOSERV_NICE_LEVEL = 10 @@ -58,6 +52,7 @@ _testing_mode = False _drone_manager = None + def _parser_path_default(install_dir): return os.path.join(install_dir, 'tko', 'parse') _parser_path_func = utils.import_site_function( @@ -73,6 +68,32 @@ def _get_pidfile_timeout_secs(): return pidfile_timeout_mins * 60 +def _autoserv_command_line(machines, extra_args, job=None, + queue_entry=None, verbose=True): + """ + @returns The autoserv command line as a list of executable + parameters. + + @param machines - string - A machine or comma separated list of machines + for the (-m) flag. + @param extra_args - list - Additional arguments to pass to autoserv. + @param job - Job object - If supplied, -u owner and -l name parameters + will be added. + @param queue_entry - A HostQueueEntry object - If supplied and no Job + object was supplied, this will be used to lookup the Job object. + """ + autoserv_argv = [_autoserv_path, '-p', + '-r', drone_manager.WORKING_DIRECTORY] + if machines: + autoserv_argv += ['-m', machines] + if job or queue_entry: + if not job: + job = queue_entry.job + autoserv_argv += ['-u', job.owner, '-l', job.name] + if verbose: + autoserv_argv.append('--verbose') + return autoserv_argv + extra_args + + def _site_init_monitor_db_dummy(): return {} @@ -89,17 +110,62 @@ def _sanity_check(): _verify_default_drone_set_exists() -def main(): - try: - try: - main_without_exception_handling() - except SystemExit: - raise - except: - logging.exception('Exception escaping in monitor_db') - raise - finally: - utils.delete_pid_file_if_exists(PID_FILE_PREFIX) +def setup_logging(): + log_dir = os.environ.get('AUTOTEST_SCHEDULER_LOG_DIR', None) + log_name = os.environ.get('AUTOTEST_SCHEDULER_LOG_NAME', None) + logging_manager.configure_logging( + scheduler_logging_config.SchedulerLoggingConfig(), log_dir=log_dir, + logfile_name=log_name) + + +def handle_sigint(signum, frame): + global _shutdown + _shutdown = True + logging.info("Shutdown request received.") + + +def 
initialize(): + logging.info("%s> dispatcher starting", time.strftime("%X %x")) + logging.info("My PID is %d", os.getpid()) + + if utils.program_is_alive(PID_FILE_PREFIX): + logging.critical("scheduler already running, aborting!") + sys.exit(1) + utils.write_pid(PID_FILE_PREFIX) + + if _testing_mode: + global_config.global_config.override_config_value( + DB_CONFIG_SECTION, 'database', 'stresstest_autotest_web') + + os.environ['PATH'] = AUTOTEST_SERVER_DIR + ':' + os.environ['PATH'] + global _db + _db = database_connection.DatabaseConnection(DB_CONFIG_SECTION) + _db.connect(db_type='django') + + # ensure Django connection is in autocommit + setup_django_environment.enable_autocommit() + # bypass the readonly connection + readonly_connection.ReadOnlyConnection.set_globally_disabled(True) + + logging.info("Setting signal handler") + signal.signal(signal.SIGINT, handle_sigint) + + initialize_globals() + scheduler_models.initialize() + + drones = global_config.global_config.get_config_value( + scheduler_config.CONFIG_SECTION, 'drones', default='localhost') + drone_list = [hostname.strip() for hostname in drones.split(',')] + results_host = global_config.global_config.get_config_value( + scheduler_config.CONFIG_SECTION, 'results_host', default='localhost') + _drone_manager.initialize(RESULTS_DIR, drone_list, results_host) + + logging.info("Connected! 
Running...") + + +def initialize_globals(): + global _drone_manager + _drone_manager = drone_manager.instance() def main_without_exception_handling(): @@ -162,7 +228,7 @@ def main_without_exception_handling(): time.sleep(scheduler_config.config.tick_pause_sec) except: email_manager.manager.log_stacktrace( - "Uncaught exception; terminating monitor_db") + "Uncaught exception; terminating scheduler") email_manager.manager.send_queued_emails() server.shutdown() @@ -170,88 +236,17 @@ def main_without_exception_handling(): _db.disconnect() -def setup_logging(): - log_dir = os.environ.get('AUTOTEST_SCHEDULER_LOG_DIR', None) - log_name = os.environ.get('AUTOTEST_SCHEDULER_LOG_NAME', None) - logging_manager.configure_logging( - scheduler_logging_config.SchedulerLoggingConfig(), log_dir=log_dir, - logfile_name=log_name) - - -def handle_sigint(signum, frame): - global _shutdown - _shutdown = True - logging.info("Shutdown request received.") - - -def initialize(): - logging.info("%s> dispatcher starting", time.strftime("%X %x")) - logging.info("My PID is %d", os.getpid()) - - if utils.program_is_alive(PID_FILE_PREFIX): - logging.critical("monitor_db already running, aborting!") - sys.exit(1) - utils.write_pid(PID_FILE_PREFIX) - - if _testing_mode: - global_config.global_config.override_config_value( - DB_CONFIG_SECTION, 'database', 'stresstest_autotest_web') - - os.environ['PATH'] = AUTOTEST_SERVER_DIR + ':' + os.environ['PATH'] - global _db - _db = database_connection.DatabaseConnection(DB_CONFIG_SECTION) - _db.connect(db_type='django') - - # ensure Django connection is in autocommit - setup_django_environment.enable_autocommit() - # bypass the readonly connection - readonly_connection.ReadOnlyConnection.set_globally_disabled(True) - - logging.info("Setting signal handler") - signal.signal(signal.SIGINT, handle_sigint) - - initialize_globals() - scheduler_models.initialize() - - drones = global_config.global_config.get_config_value( - scheduler_config.CONFIG_SECTION, 
'drones', default='localhost') - drone_list = [hostname.strip() for hostname in drones.split(',')] - results_host = global_config.global_config.get_config_value( - scheduler_config.CONFIG_SECTION, 'results_host', default='localhost') - _drone_manager.initialize(RESULTS_DIR, drone_list, results_host) - - logging.info("Connected! Running...") - - -def initialize_globals(): - global _drone_manager - _drone_manager = drone_manager.instance() - - -def _autoserv_command_line(machines, extra_args, job=None, queue_entry=None, - verbose=True): - """ - @returns The autoserv command line as a list of executable + parameters. - - @param machines - string - A machine or comma separated list of machines - for the (-m) flag. - @param extra_args - list - Additional arguments to pass to autoserv. - @param job - Job object - If supplied, -u owner and -l name parameters - will be added. - @param queue_entry - A HostQueueEntry object - If supplied and no Job - object was supplied, this will be used to lookup the Job object. 
- """ - autoserv_argv = [_autoserv_path, '-p', - '-r', drone_manager.WORKING_DIRECTORY] - if machines: - autoserv_argv += ['-m', machines] - if job or queue_entry: - if not job: - job = queue_entry.job - autoserv_argv += ['-u', job.owner, '-l', job.name] - if verbose: - autoserv_argv.append('--verbose') - return autoserv_argv + extra_args +def main(): + try: + try: + main_without_exception_handling() + except SystemExit: + raise + except: + logging.exception('Exception escaping in scheduler') + raise + finally: + utils.delete_pid_file_if_exists(PID_FILE_PREFIX) class Dispatcher(object): @@ -800,7 +795,6 @@ def _process_recurring_runs(self): host_objects = info['hosts'] one_time_hosts = info['one_time_hosts'] metahost_objects = info['meta_hosts'] - dependencies = info['dependencies'] atomic_group = info['atomic_group'] for host in one_time_hosts or []: @@ -950,7 +944,7 @@ def _get_pidfile_info(self): """ try: self._get_pidfile_info_helper() - except self._PidfileException, exc: + except self._PidfileException: self._handle_pidfile_error('Pidfile error', traceback.format_exc()) @@ -2194,7 +2188,3 @@ def epilog(self): % self.monitor.exit_code()], paired_with_process=paired_process) self._set_all_statuses(self._final_status()) - - -if __name__ == '__main__': - main() diff --git a/scheduler/monitor_db_babysitter b/scheduler/monitor_db_watcher.py old mode 100755 new mode 100644 similarity index 58% rename from scheduler/monitor_db_babysitter rename to scheduler/monitor_db_watcher.py index bfe6568c84..9e1f984482 --- a/scheduler/monitor_db_babysitter +++ b/scheduler/monitor_db_watcher.py @@ -1,35 +1,26 @@ -#!/usr/bin/python -u +""" +Autotest scheduler watcher main library. 
+""" + import os, sys, signal, time, subprocess, logging from optparse import OptionParser try: - import autotest.common + import autotest.common as common except ImportError: import common -from autotest_lib.scheduler import babysitter_logging_config +from autotest_lib.scheduler import watcher_logging_config from autotest_lib.client.common_lib import error, global_config, utils from autotest_lib.client.common_lib import logging_manager from autotest_lib.scheduler import scheduler_logging_config from autotest_lib.scheduler import monitor_db + PAUSE_LENGTH = 60 STALL_TIMEOUT = 2*60*60 -parser = OptionParser() -parser.add_option("-r", action="store_true", dest="recover", - help=("run recovery mode (implicit after any crash)")) -parser.add_option("--background", dest="background", action="store_true", - default=False, help=("runs the scheduler monitor on " - "background")) -(options, args) = parser.parse_args() - autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) results_dir = os.path.join(autodir, 'results') -monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py') -recover = (options.recover == True) - -if len(args) != 0: - parser.print_help() - sys.exit(1) +monitor_db_path = os.path.join(autodir, 'scheduler/autotest-scheduler') def run_banner_output(cmd): @@ -46,7 +37,7 @@ def run_banner_output(cmd): def kill_monitor(): - logging.info("Killing monitor_db") + logging.info("Killing scheduler") # try shutdown first utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT) if utils.program_is_alive(monitor_db.PID_FILE_PREFIX): # was it killed? 
@@ -59,14 +50,14 @@ def kill_monitor(): def handle_sigterm(signum, frame): logging.info('Caught SIGTERM') kill_monitor() - utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX) + utils.delete_pid_file_if_exists(monitor_db.WATCHER_PID_FILE_PREFIX) sys.exit(1) signal.signal(signal.SIGTERM, handle_sigterm) SiteMonitorProc = utils.import_site_class( - __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter', + __file__, 'autotest_lib.scheduler.site_monitor_db_watcher', 'SiteMonitorProc', object) @@ -78,7 +69,6 @@ def __init__(self, do_recovery=False): args.append(results_dir) kill_monitor() - environ = os.environ scheduler_config = scheduler_logging_config.SchedulerLoggingConfig log_name = scheduler_config.get_log_name() os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name @@ -88,7 +78,7 @@ def __init__(self, do_recovery=False): self.log_size = 0 self.last_log_change = time.time() - logging.info("STARTING monitor_db with log file %s" % self.log_path) + logging.info("Starting scheduler with log file %s" % self.log_path) self.args = args # Allow site specific code to run, set environment variables and @@ -103,7 +93,7 @@ def start(self): def is_running(self): if self.proc.poll() is not None: - logging.info("monitor_db DIED") + logging.info("Scheduler died") return False old_size = self.log_size @@ -113,7 +103,7 @@ def is_running(self): self.log_size = new_size self.last_log_change = time.time() elif self.last_log_change + STALL_TIMEOUT < time.time(): - logging.info("monitor_db STALLED") + logging.info("Scheduler stalled") self.collect_stalled_info() return False @@ -142,52 +132,68 @@ def collect_stalled_info(self): log.close() -if os.getuid() == 0: - logging.critical("Running as root, aborting!") - sys.exit(1) - -if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX): - logging.critical("Monitor_db_babysitter already running, aborting!") - sys.exit(1) +def main(): + parser = OptionParser() + parser.add_option("-r", action="store_true", 
dest="recover", + help=("run recovery mode (implicit after any crash)")) + parser.add_option("--background", dest="background", action="store_true", + default=False, help=("runs the scheduler monitor on " + "background")) + (options, args) = parser.parse_args() -utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX) + recover = (options.recover == True) -if options.background: - logging_manager.configure_logging( - babysitter_logging_config.BabysitterLoggingConfig(use_console=False)) - - # Double fork - see http://code.activestate.com/recipes/66012/ - try: - pid = os.fork() - if (pid > 0): - sys.exit(0) # exit from first parent - except OSError, e: - sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror)) + if len(args) != 0: + parser.print_help() sys.exit(1) - # Decouple from parent environment - os.chdir("/") - os.umask(0) - os.setsid() + if os.getuid() == 0: + logging.critical("Running as root, aborting!") + sys.exit(1) - # Second fork - try: - pid = os.fork() - if (pid > 0): - sys.exit(0) # exit from second parent - except OSError, e: - sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror)) + if utils.program_is_alive(monitor_db.WATCHER_PID_FILE_PREFIX): + logging.critical("autotest-monitor-watcher already running, aborting!") sys.exit(1) -else: - logging_manager.configure_logging( - babysitter_logging_config.BabysitterLoggingConfig()) + utils.write_pid(monitor_db.WATCHER_PID_FILE_PREFIX) -while True: - proc = MonitorProc(do_recovery=recover) - proc.start() - time.sleep(PAUSE_LENGTH) - while proc.is_running(): - logging.info("Tick") + if options.background: + logging_manager.configure_logging( + watcher_logging_config.WatcherLoggingConfig(use_console=False)) + + # Double fork - see http://code.activestate.com/recipes/66012/ + try: + pid = os.fork() + if (pid > 0): + sys.exit(0) # exit from first parent + except OSError, e: + sys.stderr.write("fork #1 failed: (%d) %s\n" % + (e.errno, e.strerror)) + sys.exit(1) + + # Decouple from 
parent environment + os.chdir("/") + os.umask(0) + os.setsid() + + # Second fork + try: + pid = os.fork() + if (pid > 0): + sys.exit(0) # exit from second parent + except OSError, e: + sys.stderr.write("fork #2 failed: (%d) %s\n" % + (e.errno, e.strerror)) + sys.exit(1) + else: + logging_manager.configure_logging( + watcher_logging_config.WatcherLoggingConfig()) + + while True: + proc = MonitorProc(do_recovery=recover) + proc.start() time.sleep(PAUSE_LENGTH) - recover = False + while proc.is_running(): + logging.info("Tick") + time.sleep(PAUSE_LENGTH) + recover = False diff --git a/scheduler/babysitter_logging_config.py b/scheduler/watcher_logging_config.py similarity index 60% rename from scheduler/babysitter_logging_config.py rename to scheduler/watcher_logging_config.py index 5a2c48e2d3..8e526d7f97 100644 --- a/scheduler/babysitter_logging_config.py +++ b/scheduler/watcher_logging_config.py @@ -5,14 +5,14 @@ import logging from autotest_lib.client.common_lib import logging_config -class BabysitterLoggingConfig(logging_config.LoggingConfig): +class WatcherLoggingConfig(logging_config.LoggingConfig): def __init__(self, use_console=True): - super(BabysitterLoggingConfig, self).__init__(use_console=use_console) + super(WatcherLoggingConfig, self).__init__(use_console=use_console) def configure_logging(self): - super(BabysitterLoggingConfig, self).configure_logging( + super(WatcherLoggingConfig, self).configure_logging( use_console=self.use_console) - self.add_file_handler(self.get_timestamped_log_name('babysitter'), + self.add_file_handler(self.get_timestamped_log_name('scheduler-watcher'), logging.DEBUG, log_dir=self.get_server_log_dir()) diff --git a/utils/autotest-rh.init b/utils/autotest-rh.init index e5618a0abc..1201b35e88 100755 --- a/utils/autotest-rh.init +++ b/utils/autotest-rh.init @@ -14,7 +14,7 @@ # chkconfig: - 65 25 # description: Autotest is a framework for fully automated testing. 
# processname: monitor_db.py -# pidfile: /var/run/autotest/monitor_db_babysitter.pid +# pidfile: /var/run/autotest/autotest-scheduler-watcher.pid # ### BEGIN INIT INFO # Provides: autotest @@ -37,26 +37,26 @@ LOCKFILE=/var/lock/subsys/$PROG # Autotest paths AUTOTEST_DIR="/usr/local/$PROG" -BABYSITTER="$AUTOTEST_DIR/scheduler/monitor_db_babysitter" -SCHEDULER="$AUTOTEST_DIR/scheduler/monitor_db.py" +WATCHER="$AUTOTEST_DIR/scheduler/autotest-scheduler-watcher" +SCHEDULER="$AUTOTEST_DIR/scheduler/autotest-scheduler" # Scheduler options OPTIONS="--background" # Where to locate PID files PID_PATH="$AUTOTEST_DIR" # "/var/run/$PROG" -BABYSITTER_PIDFILE="$PID_PATH/monitor_db_babysitter.pid" -SCHEDULER_PIDFILE="$PID_PATH/monitor_db.pid" +WATCHER_PIDFILE="$PID_PATH/autotest-scheduler-watcher.pid" +SCHEDULER_PIDFILE="$PID_PATH/autotest-scheduler.pid" # Assume pass RETVAL=0 start() { - [ -f $BABYSITTER ] || exit 5 + [ -f $WATCHER ] || exit 5 echo -n $"Starting $PROG: " - daemon --user $BECOME_USER --check $PROG $BABYSITTER $OPTIONS + daemon --user $BECOME_USER --check $PROG $WATCHER $OPTIONS RETVAL=$? echo [ "$RETVAL" = 0 ] && touch $LOCKFILE @@ -67,13 +67,13 @@ stop() { echo -n $"Stopping $PROG: " - killproc $BABYSITTER + killproc $WATCHER RETVAL=$? echo if [ "$RETVAL" = 0 ]; then rm -f $LOCKFILE - rm -f $BABYSITTER_PIDFILE + rm -f $WATCHER_PIDFILE rm -f $SCHEDULER_PIDFILE fi return $RETVAL @@ -82,7 +82,7 @@ stop() reload() { echo -n $"Reloading $PROG: " - killproc -p $BABYSITTER_PIDFILE $PROG -HUP + killproc -p $WATCHER_PIDFILE $PROG -HUP RETVAL=$? echo return $RETVAL @@ -124,7 +124,7 @@ case "$1" in ;; status) # status -p $PIDFILE $PROG - status $BABYSITTER + status $WATCHER status $SCHEDULER RETVAL=$?
;; diff --git a/utils/autotest.init b/utils/autotest.init index 78ed9a86fe..fe6ad19646 100755 --- a/utils/autotest.init +++ b/utils/autotest.init @@ -21,22 +21,22 @@ fi autotest_start() { cd /tmp - log_daemon_msg "Starting monitor_db_babysitter" + log_daemon_msg "Starting autotest-scheduler-watcher" ( ulimit -v 2048000 ; \ start-stop-daemon --start --quiet --chuid $BECOME_USER \ - --background --exec $BASE_DIR/scheduler/monitor_db_babysitter ) + --background --exec $BASE_DIR/scheduler/autotest-scheduler-watcher ) } stop_daemon() { PID_NAME=$1 DAEMON_NAME=$2 - log_daemon_msg "Stopping $DAEMON_NAME" + log_daemon_msg "Stopping autotest $DAEMON_NAME" start-stop-daemon --stop --quiet --pidfile $BASE_DIR/$PID_NAME.pid } autotest_stop() { - stop_daemon monitor_db_babysitter babysitter - stop_daemon monitor_db scheduler + stop_daemon autotest-scheduler-watcher scheduler-watcher + stop_daemon autotest-scheduler scheduler } case "$1" in diff --git a/utils/autotestd.service b/utils/autotestd.service index 0152ec2e35..b67dd60662 100644 --- a/utils/autotestd.service +++ b/utils/autotestd.service @@ -2,7 +2,7 @@ Description=Autotest scheduler [Service] -ExecStart=/usr/local/autotest/scheduler/monitor_db_babysitter +ExecStart=/usr/local/autotest/scheduler/autotest-scheduler-watcher User=autotest Group=autotest Restart=on-abort