diff --git a/scheduler/autotest-scheduler b/scheduler/autotest-scheduler new file mode 100755 index 0000000000..ab36eac594 --- /dev/null +++ b/scheduler/autotest-scheduler @@ -0,0 +1,13 @@ +#!/usr/bin/python -u +""" +Autotest scheduler. +""" +try: + import autotest.common as common +except ImportError: + import common + +from autotest_lib.scheduler import monitor_db + +if __name__ == '__main__': + monitor_db.main() diff --git a/scheduler/autotest-scheduler-watcher b/scheduler/autotest-scheduler-watcher new file mode 100755 index 0000000000..aaaa7c29a3 --- /dev/null +++ b/scheduler/autotest-scheduler-watcher @@ -0,0 +1,13 @@ +#!/usr/bin/python -u +""" +Autotest scheduler watcher. +""" +try: + import autotest.common +except ImportError: + import common + +from autotest_lib.scheduler import monitor_db_watcher + +if __name__ == '__main__': + monitor_db_watcher.main() diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py old mode 100755 new mode 100644 index 7230f3c885..e4d5c208e1 --- a/scheduler/monitor_db.py +++ b/scheduler/monitor_db.py @@ -1,19 +1,12 @@ -#!/usr/bin/python -u - """ -Autotest scheduler +Autotest scheduler main library. 
""" - - try: import autotest.common as common except ImportError: import common -import datetime, errno, optparse, os, pwd, Queue, re, shutil, signal -import smtplib, socket, stat, subprocess, sys, tempfile, time, traceback, urllib -import itertools, logging, weakref, gc - -import MySQLdb +import datetime, optparse, os, signal, sys, time, traceback, urllib +import logging, gc from autotest_lib.scheduler import scheduler_logging_config from autotest_lib.frontend import setup_django_environment @@ -29,8 +22,9 @@ from autotest_lib.scheduler import gc_stats, host_scheduler, monitor_db_cleanup from autotest_lib.scheduler import status_server, scheduler_config from autotest_lib.scheduler import scheduler_models -BABYSITTER_PID_FILE_PREFIX = 'monitor_db_babysitter' -PID_FILE_PREFIX = 'monitor_db' + +WATCHER_PID_FILE_PREFIX = 'autotest-scheduler-watcher' +PID_FILE_PREFIX = 'autotest-scheduler' RESULTS_DIR = '.' AUTOSERV_NICE_LEVEL = 10 @@ -58,6 +52,7 @@ _testing_mode = False _drone_manager = None + def _parser_path_default(install_dir): return os.path.join(install_dir, 'tko', 'parse') _parser_path_func = utils.import_site_function( @@ -73,6 +68,32 @@ def _get_pidfile_timeout_secs(): return pidfile_timeout_mins * 60 +def _autoserv_command_line(machines, extra_args, job=None, + queue_entry=None, verbose=True): + """ + @returns The autoserv command line as a list of executable + parameters. + + @param machines - string - A machine or comma separated list of machines + for the (-m) flag. + @param extra_args - list - Additional arguments to pass to autoserv. + @param job - Job object - If supplied, -u owner and -l name parameters + will be added. + @param queue_entry - A HostQueueEntry object - If supplied and no Job + object was supplied, this will be used to lookup the Job object. 
+ """ + autoserv_argv = [_autoserv_path, '-p', + '-r', drone_manager.WORKING_DIRECTORY] + if machines: + autoserv_argv += ['-m', machines] + if job or queue_entry: + if not job: + job = queue_entry.job + autoserv_argv += ['-u', job.owner, '-l', job.name] + if verbose: + autoserv_argv.append('--verbose') + return autoserv_argv + extra_args + + def _site_init_monitor_db_dummy(): return {} @@ -89,17 +110,62 @@ def _sanity_check(): _verify_default_drone_set_exists() -def main(): - try: - try: - main_without_exception_handling() - except SystemExit: - raise - except: - logging.exception('Exception escaping in monitor_db') - raise - finally: - utils.delete_pid_file_if_exists(PID_FILE_PREFIX) +def setup_logging(): + log_dir = os.environ.get('AUTOTEST_SCHEDULER_LOG_DIR', None) + log_name = os.environ.get('AUTOTEST_SCHEDULER_LOG_NAME', None) + logging_manager.configure_logging( + scheduler_logging_config.SchedulerLoggingConfig(), log_dir=log_dir, + logfile_name=log_name) + + +def handle_sigint(signum, frame): + global _shutdown + _shutdown = True + logging.info("Shutdown request received.") + + +def initialize(): + logging.info("%s> dispatcher starting", time.strftime("%X %x")) + logging.info("My PID is %d", os.getpid()) + + if utils.program_is_alive(PID_FILE_PREFIX): + logging.critical("scheduler already running, aborting!") + sys.exit(1) + utils.write_pid(PID_FILE_PREFIX) + + if _testing_mode: + global_config.global_config.override_config_value( + DB_CONFIG_SECTION, 'database', 'stresstest_autotest_web') + + os.environ['PATH'] = AUTOTEST_SERVER_DIR + ':' + os.environ['PATH'] + global _db + _db = database_connection.DatabaseConnection(DB_CONFIG_SECTION) + _db.connect(db_type='django') + + # ensure Django connection is in autocommit + setup_django_environment.enable_autocommit() + # bypass the readonly connection + readonly_connection.ReadOnlyConnection.set_globally_disabled(True) + + logging.info("Setting signal handler") + signal.signal(signal.SIGINT, handle_sigint) + + 
initialize_globals() + scheduler_models.initialize() + + drones = global_config.global_config.get_config_value( + scheduler_config.CONFIG_SECTION, 'drones', default='localhost') + drone_list = [hostname.strip() for hostname in drones.split(',')] + results_host = global_config.global_config.get_config_value( + scheduler_config.CONFIG_SECTION, 'results_host', default='localhost') + _drone_manager.initialize(RESULTS_DIR, drone_list, results_host) + + logging.info("Connected! Running...") + + +def initialize_globals(): + global _drone_manager + _drone_manager = drone_manager.instance() def main_without_exception_handling(): @@ -162,7 +228,7 @@ def main_without_exception_handling(): time.sleep(scheduler_config.config.tick_pause_sec) except: email_manager.manager.log_stacktrace( - "Uncaught exception; terminating monitor_db") + "Uncaught exception; terminating scheduler") email_manager.manager.send_queued_emails() server.shutdown() @@ -170,88 +236,17 @@ def main_without_exception_handling(): _db.disconnect() -def setup_logging(): - log_dir = os.environ.get('AUTOTEST_SCHEDULER_LOG_DIR', None) - log_name = os.environ.get('AUTOTEST_SCHEDULER_LOG_NAME', None) - logging_manager.configure_logging( - scheduler_logging_config.SchedulerLoggingConfig(), log_dir=log_dir, - logfile_name=log_name) - - -def handle_sigint(signum, frame): - global _shutdown - _shutdown = True - logging.info("Shutdown request received.") - - -def initialize(): - logging.info("%s> dispatcher starting", time.strftime("%X %x")) - logging.info("My PID is %d", os.getpid()) - - if utils.program_is_alive(PID_FILE_PREFIX): - logging.critical("monitor_db already running, aborting!") - sys.exit(1) - utils.write_pid(PID_FILE_PREFIX) - - if _testing_mode: - global_config.global_config.override_config_value( - DB_CONFIG_SECTION, 'database', 'stresstest_autotest_web') - - os.environ['PATH'] = AUTOTEST_SERVER_DIR + ':' + os.environ['PATH'] - global _db - _db = database_connection.DatabaseConnection(DB_CONFIG_SECTION) - 
_db.connect(db_type='django') - - # ensure Django connection is in autocommit - setup_django_environment.enable_autocommit() - # bypass the readonly connection - readonly_connection.ReadOnlyConnection.set_globally_disabled(True) - - logging.info("Setting signal handler") - signal.signal(signal.SIGINT, handle_sigint) - - initialize_globals() - scheduler_models.initialize() - - drones = global_config.global_config.get_config_value( - scheduler_config.CONFIG_SECTION, 'drones', default='localhost') - drone_list = [hostname.strip() for hostname in drones.split(',')] - results_host = global_config.global_config.get_config_value( - scheduler_config.CONFIG_SECTION, 'results_host', default='localhost') - _drone_manager.initialize(RESULTS_DIR, drone_list, results_host) - - logging.info("Connected! Running...") - - -def initialize_globals(): - global _drone_manager - _drone_manager = drone_manager.instance() - - -def _autoserv_command_line(machines, extra_args, job=None, queue_entry=None, - verbose=True): - """ - @returns The autoserv command line as a list of executable + parameters. - - @param machines - string - A machine or comma separated list of machines - for the (-m) flag. - @param extra_args - list - Additional arguments to pass to autoserv. - @param job - Job object - If supplied, -u owner and -l name parameters - will be added. - @param queue_entry - A HostQueueEntry object - If supplied and no Job - object was supplied, this will be used to lookup the Job object. 
- """ - autoserv_argv = [_autoserv_path, '-p', - '-r', drone_manager.WORKING_DIRECTORY] - if machines: - autoserv_argv += ['-m', machines] - if job or queue_entry: - if not job: - job = queue_entry.job - autoserv_argv += ['-u', job.owner, '-l', job.name] - if verbose: - autoserv_argv.append('--verbose') - return autoserv_argv + extra_args +def main(): + try: + try: + main_without_exception_handling() + except SystemExit: + raise + except: + logging.exception('Exception escaping in scheduler') + raise + finally: + utils.delete_pid_file_if_exists(PID_FILE_PREFIX) class Dispatcher(object): @@ -800,7 +795,6 @@ def _process_recurring_runs(self): host_objects = info['hosts'] one_time_hosts = info['one_time_hosts'] metahost_objects = info['meta_hosts'] - dependencies = info['dependencies'] atomic_group = info['atomic_group'] for host in one_time_hosts or []: @@ -950,7 +944,7 @@ def _get_pidfile_info(self): """ try: self._get_pidfile_info_helper() - except self._PidfileException, exc: + except self._PidfileException: self._handle_pidfile_error('Pidfile error', traceback.format_exc()) @@ -2194,7 +2188,3 @@ def epilog(self): % self.monitor.exit_code()], paired_with_process=paired_process) self._set_all_statuses(self._final_status()) - - -if __name__ == '__main__': - main() diff --git a/scheduler/monitor_db_babysitter b/scheduler/monitor_db_watcher.py old mode 100755 new mode 100644 similarity index 58% rename from scheduler/monitor_db_babysitter rename to scheduler/monitor_db_watcher.py index bfe6568c84..9e1f984482 --- a/scheduler/monitor_db_babysitter +++ b/scheduler/monitor_db_watcher.py @@ -1,35 +1,26 @@ -#!/usr/bin/python -u +""" +Autotest scheduler watcher main library. 
+""" + import os, sys, signal, time, subprocess, logging from optparse import OptionParser try: - import autotest.common + import autotest.common as common except ImportError: import common -from autotest_lib.scheduler import babysitter_logging_config +from autotest_lib.scheduler import watcher_logging_config from autotest_lib.client.common_lib import error, global_config, utils from autotest_lib.client.common_lib import logging_manager from autotest_lib.scheduler import scheduler_logging_config from autotest_lib.scheduler import monitor_db + PAUSE_LENGTH = 60 STALL_TIMEOUT = 2*60*60 -parser = OptionParser() -parser.add_option("-r", action="store_true", dest="recover", - help=("run recovery mode (implicit after any crash)")) -parser.add_option("--background", dest="background", action="store_true", - default=False, help=("runs the scheduler monitor on " - "background")) -(options, args) = parser.parse_args() - autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) results_dir = os.path.join(autodir, 'results') -monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py') -recover = (options.recover == True) - -if len(args) != 0: - parser.print_help() - sys.exit(1) +monitor_db_path = os.path.join(autodir, 'scheduler/autotest-scheduler') def run_banner_output(cmd): @@ -46,7 +37,7 @@ def run_banner_output(cmd): def kill_monitor(): - logging.info("Killing monitor_db") + logging.info("Killing scheduler") # try shutdown first utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT) if utils.program_is_alive(monitor_db.PID_FILE_PREFIX): # was it killed? 
@@ -59,14 +50,14 @@ def kill_monitor(): def handle_sigterm(signum, frame): logging.info('Caught SIGTERM') kill_monitor() - utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX) + utils.delete_pid_file_if_exists(monitor_db.WATCHER_PID_FILE_PREFIX) sys.exit(1) signal.signal(signal.SIGTERM, handle_sigterm) SiteMonitorProc = utils.import_site_class( - __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter', + __file__, 'autotest_lib.scheduler.site_monitor_db_watcher', 'SiteMonitorProc', object) @@ -78,7 +69,6 @@ def __init__(self, do_recovery=False): args.append(results_dir) kill_monitor() - environ = os.environ scheduler_config = scheduler_logging_config.SchedulerLoggingConfig log_name = scheduler_config.get_log_name() os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name @@ -88,7 +78,7 @@ def __init__(self, do_recovery=False): self.log_size = 0 self.last_log_change = time.time() - logging.info("STARTING monitor_db with log file %s" % self.log_path) + logging.info("Starting scheduler with log file %s" % self.log_path) self.args = args # Allow site specific code to run, set environment variables and @@ -103,7 +93,7 @@ def start(self): def is_running(self): if self.proc.poll() is not None: - logging.info("monitor_db DIED") + logging.info("Scheduler died") return False old_size = self.log_size @@ -113,7 +103,7 @@ def is_running(self): self.log_size = new_size self.last_log_change = time.time() elif self.last_log_change + STALL_TIMEOUT < time.time(): - logging.info("monitor_db STALLED") + logging.info("Scheduler stalled") self.collect_stalled_info() return False @@ -142,52 +132,68 @@ def collect_stalled_info(self): log.close() -if os.getuid() == 0: - logging.critical("Running as root, aborting!") - sys.exit(1) - -if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX): - logging.critical("Monitor_db_babysitter already running, aborting!") - sys.exit(1) +def main(): + parser = OptionParser() + parser.add_option("-r", action="store_true", 
dest="recover", + help=("run recovery mode (implicit after any crash)")) + parser.add_option("--background", dest="background", action="store_true", + default=False, help=("runs the scheduler monitor on " + "background")) + (options, args) = parser.parse_args() -utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX) + recover = (options.recover == True) -if options.background: - logging_manager.configure_logging( - babysitter_logging_config.BabysitterLoggingConfig(use_console=False)) - - # Double fork - see http://code.activestate.com/recipes/66012/ - try: - pid = os.fork() - if (pid > 0): - sys.exit(0) # exit from first parent - except OSError, e: - sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror)) + if len(args) != 0: + parser.print_help() sys.exit(1) - # Decouple from parent environment - os.chdir("/") - os.umask(0) - os.setsid() + if os.getuid() == 0: + logging.critical("Running as root, aborting!") + sys.exit(1) - # Second fork - try: - pid = os.fork() - if (pid > 0): - sys.exit(0) # exit from second parent - except OSError, e: - sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror)) + if utils.program_is_alive(monitor_db.WATCHER_PID_FILE_PREFIX): + logging.critical("autotest-monitor-watcher already running, aborting!") sys.exit(1) -else: - logging_manager.configure_logging( - babysitter_logging_config.BabysitterLoggingConfig()) + utils.write_pid(monitor_db.WATCHER_PID_FILE_PREFIX) -while True: - proc = MonitorProc(do_recovery=recover) - proc.start() - time.sleep(PAUSE_LENGTH) - while proc.is_running(): - logging.info("Tick") + if options.background: + logging_manager.configure_logging( + watcher_logging_config.WatcherLoggingConfig(use_console=False)) + + # Double fork - see http://code.activestate.com/recipes/66012/ + try: + pid = os.fork() + if (pid > 0): + sys.exit(0) # exit from first parent + except OSError, e: + sys.stderr.write("fork #1 failed: (%d) %s\n" % + (e.errno, e.strerror)) + sys.exit(1) + + # Decouple from 
parent environment + os.chdir("/") + os.umask(0) + os.setsid() + + # Second fork + try: + pid = os.fork() + if (pid > 0): + sys.exit(0) # exit from second parent + except OSError, e: + sys.stderr.write("fork #2 failed: (%d) %s\n" % + (e.errno, e.strerror)) + sys.exit(1) + else: + logging_manager.configure_logging( + watcher_logging_config.WatcherLoggingConfig()) + + while True: + proc = MonitorProc(do_recovery=recover) + proc.start() time.sleep(PAUSE_LENGTH) - recover = False + while proc.is_running(): + logging.info("Tick") + time.sleep(PAUSE_LENGTH) + recover = False diff --git a/scheduler/babysitter_logging_config.py b/scheduler/watcher_logging_config.py similarity index 60% rename from scheduler/babysitter_logging_config.py rename to scheduler/watcher_logging_config.py index 5a2c48e2d3..8e526d7f97 100644 --- a/scheduler/babysitter_logging_config.py +++ b/scheduler/watcher_logging_config.py @@ -5,14 +5,14 @@ import logging from autotest_lib.client.common_lib import logging_config -class BabysitterLoggingConfig(logging_config.LoggingConfig): +class WatcherLoggingConfig(logging_config.LoggingConfig): def __init__(self, use_console=True): - super(BabysitterLoggingConfig, self).__init__(use_console=use_console) + super(WatcherLoggingConfig, self).__init__(use_console=use_console) def configure_logging(self): - super(BabysitterLoggingConfig, self).configure_logging( + super(WatcherLoggingConfig, self).configure_logging( use_console=self.use_console) - self.add_file_handler(self.get_timestamped_log_name('babysitter'), + self.add_file_handler(self.get_timestamped_log_name('scheduler-watcher'), logging.DEBUG, log_dir=self.get_server_log_dir()) diff --git a/utils/autotest-rh.init b/utils/autotest-rh.init index e5618a0abc..1201b35e88 100755 --- a/utils/autotest-rh.init +++ b/utils/autotest-rh.init @@ -14,7 +14,7 @@ # chkconfig: - 65 25 # description: Autotest is a framework for fully automated testing. 
# processname: monitor_db.py -# pidfile: /var/run/autotest/monitor_db_babysitter.pid +# pidfile: /var/run/autotest/autotest-scheduler-watcher.pid # ### BEGIN INIT INFO # Provides: autotest @@ -37,26 +37,26 @@ LOCKFILE=/var/lock/subsys/$PROG # Autotest paths AUTOTEST_DIR="/usr/local/$PROG" -BABYSITTER="$AUTOTEST_DIR/scheduler/monitor_db_babysitter" -SCHEDULER="$AUTOTEST_DIR/scheduler/monitor_db.py" +WATCHER="$AUTOTEST_DIR/scheduler/autotest-scheduler-watcher" +SCHEDULER="$AUTOTEST_DIR/scheduler/autotest-scheduler" # Scheduler options OPTIONS="--background" # Where to locate PID files PID_PATH="$AUTOTEST_DIR" # "/var/run/$PROG" -BABYSITTER_PIDFILE="$PID_PATH/monitor_db_babysitter.pid" -SCHEDULER_PIDFILE="$PID_PATH/monitor_db.pid" +WATCHER_PIDFILE="$PID_PATH/autotest-scheduler-watcher.pid" +SCHEDULER_PIDFILE="$PID_PATH/autotest-scheduler.pid" # Assume pass RETVAL=0 start() { - [ -f $BABYSITTER ] || exit 5 + [ -f $WATCHER ] || exit 5 echo -n $"Starting $PROG: " - daemon --user $BECOME_USER --check $PROG $BABYSITTER $OPTIONS + daemon --user $BECOME_USER --check $PROG $WATCHER $OPTIONS RETVAL=$? echo [ "$RETVAL" = 0 ] && touch $LOCKFILE @@ -67,13 +67,13 @@ stop() { echo -n $"Stopping $PROG: " - killproc $BABYSITTER + killproc $WATCHER RETVAL=$? echo if [ "$RETVAL" = 0 ]; then rm -f $LOCKFILE - rm -f $BABYSITTER_PIDFILE + rm -f $WATCHER_PIDFILE rm -f $SCHEDULER_PIDFILE fi return $RETVAL @@ -82,7 +82,7 @@ stop() reload() { echo -n $"Reloading $PROG: " - killproc -p $BABYSITTER_PIDFILE $PROG -HUP + killproc -p $WATCHER_PIDFILE $PROG -HUP RETVAL=$? echo return $RETVAL @@ -124,7 +124,7 @@ case "$1" in ;; status) # status -p $PIDFILE $PROG - status $BABYSITTER + status $WATCHER status $SCHEDULER RETVAL=$?
;; diff --git a/utils/autotest.init b/utils/autotest.init index 78ed9a86fe..fe6ad19646 100755 --- a/utils/autotest.init +++ b/utils/autotest.init @@ -21,22 +21,22 @@ fi autotest_start() { cd /tmp - log_daemon_msg "Starting monitor_db_babysitter" + log_daemon_msg "Starting autotest-scheduler-watcher" ( ulimit -v 2048000 ; \ start-stop-daemon --start --quiet --chuid $BECOME_USER \ - --background --exec $BASE_DIR/scheduler/monitor_db_babysitter ) + --background --exec $BASE_DIR/scheduler/autotest-scheduler-watcher ) } stop_daemon() { PID_NAME=$1 DAEMON_NAME=$2 - log_daemon_msg "Stopping $DAEMON_NAME" + log_daemon_msg "Stopping autotest $DAEMON_NAME" start-stop-daemon --stop --quiet --pidfile $BASE_DIR/$PID_NAME.pid } autotest_stop() { - stop_daemon monitor_db_babysitter babysitter - stop_daemon monitor_db scheduler + stop_daemon autotest-scheduler-watcher scheduler-watcher + stop_daemon autotest-scheduler scheduler } case "$1" in diff --git a/utils/autotestd.service b/utils/autotestd.service index 0152ec2e35..b67dd60662 100644 --- a/utils/autotestd.service +++ b/utils/autotestd.service @@ -2,7 +2,7 @@ Description=Autotest scheduler [Service] -ExecStart=/usr/local/autotest/scheduler/monitor_db_babysitter +ExecStart=/usr/local/autotest/scheduler/autotest-scheduler-watcher User=autotest Group=autotest Restart=on-abort