#!/usr/bin/env python3
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Cleanup orphaned containers.

If an autoserv process dies without being able to call handler of SIGTERM, the
container used to run the test will be orphaned. This adds overhead to the
drone. This script is used to clean up such containers.

This module also checks if the test job associated with a container has
finished. If so, kill the autoserv process for the test job and destroy the
container. To avoid a race condition, this only applies to jobs finished at
least 1 hour ago.

"""

import argparse
import datetime
import logging
import os
import signal

import common
from autotest_lib.client.common_lib import logging_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import lxc


AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
# Note: this is evaluated once, when the module is loaded.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)


def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these conditions is True:
    1. The autoserv process that created the container is no longer running.
    2. The test job finished at least 1 hour ago.

    Containers without an ID, jobs whose host queue entries cannot be
    fetched, and jobs without any host queue entry are reported as not
    orphaned, so that they are left untouched.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    if container.id is None:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    job_id = container.id.job_id
    pid = container.id.pid

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Be conservative: if the job state is unknown, keep the container.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False

    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True


def cleanup(container, options):
    """Cleanup orphaned container.

    Kills the autoserv process recorded in the container ID if it is still
    alive, then destroys the container. In dry-run mode (options.execute is
    False) nothing is changed and only a log message is emitted.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if cleanup is successful. False otherwise, including when
             running in dry-run mode.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        # cleanup is protected by is_container_orphaned. At this point the
        # container may be assumed to have a valid ID.
        pid = container.id.pid
        # Kill autoserv process
        if pid and utils.pid_is_alive(pid):
            logging.info('Stopping process %s...', pid)
            utils.nuke_pid(int(pid), (signal.SIGKILL,))

        # Destroy container
        logging.info('Destroying container %s...', container.name)
        container.destroy()
        return True
    except Exception as e:
        # Best effort: a failure on one container is logged and reported to
        # the caller instead of aborting the whole run.
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False


def parse_options():
    """Parse command line inputs.

    @return: Options to run the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False,
                        help='Print out ALL entries.')
    parser.add_argument('-x', '--execute', action='store_true',
                        default=False,
                        help=('Execute the actions to kill autoserv processes '
                              'and destroy containers. Default is False to do '
                              'dry run'))
    # TODO(dshi): Consider adopting the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', type=str,
                        default=None,
                        help='Path to the log file to save logs.')
    return parser.parse_args()


def main(options):
    """Main script.

    Walks every container in the container bucket, cleans up the orphaned
    ones and logs a summary of the outcome.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    orphaned_count = 0
    success_count = 0
    for container in bucket.get_all().values():
        if not is_container_orphaned(container):
            continue
        orphaned_count += 1
        if cleanup(container, options):
            success_count += 1

    if options.execute:
        failure_count = orphaned_count - success_count
        logging.info('%d orphaned container(s) cleaned up, %d failed.',
                     success_count, failure_count)
    else:
        # cleanup() returns False in dry-run mode; that is not a failure.
        logging.info('dryrun: %d orphaned container(s) would be cleaned up.',
                     orphaned_count)
    logging.info('Cleanup finished.')


if __name__ == '__main__':
    options = parse_options()
    main(options)