xref: /aosp_15_r20/external/autotest/site_utils/lxc_cleanup.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1*9c5db199SXin Li#!/usr/bin/env python3
2*9c5db199SXin Li# Copyright 2015 The Chromium Authors. All rights reserved.
3*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
4*9c5db199SXin Li# found in the LICENSE file.
5*9c5db199SXin Li
6*9c5db199SXin Li"""Cleanup orphaned containers.
7*9c5db199SXin Li
If an autoserv process dies before its SIGTERM handler can run, the container
used to run the test is orphaned. Orphaned containers add overhead to the
drone. This script cleans up such containers.
11*9c5db199SXin Li
This module also checks whether the test job associated with a container has
finished. If so, it kills the autoserv process for the test job and destroys
the container. To avoid a race condition, this only applies to jobs that
finished at least 1 hour ago.
16*9c5db199SXin Li
17*9c5db199SXin Li"""
18*9c5db199SXin Li
19*9c5db199SXin Liimport argparse
20*9c5db199SXin Liimport datetime
21*9c5db199SXin Liimport logging
22*9c5db199SXin Liimport os
23*9c5db199SXin Liimport signal
24*9c5db199SXin Li
25*9c5db199SXin Liimport common
26*9c5db199SXin Lifrom autotest_lib.client.common_lib import logging_config
27*9c5db199SXin Lifrom autotest_lib.client.common_lib import time_utils
28*9c5db199SXin Lifrom autotest_lib.client.common_lib import utils
29*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
30*9c5db199SXin Lifrom autotest_lib.site_utils import lxc
31*9c5db199SXin Li
32*9c5db199SXin Li
# Module-wide AFE client; is_container_orphaned() uses it to look up the host
# queue entries of a container's test job.
# NOTE(review): timeout_min/delay_sec presumably bound the retry window and
# the pause between attempts -- confirm against frontend_wrappers.RetryingAFE.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
# The value is computed once, when this module is loaded, as a naive local
# datetime; it is not refreshed while the script runs.
# NOTE(review): it is compared against time_utils.time_string_to_datetime()
# results, which are assumed to be naive local datetimes too -- confirm.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)
38*9c5db199SXin Li
def is_container_orphaned(container):
    """Decide whether a test container has been left behind.

    A container is reported as orphaned when either of these holds:
    1. The container ID records a PID and that process is no longer alive.
    2. The test job has host queue entries, every one of them is complete and
       inactive, and none has a finished_on time newer than
       FINISHED_JOB_CUTOFF_TIME.

    A container without an ID, a job without host queue entries, and a failed
    AFE lookup are all reported as not orphaned.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    name = container.name
    logging.debug('Checking if container is orphaned: %s', name)

    container_id = container.id
    if container_id is None:
        logging.debug('Container %s is not created for test.', name)
        return False

    job_id = container_id.job_id
    autoserv_pid = container_id.pid

    # A recorded PID whose process is gone settles the question immediately,
    # without asking the AFE.
    if autoserv_pid:
        if not utils.pid_is_alive(autoserv_pid):
            logging.debug('Process with PID %s is not alive, container %s is '
                          'orphaned.', autoserv_pid, name)
            return True

    try:
        entries = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as err:
        # Without job information, err on the side of keeping the container.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, err)
        return False

    if not entries:
        # The job has not run yet.
        return False

    for entry in entries:
        job_done = not entry.active and entry.complete
        if not job_done:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, name)
            return False

        finished_on = entry.finished_on
        if not finished_on:
            # No finish time recorded; nothing to compare with the cutoff.
            continue
        finished_at = time_utils.time_string_to_datetime(finished_on)
        if finished_at > FINISHED_JOB_CUTOFF_TIME:
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, name)
    return True
88*9c5db199SXin Li
89*9c5db199SXin Li
def cleanup(container, options):
    """Kill the owning autoserv process and destroy an orphaned container.

    When options.execute is not set, this only logs what would be done.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup; only options.execute is read.

    @return: True if the container was destroyed. False on a dry run or when
             any step raised an exception.

    """
    if options.execute:
        try:
            # Callers are expected to have vetted the container with
            # is_container_orphaned, so its ID should be valid here.
            autoserv_pid = container.id.pid
            # Stop autoserv first, if it is still around.
            if autoserv_pid and utils.pid_is_alive(autoserv_pid):
                logging.info('Stopping process %s...', autoserv_pid)
                utils.nuke_pid(int(autoserv_pid), (signal.SIGKILL,))

            # Then remove the container itself.
            logging.info('Destroying container %s...', container.name)
            container.destroy()
        except Exception as err:
            logging.error('Failed to cleanup container %s. Error: %s',
                          container.name, err)
            return False
        return True

    logging.info('dryrun: Cleanup container %s', container.name)
    return False
120*9c5db199SXin Li
121*9c5db199SXin Li
def parse_options():
    """Parse the command line of the script.

    Recognized flags: -v/--verbose, -x/--execute and -l/--logfile.

    @return: Options to run the script.
    """
    execute_help = ('Execute the actions to kill autoserv processes '
                    'and destroy containers. Default is False to do '
                    'dry run')

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='Print out ALL entries.')
    arg_parser.add_argument('-x', '--execute', action='store_true',
                            help=execute_help)
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    arg_parser.add_argument('-l', '--logfile', type=str,
                            help='Path to the log file to save logs.')
    return arg_parser.parse_args()
143*9c5db199SXin Li
144*9c5db199SXin Li
def main(options):
    """Main script.

    Sets up file logging if requested, walks every container in the container
    bucket, cleans up the orphaned ones and logs a summary of the outcome.

    @param options: Options to run the script. Reads options.logfile,
                    options.verbose and options.execute.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if not is_container_orphaned(container):
            continue
        if cleanup(container, options):
            success_count += 1
        else:
            failure_count += 1

    # Report the tallies; previously they were collected but never surfaced.
    if options.execute:
        logging.info('Destroyed %d orphaned container(s), failed to clean up '
                     '%d.', success_count, failure_count)
    else:
        # cleanup() returns False for every container on a dry run, so the
        # sum of both counters is simply the number of orphans found.
        logging.info('dryrun: Found %d orphaned container(s).',
                     success_count + failure_count)
    logging.info('Cleanup finished.')
168*9c5db199SXin Li
169*9c5db199SXin Li
# Script entry point: parse the command line and run the cleanup. Nothing
# here runs when the module is merely imported.
if __name__ == '__main__':
    options = parse_options()
    main(options)
173