xref: /aosp_15_r20/external/autotest/server/cros/tradefed/tradefed_utils.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1*9c5db199SXin Li# Lint as: python2, python3
2*9c5db199SXin Li# Copyright 2018 The Chromium OS Authors. All rights reserved.
3*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
4*9c5db199SXin Li# found in the LICENSE file.
5*9c5db199SXin Li
6*9c5db199SXin Liimport contextlib
7*9c5db199SXin Liimport logging
8*9c5db199SXin Liimport os
9*9c5db199SXin Liimport random
10*9c5db199SXin Liimport re
11*9c5db199SXin Lifrom xml.etree import ElementTree
12*9c5db199SXin Li
13*9c5db199SXin Lifrom autotest_lib.client.common_lib import utils as common_utils
14*9c5db199SXin Lifrom autotest_lib.client.common_lib import error
15*9c5db199SXin Lifrom autotest_lib.server.cros import lockfile
16*9c5db199SXin Li
17*9c5db199SXin LiPERF_MODULE_NAME_PREFIX = 'CTS.'
18*9c5db199SXin Li
19*9c5db199SXin Li@contextlib.contextmanager
20*9c5db199SXin Lidef lock(filename):
21*9c5db199SXin Li    """Prevents other autotest/tradefed instances from accessing cache.
22*9c5db199SXin Li
23*9c5db199SXin Li    @param filename: The file to be locked.
24*9c5db199SXin Li    """
25*9c5db199SXin Li    filelock = lockfile.FileLock(filename)
26*9c5db199SXin Li    # It is tempting just to call filelock.acquire(3600). But the implementation
27*9c5db199SXin Li    # has very poor temporal granularity (timeout/10), which is unsuitable for
28*9c5db199SXin Li    # our needs. See /usr/lib64/python2.7/site-packages/lockfile/
29*9c5db199SXin Li    attempts = 0
30*9c5db199SXin Li    total_wait_seconds = 0
31*9c5db199SXin Li    while not filelock.i_am_locking():
32*9c5db199SXin Li        try:
33*9c5db199SXin Li            attempts += 1
34*9c5db199SXin Li            logging.info('Waiting for cache lock...')
35*9c5db199SXin Li            # Cap the wait by 2 minutes. Setting too long wait makes long-lived
36*9c5db199SXin Li            # tasks to have less chance to obtain the lock, leading to failures.
37*9c5db199SXin Li            # We must not use a random integer as the filelock implementations
38*9c5db199SXin Li            # may underflow an integer division.
39*9c5db199SXin Li            wait = min(120.0, random.uniform(0.0, pow(2.0, attempts)))
40*9c5db199SXin Li            total_wait_seconds += wait
41*9c5db199SXin Li            filelock.acquire(wait)
42*9c5db199SXin Li        except (lockfile.AlreadyLocked, lockfile.LockTimeout):
43*9c5db199SXin Li            # Our goal is to wait long enough to be sure something very bad
44*9c5db199SXin Li            # happened to the locking thread. Wait 2 hours at maximum
45*9c5db199SXin Li            if total_wait_seconds >= 7200:
46*9c5db199SXin Li                # Normally we should acquire the lock immediately. Once we
47*9c5db199SXin Li                # wait on the order of 10 minutes either the dev server IO is
48*9c5db199SXin Li                # overloaded or a lock didn't get cleaned up. Take one for the
49*9c5db199SXin Li                # team, break the lock and report a failure. This should fix
50*9c5db199SXin Li                # the lock for following tests. If the failure affects more than
51*9c5db199SXin Li                # one job look for a deadlock or dev server overload.
52*9c5db199SXin Li                age = filelock.age_of_lock()
53*9c5db199SXin Li                logging.error('Permanent lock failure. Lock age = %d.', age)
54*9c5db199SXin Li                # Breaking a lock is a destructive operation that causes
55*9c5db199SXin Li                # abort of locking jobs, cleanup and redowloading of cache
56*9c5db199SXin Li                # contents. When the error was due to overloaded server,
57*9c5db199SXin Li                # it cause even more cascade of lock errors. Wait 4 hours
58*9c5db199SXin Li                # before breaking the lock. Tasks inside the critical section
59*9c5db199SXin Li                # is about downloading a few gigabytes of files. Taking 4 hours
60*9c5db199SXin Li                # strongly implies the job had went wrong.
61*9c5db199SXin Li                if age > 4 * 60 * 60:
62*9c5db199SXin Li                    logging.error('Trying to break lock.')
63*9c5db199SXin Li                    filelock.break_lock()
64*9c5db199SXin Li                raise error.TestFail('Error: permanent cache lock failure.')
65*9c5db199SXin Li        else:
66*9c5db199SXin Li            logging.info('Acquired cache lock after %d attempts.', attempts)
67*9c5db199SXin Li    try:
68*9c5db199SXin Li        yield
69*9c5db199SXin Li    finally:
70*9c5db199SXin Li        filelock.release()
71*9c5db199SXin Li        logging.info('Released cache lock.')
72*9c5db199SXin Li
73*9c5db199SXin Li
74*9c5db199SXin Li@contextlib.contextmanager
75*9c5db199SXin Lidef adb_keepalive(targets, extra_paths):
76*9c5db199SXin Li    """A context manager that keeps the adb connection alive.
77*9c5db199SXin Li
78*9c5db199SXin Li    AdbKeepalive will spin off a new process that will continuously poll for
79*9c5db199SXin Li    adb's connected state, and will attempt to reconnect if it ever goes down.
80*9c5db199SXin Li    This is the only way we can currently recover safely from (intentional)
81*9c5db199SXin Li    reboots.
82*9c5db199SXin Li
83*9c5db199SXin Li    @param target: the hostname and port of the DUT.
84*9c5db199SXin Li    @param extra_paths: any additional components to the PATH environment
85*9c5db199SXin Li                        variable.
86*9c5db199SXin Li    """
87*9c5db199SXin Li    from autotest_lib.client.common_lib.cros import adb_keepalive as module
88*9c5db199SXin Li    # |__file__| returns the absolute path of the compiled bytecode of the
89*9c5db199SXin Li    # module. We want to run the original .py file, so we need to change the
90*9c5db199SXin Li    # extension back.
91*9c5db199SXin Li    script_filename = module.__file__.replace('.pyc', '.py')
92*9c5db199SXin Li    jobs = [common_utils.BgJob(
93*9c5db199SXin Li        [script_filename, target],
94*9c5db199SXin Li        nickname='adb_keepalive',
95*9c5db199SXin Li        stderr_level=logging.DEBUG,
96*9c5db199SXin Li        stdout_tee=common_utils.TEE_TO_LOGS,
97*9c5db199SXin Li        stderr_tee=common_utils.TEE_TO_LOGS,
98*9c5db199SXin Li        extra_paths=extra_paths) for target in targets]
99*9c5db199SXin Li
100*9c5db199SXin Li    try:
101*9c5db199SXin Li        yield
102*9c5db199SXin Li    finally:
103*9c5db199SXin Li        # The adb_keepalive.py script runs forever until SIGTERM is sent.
104*9c5db199SXin Li        for job in jobs:
105*9c5db199SXin Li            common_utils.nuke_subprocess(job.sp)
106*9c5db199SXin Li        common_utils.join_bg_jobs(jobs)
107*9c5db199SXin Li
108*9c5db199SXin Li
109*9c5db199SXin Lidef parse_tradefed_testresults_xml(test_result_xml_path, waivers=None):
110*9c5db199SXin Li    """ Check the result from tradefed through test_results.xml
111*9c5db199SXin Li    @param waivers: a set() of tests which are permitted to fail.
112*9c5db199SXin Li    """
113*9c5db199SXin Li    waived_count = dict()
114*9c5db199SXin Li    failed_tests = set()
115*9c5db199SXin Li    try:
116*9c5db199SXin Li        root = ElementTree.parse(test_result_xml_path)
117*9c5db199SXin Li        for module in root.iter('Module'):
118*9c5db199SXin Li            module_name = module.get('name')
119*9c5db199SXin Li            for testcase in module.iter('TestCase'):
120*9c5db199SXin Li                testcase_name = testcase.get('name')
121*9c5db199SXin Li                for test in testcase.iter('Test'):
122*9c5db199SXin Li                    test_case = test.get('name')
123*9c5db199SXin Li                    test_res = test.get('result')
124*9c5db199SXin Li                    test_name = '%s#%s' % (testcase_name, test_case)
125*9c5db199SXin Li
126*9c5db199SXin Li                    if test_res == "fail":
127*9c5db199SXin Li                        test_fail = test.find('Failure')
128*9c5db199SXin Li                        failed_message = test_fail.get('message')
129*9c5db199SXin Li                        failed_stacktrace = test_fail.find('StackTrace').text
130*9c5db199SXin Li
131*9c5db199SXin Li                        if waivers and test_name in waivers:
132*9c5db199SXin Li                            waived_count[test_name] = (
133*9c5db199SXin Li                                waived_count.get(test_name, 0) + 1)
134*9c5db199SXin Li                        else:
135*9c5db199SXin Li                            failed_tests.add(test_name)
136*9c5db199SXin Li
137*9c5db199SXin Li        # Check for test completion.
138*9c5db199SXin Li        for summary in root.iter('Summary'):
139*9c5db199SXin Li            modules_done = summary.get('modules_done')
140*9c5db199SXin Li            modules_total = summary.get('modules_total')
141*9c5db199SXin Li
142*9c5db199SXin Li        if failed_tests:
143*9c5db199SXin Li            logging.error('Failed (but not waived) tests:\n%s',
144*9c5db199SXin Li                          '\n'.join(sorted(failed_tests)))
145*9c5db199SXin Li
146*9c5db199SXin Li        waived = []
147*9c5db199SXin Li        for testname, fail_count in waived_count.items():
148*9c5db199SXin Li            waived += [testname] * fail_count
149*9c5db199SXin Li            logging.info('Waived failure for %s %d time(s)',
150*9c5db199SXin Li                         testname, fail_count)
151*9c5db199SXin Li        logging.info('>> Total waived = %s', waived)
152*9c5db199SXin Li        return waived
153*9c5db199SXin Li
154*9c5db199SXin Li    except Exception as e:
155*9c5db199SXin Li        logging.warning('Exception raised in '
156*9c5db199SXin Li                        '|tradefed_utils.parse_tradefed_testresults_xml|: {'
157*9c5db199SXin Li                        '0}'.format(e))
158*9c5db199SXin Li        return []
159*9c5db199SXin Li
160*9c5db199SXin Li
161*9c5db199SXin Lidef parse_tradefed_result(result, waivers=None):
162*9c5db199SXin Li    """Check the result from the tradefed output.
163*9c5db199SXin Li
164*9c5db199SXin Li    @param result: The result stdout string from the tradefed command.
165*9c5db199SXin Li    @param waivers: a set() of tests which are permitted to fail.
166*9c5db199SXin Li    @return List of the waived tests.
167*9c5db199SXin Li    """
168*9c5db199SXin Li    # Regular expressions for start/end messages of each test-run chunk.
169*9c5db199SXin Li    abi_re = r'arm\S*|x86\S*'
170*9c5db199SXin Li    # TODO(kinaba): use the current running module name.
171*9c5db199SXin Li    module_re = r'\S+'
172*9c5db199SXin Li    start_re = re.compile(r'(?:Start|Continu)ing (%s) %s with'
173*9c5db199SXin Li                          r' (\d+(?:,\d+)?) test' % (abi_re, module_re))
174*9c5db199SXin Li    end_re = re.compile(r'(%s) %s (?:complet|fail)ed in .*\.'
175*9c5db199SXin Li                        r' (\d+) passed, (\d+) failed, (\d+) not executed' %
176*9c5db199SXin Li                        (abi_re, module_re))
177*9c5db199SXin Li    fail_re = re.compile(r'I/ConsoleReporter.* (\S+) fail:')
178*9c5db199SXin Li    inaccurate_re = re.compile(r'IMPORTANT: Some modules failed to run to '
179*9c5db199SXin Li                                'completion, tests counts may be inaccurate')
180*9c5db199SXin Li    abis = set()
181*9c5db199SXin Li    waived_count = dict()
182*9c5db199SXin Li    failed_tests = set()
183*9c5db199SXin Li    accurate = True
184*9c5db199SXin Li    for line in result.splitlines():
185*9c5db199SXin Li        match = start_re.search(line)
186*9c5db199SXin Li        if match:
187*9c5db199SXin Li            abis = abis.union([match.group(1)])
188*9c5db199SXin Li            continue
189*9c5db199SXin Li        match = end_re.search(line)
190*9c5db199SXin Li        if match:
191*9c5db199SXin Li            abi = match.group(1)
192*9c5db199SXin Li            if abi not in abis:
193*9c5db199SXin Li                logging.error('Trunk end with %s abi but have not seen '
194*9c5db199SXin Li                              'any trunk start with this abi.(%s)', abi, line)
195*9c5db199SXin Li            continue
196*9c5db199SXin Li        match = fail_re.search(line)
197*9c5db199SXin Li        if match:
198*9c5db199SXin Li            testname = match.group(1)
199*9c5db199SXin Li            if waivers and testname in waivers:
200*9c5db199SXin Li                waived_count[testname] = waived_count.get(testname, 0) + 1
201*9c5db199SXin Li            else:
202*9c5db199SXin Li                failed_tests.add(testname)
203*9c5db199SXin Li            continue
204*9c5db199SXin Li        # b/66899135, tradefed may reported inaccuratly with `list results`.
205*9c5db199SXin Li        # Add warning if summary section shows that the result is inacurrate.
206*9c5db199SXin Li        match = inaccurate_re.search(line)
207*9c5db199SXin Li        if match:
208*9c5db199SXin Li            accurate = False
209*9c5db199SXin Li
210*9c5db199SXin Li    logging.info('Total ABIs: %s', abis)
211*9c5db199SXin Li    if failed_tests:
212*9c5db199SXin Li        logging.error('Failed (but not waived) tests:\n%s',
213*9c5db199SXin Li            '\n'.join(sorted(failed_tests)))
214*9c5db199SXin Li
215*9c5db199SXin Li    # TODO(dhaddock): Find a more robust way to apply waivers.
216*9c5db199SXin Li    waived = []
217*9c5db199SXin Li    for testname, fail_count in waived_count.items():
218*9c5db199SXin Li        if fail_count > len(abis):
219*9c5db199SXin Li            # This should be an error.TestFail, but unfortunately
220*9c5db199SXin Li            # tradefed has a bug that emits "fail" twice when a
221*9c5db199SXin Li            # test failed during teardown. It will anyway causes
222*9c5db199SXin Li            # a test count inconsistency and visible on the dashboard.
223*9c5db199SXin Li            logging.error('Found %d failures for %s but there are only %d '
224*9c5db199SXin Li                          'abis: %s', fail_count, testname, len(abis), abis)
225*9c5db199SXin Li            fail_count = len(abis)
226*9c5db199SXin Li        waived += [testname] * fail_count
227*9c5db199SXin Li        logging.info('Waived failure for %s %d time(s)', testname, fail_count)
228*9c5db199SXin Li    logging.info('Total waived = %s', waived)
229*9c5db199SXin Li    return waived, accurate
230*9c5db199SXin Li
231*9c5db199SXin Li
232*9c5db199SXin Li# A similar implementation in Java can be found at
233*9c5db199SXin Li# https://android.googlesource.com/platform/test/suite_harness/+/refs/heads/\
234*9c5db199SXin Li# pie-cts-dev/common/util/src/com/android/compatibility/common/util/\
235*9c5db199SXin Li# ResultHandler.java
236*9c5db199SXin Lidef get_test_result_xml_path(results_destination):
237*9c5db199SXin Li    """Get the path of test_result.xml from the last session.
238*9c5db199SXin Li    Raises:
239*9c5db199SXin Li        Should never raise!
240*9c5db199SXin Li    """
241*9c5db199SXin Li    try:
242*9c5db199SXin Li        last_result_path = None
243*9c5db199SXin Li        for dir in os.listdir(results_destination):
244*9c5db199SXin Li            result_dir = os.path.join(results_destination, dir)
245*9c5db199SXin Li            result_path = os.path.join(result_dir, 'test_result.xml')
246*9c5db199SXin Li            # We use the lexicographically largest path, because |dir| are
247*9c5db199SXin Li            # of format YYYY.MM.DD_HH.MM.SS. The last session will always
248*9c5db199SXin Li            # have the latest date which leads to the lexicographically
249*9c5db199SXin Li            # largest path.
250*9c5db199SXin Li            if last_result_path and last_result_path > result_path:
251*9c5db199SXin Li                continue
252*9c5db199SXin Li            # We need to check for `islink` as `isdir` returns true if
253*9c5db199SXin Li            # |result_dir| is a symbolic link to a directory.
254*9c5db199SXin Li            if not os.path.isdir(result_dir) or os.path.islink(result_dir):
255*9c5db199SXin Li                continue
256*9c5db199SXin Li            if not os.path.exists(result_path):
257*9c5db199SXin Li                continue
258*9c5db199SXin Li            last_result_path = result_path
259*9c5db199SXin Li        return last_result_path
260*9c5db199SXin Li    except Exception as e:
261*9c5db199SXin Li        logging.warning(
262*9c5db199SXin Li            'Exception raised in '
263*9c5db199SXin Li            '|tradefed_utils.get_test_result_xml_path|: {'
264*9c5db199SXin Li            '0}'.format(e))
265*9c5db199SXin Li
266*9c5db199SXin Li
267*9c5db199SXin Lidef get_perf_metrics_from_test_result_xml(result_path, resultsdir):
268*9c5db199SXin Li    """Parse test_result.xml and each <Metric /> is mapped to a dict that
269*9c5db199SXin Li    can be used as kwargs of |TradefedTest.output_perf_value|.
270*9c5db199SXin Li    Raises:
271*9c5db199SXin Li        Should never raise!
272*9c5db199SXin Li    """
273*9c5db199SXin Li    try:
274*9c5db199SXin Li        root = ElementTree.parse(result_path)
275*9c5db199SXin Li        for module in root.iter('Module'):
276*9c5db199SXin Li            module_name = module.get('name')
277*9c5db199SXin Li            for testcase in module.iter('TestCase'):
278*9c5db199SXin Li                testcase_name = testcase.get('name')
279*9c5db199SXin Li                for test in testcase.iter('Test'):
280*9c5db199SXin Li                    test_name = test.get('name')
281*9c5db199SXin Li                    for metric in test.iter('Metric'):
282*9c5db199SXin Li                        score_type = metric.get('score_type')
283*9c5db199SXin Li                        if score_type == None:
284*9c5db199SXin Li                            continue
285*9c5db199SXin Li                        if score_type not in ['higher_better', 'lower_better']:
286*9c5db199SXin Li                            logging.warning(
287*9c5db199SXin Li                                'Unsupported score_type in %s/%s/%s',
288*9c5db199SXin Li                                module_name, testcase_name, test_name)
289*9c5db199SXin Li                            continue
290*9c5db199SXin Li                        higher_is_better = (score_type == 'higher_better')
291*9c5db199SXin Li                        units = metric.get('score_unit')
292*9c5db199SXin Li                        yield dict(
293*9c5db199SXin Li                            description=testcase_name + '#' + test_name,
294*9c5db199SXin Li                            value=metric[0].text,
295*9c5db199SXin Li                            units=units,
296*9c5db199SXin Li                            higher_is_better=higher_is_better,
297*9c5db199SXin Li                            resultsdir=os.path.join(resultsdir, 'tests',
298*9c5db199SXin Li                                PERF_MODULE_NAME_PREFIX + module_name)
299*9c5db199SXin Li                        )
300*9c5db199SXin Li    except Exception as e:
301*9c5db199SXin Li        logging.warning(
302*9c5db199SXin Li            'Exception raised in '
303*9c5db199SXin Li            '|tradefed_utils.get_perf_metrics_from_test_result_xml|: {'
304*9c5db199SXin Li            '0}'.format(e))
305