xref: /aosp_15_r20/external/autotest/server/cros/tradefed/tradefed_utils.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Lint as: python2, python3
2# Copyright 2018 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import contextlib
7import logging
8import os
9import random
10import re
11from xml.etree import ElementTree
12
13from autotest_lib.client.common_lib import utils as common_utils
14from autotest_lib.client.common_lib import error
15from autotest_lib.server.cros import lockfile
16
17PERF_MODULE_NAME_PREFIX = 'CTS.'
18
19@contextlib.contextmanager
20def lock(filename):
21    """Prevents other autotest/tradefed instances from accessing cache.
22
23    @param filename: The file to be locked.
24    """
25    filelock = lockfile.FileLock(filename)
26    # It is tempting just to call filelock.acquire(3600). But the implementation
27    # has very poor temporal granularity (timeout/10), which is unsuitable for
28    # our needs. See /usr/lib64/python2.7/site-packages/lockfile/
29    attempts = 0
30    total_wait_seconds = 0
31    while not filelock.i_am_locking():
32        try:
33            attempts += 1
34            logging.info('Waiting for cache lock...')
35            # Cap the wait by 2 minutes. Setting too long wait makes long-lived
36            # tasks to have less chance to obtain the lock, leading to failures.
37            # We must not use a random integer as the filelock implementations
38            # may underflow an integer division.
39            wait = min(120.0, random.uniform(0.0, pow(2.0, attempts)))
40            total_wait_seconds += wait
41            filelock.acquire(wait)
42        except (lockfile.AlreadyLocked, lockfile.LockTimeout):
43            # Our goal is to wait long enough to be sure something very bad
44            # happened to the locking thread. Wait 2 hours at maximum
45            if total_wait_seconds >= 7200:
46                # Normally we should acquire the lock immediately. Once we
47                # wait on the order of 10 minutes either the dev server IO is
48                # overloaded or a lock didn't get cleaned up. Take one for the
49                # team, break the lock and report a failure. This should fix
50                # the lock for following tests. If the failure affects more than
51                # one job look for a deadlock or dev server overload.
52                age = filelock.age_of_lock()
53                logging.error('Permanent lock failure. Lock age = %d.', age)
54                # Breaking a lock is a destructive operation that causes
55                # abort of locking jobs, cleanup and redowloading of cache
56                # contents. When the error was due to overloaded server,
57                # it cause even more cascade of lock errors. Wait 4 hours
58                # before breaking the lock. Tasks inside the critical section
59                # is about downloading a few gigabytes of files. Taking 4 hours
60                # strongly implies the job had went wrong.
61                if age > 4 * 60 * 60:
62                    logging.error('Trying to break lock.')
63                    filelock.break_lock()
64                raise error.TestFail('Error: permanent cache lock failure.')
65        else:
66            logging.info('Acquired cache lock after %d attempts.', attempts)
67    try:
68        yield
69    finally:
70        filelock.release()
71        logging.info('Released cache lock.')
72
73
74@contextlib.contextmanager
75def adb_keepalive(targets, extra_paths):
76    """A context manager that keeps the adb connection alive.
77
78    AdbKeepalive will spin off a new process that will continuously poll for
79    adb's connected state, and will attempt to reconnect if it ever goes down.
80    This is the only way we can currently recover safely from (intentional)
81    reboots.
82
83    @param target: the hostname and port of the DUT.
84    @param extra_paths: any additional components to the PATH environment
85                        variable.
86    """
87    from autotest_lib.client.common_lib.cros import adb_keepalive as module
88    # |__file__| returns the absolute path of the compiled bytecode of the
89    # module. We want to run the original .py file, so we need to change the
90    # extension back.
91    script_filename = module.__file__.replace('.pyc', '.py')
92    jobs = [common_utils.BgJob(
93        [script_filename, target],
94        nickname='adb_keepalive',
95        stderr_level=logging.DEBUG,
96        stdout_tee=common_utils.TEE_TO_LOGS,
97        stderr_tee=common_utils.TEE_TO_LOGS,
98        extra_paths=extra_paths) for target in targets]
99
100    try:
101        yield
102    finally:
103        # The adb_keepalive.py script runs forever until SIGTERM is sent.
104        for job in jobs:
105            common_utils.nuke_subprocess(job.sp)
106        common_utils.join_bg_jobs(jobs)
107
108
109def parse_tradefed_testresults_xml(test_result_xml_path, waivers=None):
110    """ Check the result from tradefed through test_results.xml
111    @param waivers: a set() of tests which are permitted to fail.
112    """
113    waived_count = dict()
114    failed_tests = set()
115    try:
116        root = ElementTree.parse(test_result_xml_path)
117        for module in root.iter('Module'):
118            module_name = module.get('name')
119            for testcase in module.iter('TestCase'):
120                testcase_name = testcase.get('name')
121                for test in testcase.iter('Test'):
122                    test_case = test.get('name')
123                    test_res = test.get('result')
124                    test_name = '%s#%s' % (testcase_name, test_case)
125
126                    if test_res == "fail":
127                        test_fail = test.find('Failure')
128                        failed_message = test_fail.get('message')
129                        failed_stacktrace = test_fail.find('StackTrace').text
130
131                        if waivers and test_name in waivers:
132                            waived_count[test_name] = (
133                                waived_count.get(test_name, 0) + 1)
134                        else:
135                            failed_tests.add(test_name)
136
137        # Check for test completion.
138        for summary in root.iter('Summary'):
139            modules_done = summary.get('modules_done')
140            modules_total = summary.get('modules_total')
141
142        if failed_tests:
143            logging.error('Failed (but not waived) tests:\n%s',
144                          '\n'.join(sorted(failed_tests)))
145
146        waived = []
147        for testname, fail_count in waived_count.items():
148            waived += [testname] * fail_count
149            logging.info('Waived failure for %s %d time(s)',
150                         testname, fail_count)
151        logging.info('>> Total waived = %s', waived)
152        return waived
153
154    except Exception as e:
155        logging.warning('Exception raised in '
156                        '|tradefed_utils.parse_tradefed_testresults_xml|: {'
157                        '0}'.format(e))
158        return []
159
160
161def parse_tradefed_result(result, waivers=None):
162    """Check the result from the tradefed output.
163
164    @param result: The result stdout string from the tradefed command.
165    @param waivers: a set() of tests which are permitted to fail.
166    @return List of the waived tests.
167    """
168    # Regular expressions for start/end messages of each test-run chunk.
169    abi_re = r'arm\S*|x86\S*'
170    # TODO(kinaba): use the current running module name.
171    module_re = r'\S+'
172    start_re = re.compile(r'(?:Start|Continu)ing (%s) %s with'
173                          r' (\d+(?:,\d+)?) test' % (abi_re, module_re))
174    end_re = re.compile(r'(%s) %s (?:complet|fail)ed in .*\.'
175                        r' (\d+) passed, (\d+) failed, (\d+) not executed' %
176                        (abi_re, module_re))
177    fail_re = re.compile(r'I/ConsoleReporter.* (\S+) fail:')
178    inaccurate_re = re.compile(r'IMPORTANT: Some modules failed to run to '
179                                'completion, tests counts may be inaccurate')
180    abis = set()
181    waived_count = dict()
182    failed_tests = set()
183    accurate = True
184    for line in result.splitlines():
185        match = start_re.search(line)
186        if match:
187            abis = abis.union([match.group(1)])
188            continue
189        match = end_re.search(line)
190        if match:
191            abi = match.group(1)
192            if abi not in abis:
193                logging.error('Trunk end with %s abi but have not seen '
194                              'any trunk start with this abi.(%s)', abi, line)
195            continue
196        match = fail_re.search(line)
197        if match:
198            testname = match.group(1)
199            if waivers and testname in waivers:
200                waived_count[testname] = waived_count.get(testname, 0) + 1
201            else:
202                failed_tests.add(testname)
203            continue
204        # b/66899135, tradefed may reported inaccuratly with `list results`.
205        # Add warning if summary section shows that the result is inacurrate.
206        match = inaccurate_re.search(line)
207        if match:
208            accurate = False
209
210    logging.info('Total ABIs: %s', abis)
211    if failed_tests:
212        logging.error('Failed (but not waived) tests:\n%s',
213            '\n'.join(sorted(failed_tests)))
214
215    # TODO(dhaddock): Find a more robust way to apply waivers.
216    waived = []
217    for testname, fail_count in waived_count.items():
218        if fail_count > len(abis):
219            # This should be an error.TestFail, but unfortunately
220            # tradefed has a bug that emits "fail" twice when a
221            # test failed during teardown. It will anyway causes
222            # a test count inconsistency and visible on the dashboard.
223            logging.error('Found %d failures for %s but there are only %d '
224                          'abis: %s', fail_count, testname, len(abis), abis)
225            fail_count = len(abis)
226        waived += [testname] * fail_count
227        logging.info('Waived failure for %s %d time(s)', testname, fail_count)
228    logging.info('Total waived = %s', waived)
229    return waived, accurate
230
231
232# A similar implementation in Java can be found at
233# https://android.googlesource.com/platform/test/suite_harness/+/refs/heads/\
234# pie-cts-dev/common/util/src/com/android/compatibility/common/util/\
235# ResultHandler.java
236def get_test_result_xml_path(results_destination):
237    """Get the path of test_result.xml from the last session.
238    Raises:
239        Should never raise!
240    """
241    try:
242        last_result_path = None
243        for dir in os.listdir(results_destination):
244            result_dir = os.path.join(results_destination, dir)
245            result_path = os.path.join(result_dir, 'test_result.xml')
246            # We use the lexicographically largest path, because |dir| are
247            # of format YYYY.MM.DD_HH.MM.SS. The last session will always
248            # have the latest date which leads to the lexicographically
249            # largest path.
250            if last_result_path and last_result_path > result_path:
251                continue
252            # We need to check for `islink` as `isdir` returns true if
253            # |result_dir| is a symbolic link to a directory.
254            if not os.path.isdir(result_dir) or os.path.islink(result_dir):
255                continue
256            if not os.path.exists(result_path):
257                continue
258            last_result_path = result_path
259        return last_result_path
260    except Exception as e:
261        logging.warning(
262            'Exception raised in '
263            '|tradefed_utils.get_test_result_xml_path|: {'
264            '0}'.format(e))
265
266
267def get_perf_metrics_from_test_result_xml(result_path, resultsdir):
268    """Parse test_result.xml and each <Metric /> is mapped to a dict that
269    can be used as kwargs of |TradefedTest.output_perf_value|.
270    Raises:
271        Should never raise!
272    """
273    try:
274        root = ElementTree.parse(result_path)
275        for module in root.iter('Module'):
276            module_name = module.get('name')
277            for testcase in module.iter('TestCase'):
278                testcase_name = testcase.get('name')
279                for test in testcase.iter('Test'):
280                    test_name = test.get('name')
281                    for metric in test.iter('Metric'):
282                        score_type = metric.get('score_type')
283                        if score_type == None:
284                            continue
285                        if score_type not in ['higher_better', 'lower_better']:
286                            logging.warning(
287                                'Unsupported score_type in %s/%s/%s',
288                                module_name, testcase_name, test_name)
289                            continue
290                        higher_is_better = (score_type == 'higher_better')
291                        units = metric.get('score_unit')
292                        yield dict(
293                            description=testcase_name + '#' + test_name,
294                            value=metric[0].text,
295                            units=units,
296                            higher_is_better=higher_is_better,
297                            resultsdir=os.path.join(resultsdir, 'tests',
298                                PERF_MODULE_NAME_PREFIX + module_name)
299                        )
300    except Exception as e:
301        logging.warning(
302            'Exception raised in '
303            '|tradefed_utils.get_perf_metrics_from_test_result_xml|: {'
304            '0}'.format(e))
305