# Lint as: python2, python3
# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import contextlib
import logging
import os
import random
import re
from xml.etree import ElementTree

from autotest_lib.client.common_lib import utils as common_utils
from autotest_lib.client.common_lib import error
from autotest_lib.server.cros import lockfile

# Perf metric result directories are named with this prefix plus the module.
PERF_MODULE_NAME_PREFIX = 'CTS.'


@contextlib.contextmanager
def lock(filename):
    """Prevents other autotest/tradefed instances from accessing cache.

    @param filename: The file to be locked.
    @raise error.TestFail: when the lock cannot be acquired after waiting for
            2 hours in total.
    """
    filelock = lockfile.FileLock(filename)
    # It is tempting just to call filelock.acquire(3600). But the
    # implementation has very poor temporal granularity (timeout/10), which
    # is unsuitable for our needs. See
    # /usr/lib64/python2.7/site-packages/lockfile/
    attempts = 0
    total_wait_seconds = 0
    while not filelock.i_am_locking():
        try:
            attempts += 1
            logging.info('Waiting for cache lock...')
            # Cap the wait by 2 minutes. Setting too long a wait makes
            # long-lived tasks have less chance to obtain the lock, leading
            # to failures. We must not use a random integer as the filelock
            # implementations may underflow an integer division.
            wait = min(120.0, random.uniform(0.0, pow(2.0, attempts)))
            total_wait_seconds += wait
            filelock.acquire(wait)
        except (lockfile.AlreadyLocked, lockfile.LockTimeout):
            # Our goal is to wait long enough to be sure something very bad
            # happened to the locking thread. Wait 2 hours at maximum.
            if total_wait_seconds >= 7200:
                # Normally we should acquire the lock immediately. Once we
                # wait on the order of 10 minutes either the dev server IO is
                # overloaded or a lock didn't get cleaned up. Take one for
                # the team, break the lock and report a failure. This should
                # fix the lock for following tests. If the failure affects
                # more than one job look for a deadlock or dev server
                # overload.
                age = filelock.age_of_lock()
                logging.error('Permanent lock failure. Lock age = %d.', age)
                # Breaking a lock is a destructive operation that causes an
                # abort of the locking jobs, cleanup and redownloading of
                # cache contents. When the error was due to an overloaded
                # server, it causes an even bigger cascade of lock errors.
                # Wait 4 hours before breaking the lock. The task inside the
                # critical section is about downloading a few gigabytes of
                # files, so taking 4 hours strongly implies the job has gone
                # wrong.
                if age > 4 * 60 * 60:
                    logging.error('Trying to break lock.')
                    filelock.break_lock()
                raise error.TestFail('Error: permanent cache lock failure.')
    else:
        logging.info('Acquired cache lock after %d attempts.', attempts)
    try:
        yield
    finally:
        filelock.release()
        logging.info('Released cache lock.')


@contextlib.contextmanager
def adb_keepalive(targets, extra_paths):
    """A context manager that keeps the adb connection alive.

    AdbKeepalive will spin off a new process that will continuously poll for
    adb's connected state, and will attempt to reconnect if it ever goes down.
    This is the only way we can currently recover safely from (intentional)
    reboots.

    @param targets: the list of hostname:port addresses of the DUTs, one
            background keepalive job is spawned per target.
    @param extra_paths: any additional components to the PATH environment
            variable.
    """
    from autotest_lib.client.common_lib.cros import adb_keepalive as module
    # |__file__| returns the absolute path of the compiled bytecode of the
    # module. We want to run the original .py file, so we need to change the
    # extension back.
    script_filename = module.__file__.replace('.pyc', '.py')
    jobs = [common_utils.BgJob(
        [script_filename, target],
        nickname='adb_keepalive',
        stderr_level=logging.DEBUG,
        stdout_tee=common_utils.TEE_TO_LOGS,
        stderr_tee=common_utils.TEE_TO_LOGS,
        extra_paths=extra_paths) for target in targets]

    try:
        yield
    finally:
        # The adb_keepalive.py script runs forever until SIGTERM is sent.
        for job in jobs:
            common_utils.nuke_subprocess(job.sp)
        common_utils.join_bg_jobs(jobs)


def parse_tradefed_testresults_xml(test_result_xml_path, waivers=None):
    """Check the result from tradefed through test_result.xml.

    @param test_result_xml_path: path to the test_result.xml of the run.
    @param waivers: a set() of tests which are permitted to fail.
    @return List of the waived tests, one entry per waived failure; an empty
            list when the XML cannot be parsed.
    """
    waived_count = dict()
    failed_tests = set()
    try:
        root = ElementTree.parse(test_result_xml_path)
        for module in root.iter('Module'):
            for testcase in module.iter('TestCase'):
                testcase_name = testcase.get('name')
                for test in testcase.iter('Test'):
                    test_name = '%s#%s' % (testcase_name, test.get('name'))
                    if test.get('result') == "fail":
                        if waivers and test_name in waivers:
                            waived_count[test_name] = (
                                waived_count.get(test_name, 0) + 1)
                        else:
                            failed_tests.add(test_name)

        if failed_tests:
            logging.error('Failed (but not waived) tests:\n%s',
                          '\n'.join(sorted(failed_tests)))

        waived = []
        for testname, fail_count in waived_count.items():
            waived += [testname] * fail_count
            logging.info('Waived failure for %s %d time(s)',
                         testname, fail_count)
        logging.info('>> Total waived = %s', waived)
        return waived

    except Exception as e:
        # Never propagate parse errors; an unreadable XML is reported as
        # "nothing waived" and logged for investigation.
        logging.warning('Exception raised in '
                        '|tradefed_utils.parse_tradefed_testresults_xml|: {'
                        '0}'.format(e))
        return []


def parse_tradefed_result(result, waivers=None):
    """Check the result from the tradefed output.

    @param result: The result stdout string from the tradefed command.
    @param waivers: a set() of tests which are permitted to fail.
    @return Tuple of (list of the waived tests, boolean which is False when
            tradefed reported that the test counts may be inaccurate).
    """
    # Regular expressions for start/end messages of each test-run chunk.
    abi_re = r'arm\S*|x86\S*'
    # TODO(kinaba): use the current running module name.
    module_re = r'\S+'
    start_re = re.compile(r'(?:Start|Continu)ing (%s) %s with'
                          r' (\d+(?:,\d+)?) test' % (abi_re, module_re))
    end_re = re.compile(r'(%s) %s (?:complet|fail)ed in .*\.'
                        r' (\d+) passed, (\d+) failed, (\d+) not executed' %
                        (abi_re, module_re))
    fail_re = re.compile(r'I/ConsoleReporter.* (\S+) fail:')
    inaccurate_re = re.compile(r'IMPORTANT: Some modules failed to run to '
                               'completion, tests counts may be inaccurate')
    abis = set()
    waived_count = dict()
    failed_tests = set()
    accurate = True
    for line in result.splitlines():
        match = start_re.search(line)
        if match:
            abis = abis.union([match.group(1)])
            continue
        match = end_re.search(line)
        if match:
            abi = match.group(1)
            if abi not in abis:
                logging.error('Trunk end with %s abi but have not seen '
                              'any trunk start with this abi.(%s)', abi, line)
            continue
        match = fail_re.search(line)
        if match:
            testname = match.group(1)
            if waivers and testname in waivers:
                waived_count[testname] = waived_count.get(testname, 0) + 1
            else:
                failed_tests.add(testname)
            continue
        # b/66899135, tradefed may report inaccurately with `list results`.
        # Add a warning if the summary section shows that the result is
        # inaccurate.
        match = inaccurate_re.search(line)
        if match:
            accurate = False

    logging.info('Total ABIs: %s', abis)
    if failed_tests:
        logging.error('Failed (but not waived) tests:\n%s',
                      '\n'.join(sorted(failed_tests)))

    # TODO(dhaddock): Find a more robust way to apply waivers.
    waived = []
    for testname, fail_count in waived_count.items():
        if fail_count > len(abis):
            # This should be an error.TestFail, but unfortunately
            # tradefed has a bug that emits "fail" twice when a
            # test failed during teardown. It anyway causes a test
            # count inconsistency visible on the dashboard.
            logging.error('Found %d failures for %s but there are only %d '
                          'abis: %s', fail_count, testname, len(abis), abis)
            fail_count = len(abis)
        waived += [testname] * fail_count
        logging.info('Waived failure for %s %d time(s)', testname, fail_count)
    logging.info('Total waived = %s', waived)
    return waived, accurate


# A similar implementation in Java can be found at
# https://android.googlesource.com/platform/test/suite_harness/+/refs/heads/\
# pie-cts-dev/common/util/src/com/android/compatibility/common/util/\
# ResultHandler.java
def get_test_result_xml_path(results_destination):
    """Get the path of test_result.xml from the last session.

    @param results_destination: directory containing one sub-directory per
            tradefed session, named YYYY.MM.DD_HH.MM.SS.
    @return Path to the latest session's test_result.xml, or None when no
            session produced one (or on any unexpected error).
    Raises:
        Should never raise!
    """
    try:
        last_result_path = None
        for session_dir in os.listdir(results_destination):
            result_dir = os.path.join(results_destination, session_dir)
            result_path = os.path.join(result_dir, 'test_result.xml')
            # We use the lexicographically largest path, because the session
            # directories are of format YYYY.MM.DD_HH.MM.SS. The last session
            # will always have the latest date which leads to the
            # lexicographically largest path.
            if last_result_path and last_result_path > result_path:
                continue
            # We need to check for `islink` as `isdir` returns true if
            # |result_dir| is a symbolic link to a directory.
            if not os.path.isdir(result_dir) or os.path.islink(result_dir):
                continue
            if not os.path.exists(result_path):
                continue
            last_result_path = result_path
        return last_result_path
    except Exception as e:
        logging.warning(
            'Exception raised in '
            '|tradefed_utils.get_test_result_xml_path|: {'
            '0}'.format(e))


def get_perf_metrics_from_test_result_xml(result_path, resultsdir):
    """Parse test_result.xml and each <Metric /> is mapped to a dict that
    can be used as kwargs of |TradefedTest.output_perf_value|.

    @param result_path: path to the test_result.xml to parse.
    @param resultsdir: base results directory; each yielded dict points at a
            per-module sub-directory beneath it.
    @yield one dict per supported <Metric /> element.
    Raises:
        Should never raise!
    """
    try:
        root = ElementTree.parse(result_path)
        for module in root.iter('Module'):
            module_name = module.get('name')
            for testcase in module.iter('TestCase'):
                testcase_name = testcase.get('name')
                for test in testcase.iter('Test'):
                    test_name = test.get('name')
                    for metric in test.iter('Metric'):
                        score_type = metric.get('score_type')
                        # Metrics without a score_type carry no perf value.
                        if score_type is None:
                            continue
                        if score_type not in ['higher_better', 'lower_better']:
                            logging.warning(
                                'Unsupported score_type in %s/%s/%s',
                                module_name, testcase_name, test_name)
                            continue
                        higher_is_better = (score_type == 'higher_better')
                        units = metric.get('score_unit')
                        yield dict(
                            description=testcase_name + '#' + test_name,
                            value=metric[0].text,
                            units=units,
                            higher_is_better=higher_is_better,
                            resultsdir=os.path.join(resultsdir, 'tests',
                                PERF_MODULE_NAME_PREFIX + module_name)
                        )
    except Exception as e:
        logging.warning(
            'Exception raised in '
            '|tradefed_utils.get_perf_metrics_from_test_result_xml|: {'
            '0}'.format(e))