1*9c5db199SXin Li# Lint as: python2, python3 2*9c5db199SXin Li# Copyright 2018 The Chromium OS Authors. All rights reserved. 3*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be 4*9c5db199SXin Li# found in the LICENSE file. 5*9c5db199SXin Li 6*9c5db199SXin Liimport contextlib 7*9c5db199SXin Liimport logging 8*9c5db199SXin Liimport os 9*9c5db199SXin Liimport random 10*9c5db199SXin Liimport re 11*9c5db199SXin Lifrom xml.etree import ElementTree 12*9c5db199SXin Li 13*9c5db199SXin Lifrom autotest_lib.client.common_lib import utils as common_utils 14*9c5db199SXin Lifrom autotest_lib.client.common_lib import error 15*9c5db199SXin Lifrom autotest_lib.server.cros import lockfile 16*9c5db199SXin Li 17*9c5db199SXin LiPERF_MODULE_NAME_PREFIX = 'CTS.' 18*9c5db199SXin Li 19*9c5db199SXin Li@contextlib.contextmanager 20*9c5db199SXin Lidef lock(filename): 21*9c5db199SXin Li """Prevents other autotest/tradefed instances from accessing cache. 22*9c5db199SXin Li 23*9c5db199SXin Li @param filename: The file to be locked. 24*9c5db199SXin Li """ 25*9c5db199SXin Li filelock = lockfile.FileLock(filename) 26*9c5db199SXin Li # It is tempting just to call filelock.acquire(3600). But the implementation 27*9c5db199SXin Li # has very poor temporal granularity (timeout/10), which is unsuitable for 28*9c5db199SXin Li # our needs. See /usr/lib64/python2.7/site-packages/lockfile/ 29*9c5db199SXin Li attempts = 0 30*9c5db199SXin Li total_wait_seconds = 0 31*9c5db199SXin Li while not filelock.i_am_locking(): 32*9c5db199SXin Li try: 33*9c5db199SXin Li attempts += 1 34*9c5db199SXin Li logging.info('Waiting for cache lock...') 35*9c5db199SXin Li # Cap the wait by 2 minutes. Setting too long wait makes long-lived 36*9c5db199SXin Li # tasks to have less chance to obtain the lock, leading to failures. 37*9c5db199SXin Li # We must not use a random integer as the filelock implementations 38*9c5db199SXin Li # may underflow an integer division. 39*9c5db199SXin Li wait = min(120.0, random.uniform(0.0, pow(2.0, attempts))) 40*9c5db199SXin Li total_wait_seconds += wait 41*9c5db199SXin Li filelock.acquire(wait) 42*9c5db199SXin Li except (lockfile.AlreadyLocked, lockfile.LockTimeout): 43*9c5db199SXin Li # Our goal is to wait long enough to be sure something very bad 44*9c5db199SXin Li # happened to the locking thread. Wait 2 hours at maximum 45*9c5db199SXin Li if total_wait_seconds >= 7200: 46*9c5db199SXin Li # Normally we should acquire the lock immediately. Once we 47*9c5db199SXin Li # wait on the order of 10 minutes either the dev server IO is 48*9c5db199SXin Li # overloaded or a lock didn't get cleaned up. Take one for the 49*9c5db199SXin Li # team, break the lock and report a failure. This should fix 50*9c5db199SXin Li # the lock for following tests. If the failure affects more than 51*9c5db199SXin Li # one job look for a deadlock or dev server overload. 52*9c5db199SXin Li age = filelock.age_of_lock() 53*9c5db199SXin Li logging.error('Permanent lock failure. Lock age = %d.', age) 54*9c5db199SXin Li # Breaking a lock is a destructive operation that causes 55*9c5db199SXin Li # abort of locking jobs, cleanup and redowloading of cache 56*9c5db199SXin Li # contents. When the error was due to overloaded server, 57*9c5db199SXin Li # it cause even more cascade of lock errors. Wait 4 hours 58*9c5db199SXin Li # before breaking the lock. Tasks inside the critical section 59*9c5db199SXin Li # is about downloading a few gigabytes of files. Taking 4 hours 60*9c5db199SXin Li # strongly implies the job had went wrong. 61*9c5db199SXin Li if age > 4 * 60 * 60: 62*9c5db199SXin Li logging.error('Trying to break lock.') 63*9c5db199SXin Li filelock.break_lock() 64*9c5db199SXin Li raise error.TestFail('Error: permanent cache lock failure.') 65*9c5db199SXin Li else: 66*9c5db199SXin Li logging.info('Acquired cache lock after %d attempts.', attempts) 67*9c5db199SXin Li try: 68*9c5db199SXin Li yield 69*9c5db199SXin Li finally: 70*9c5db199SXin Li filelock.release() 71*9c5db199SXin Li logging.info('Released cache lock.') 72*9c5db199SXin Li 73*9c5db199SXin Li 74*9c5db199SXin Li@contextlib.contextmanager 75*9c5db199SXin Lidef adb_keepalive(targets, extra_paths): 76*9c5db199SXin Li """A context manager that keeps the adb connection alive. 77*9c5db199SXin Li 78*9c5db199SXin Li AdbKeepalive will spin off a new process that will continuously poll for 79*9c5db199SXin Li adb's connected state, and will attempt to reconnect if it ever goes down. 80*9c5db199SXin Li This is the only way we can currently recover safely from (intentional) 81*9c5db199SXin Li reboots. 82*9c5db199SXin Li 83*9c5db199SXin Li @param target: the hostname and port of the DUT. 84*9c5db199SXin Li @param extra_paths: any additional components to the PATH environment 85*9c5db199SXin Li variable. 86*9c5db199SXin Li """ 87*9c5db199SXin Li from autotest_lib.client.common_lib.cros import adb_keepalive as module 88*9c5db199SXin Li # |__file__| returns the absolute path of the compiled bytecode of the 89*9c5db199SXin Li # module. We want to run the original .py file, so we need to change the 90*9c5db199SXin Li # extension back. 91*9c5db199SXin Li script_filename = module.__file__.replace('.pyc', '.py') 92*9c5db199SXin Li jobs = [common_utils.BgJob( 93*9c5db199SXin Li [script_filename, target], 94*9c5db199SXin Li nickname='adb_keepalive', 95*9c5db199SXin Li stderr_level=logging.DEBUG, 96*9c5db199SXin Li stdout_tee=common_utils.TEE_TO_LOGS, 97*9c5db199SXin Li stderr_tee=common_utils.TEE_TO_LOGS, 98*9c5db199SXin Li extra_paths=extra_paths) for target in targets] 99*9c5db199SXin Li 100*9c5db199SXin Li try: 101*9c5db199SXin Li yield 102*9c5db199SXin Li finally: 103*9c5db199SXin Li # The adb_keepalive.py script runs forever until SIGTERM is sent. 104*9c5db199SXin Li for job in jobs: 105*9c5db199SXin Li common_utils.nuke_subprocess(job.sp) 106*9c5db199SXin Li common_utils.join_bg_jobs(jobs) 107*9c5db199SXin Li 108*9c5db199SXin Li 109*9c5db199SXin Lidef parse_tradefed_testresults_xml(test_result_xml_path, waivers=None): 110*9c5db199SXin Li """ Check the result from tradefed through test_results.xml 111*9c5db199SXin Li @param waivers: a set() of tests which are permitted to fail. 112*9c5db199SXin Li """ 113*9c5db199SXin Li waived_count = dict() 114*9c5db199SXin Li failed_tests = set() 115*9c5db199SXin Li try: 116*9c5db199SXin Li root = ElementTree.parse(test_result_xml_path) 117*9c5db199SXin Li for module in root.iter('Module'): 118*9c5db199SXin Li module_name = module.get('name') 119*9c5db199SXin Li for testcase in module.iter('TestCase'): 120*9c5db199SXin Li testcase_name = testcase.get('name') 121*9c5db199SXin Li for test in testcase.iter('Test'): 122*9c5db199SXin Li test_case = test.get('name') 123*9c5db199SXin Li test_res = test.get('result') 124*9c5db199SXin Li test_name = '%s#%s' % (testcase_name, test_case) 125*9c5db199SXin Li 126*9c5db199SXin Li if test_res == "fail": 127*9c5db199SXin Li test_fail = test.find('Failure') 128*9c5db199SXin Li failed_message = test_fail.get('message') 129*9c5db199SXin Li failed_stacktrace = test_fail.find('StackTrace').text 130*9c5db199SXin Li 131*9c5db199SXin Li if waivers and test_name in waivers: 132*9c5db199SXin Li waived_count[test_name] = ( 133*9c5db199SXin Li waived_count.get(test_name, 0) + 1) 134*9c5db199SXin Li else: 135*9c5db199SXin Li failed_tests.add(test_name) 136*9c5db199SXin Li 137*9c5db199SXin Li # Check for test completion. 138*9c5db199SXin Li for summary in root.iter('Summary'): 139*9c5db199SXin Li modules_done = summary.get('modules_done') 140*9c5db199SXin Li modules_total = summary.get('modules_total') 141*9c5db199SXin Li 142*9c5db199SXin Li if failed_tests: 143*9c5db199SXin Li logging.error('Failed (but not waived) tests:\n%s', 144*9c5db199SXin Li '\n'.join(sorted(failed_tests))) 145*9c5db199SXin Li 146*9c5db199SXin Li waived = [] 147*9c5db199SXin Li for testname, fail_count in waived_count.items(): 148*9c5db199SXin Li waived += [testname] * fail_count 149*9c5db199SXin Li logging.info('Waived failure for %s %d time(s)', 150*9c5db199SXin Li testname, fail_count) 151*9c5db199SXin Li logging.info('>> Total waived = %s', waived) 152*9c5db199SXin Li return waived 153*9c5db199SXin Li 154*9c5db199SXin Li except Exception as e: 155*9c5db199SXin Li logging.warning('Exception raised in ' 156*9c5db199SXin Li '|tradefed_utils.parse_tradefed_testresults_xml|: {' 157*9c5db199SXin Li '0}'.format(e)) 158*9c5db199SXin Li return [] 159*9c5db199SXin Li 160*9c5db199SXin Li 161*9c5db199SXin Lidef parse_tradefed_result(result, waivers=None): 162*9c5db199SXin Li """Check the result from the tradefed output. 163*9c5db199SXin Li 164*9c5db199SXin Li @param result: The result stdout string from the tradefed command. 165*9c5db199SXin Li @param waivers: a set() of tests which are permitted to fail. 166*9c5db199SXin Li @return List of the waived tests. 167*9c5db199SXin Li """ 168*9c5db199SXin Li # Regular expressions for start/end messages of each test-run chunk. 169*9c5db199SXin Li abi_re = r'arm\S*|x86\S*' 170*9c5db199SXin Li # TODO(kinaba): use the current running module name. 171*9c5db199SXin Li module_re = r'\S+' 172*9c5db199SXin Li start_re = re.compile(r'(?:Start|Continu)ing (%s) %s with' 173*9c5db199SXin Li r' (\d+(?:,\d+)?) test' % (abi_re, module_re)) 174*9c5db199SXin Li end_re = re.compile(r'(%s) %s (?:complet|fail)ed in .*\.' 175*9c5db199SXin Li r' (\d+) passed, (\d+) failed, (\d+) not executed' % 176*9c5db199SXin Li (abi_re, module_re)) 177*9c5db199SXin Li fail_re = re.compile(r'I/ConsoleReporter.* (\S+) fail:') 178*9c5db199SXin Li inaccurate_re = re.compile(r'IMPORTANT: Some modules failed to run to ' 179*9c5db199SXin Li 'completion, tests counts may be inaccurate') 180*9c5db199SXin Li abis = set() 181*9c5db199SXin Li waived_count = dict() 182*9c5db199SXin Li failed_tests = set() 183*9c5db199SXin Li accurate = True 184*9c5db199SXin Li for line in result.splitlines(): 185*9c5db199SXin Li match = start_re.search(line) 186*9c5db199SXin Li if match: 187*9c5db199SXin Li abis = abis.union([match.group(1)]) 188*9c5db199SXin Li continue 189*9c5db199SXin Li match = end_re.search(line) 190*9c5db199SXin Li if match: 191*9c5db199SXin Li abi = match.group(1) 192*9c5db199SXin Li if abi not in abis: 193*9c5db199SXin Li logging.error('Trunk end with %s abi but have not seen ' 194*9c5db199SXin Li 'any trunk start with this abi.(%s)', abi, line) 195*9c5db199SXin Li continue 196*9c5db199SXin Li match = fail_re.search(line) 197*9c5db199SXin Li if match: 198*9c5db199SXin Li testname = match.group(1) 199*9c5db199SXin Li if waivers and testname in waivers: 200*9c5db199SXin Li waived_count[testname] = waived_count.get(testname, 0) + 1 201*9c5db199SXin Li else: 202*9c5db199SXin Li failed_tests.add(testname) 203*9c5db199SXin Li continue 204*9c5db199SXin Li # b/66899135, tradefed may reported inaccuratly with `list results`. 205*9c5db199SXin Li # Add warning if summary section shows that the result is inacurrate. 206*9c5db199SXin Li match = inaccurate_re.search(line) 207*9c5db199SXin Li if match: 208*9c5db199SXin Li accurate = False 209*9c5db199SXin Li 210*9c5db199SXin Li logging.info('Total ABIs: %s', abis) 211*9c5db199SXin Li if failed_tests: 212*9c5db199SXin Li logging.error('Failed (but not waived) tests:\n%s', 213*9c5db199SXin Li '\n'.join(sorted(failed_tests))) 214*9c5db199SXin Li 215*9c5db199SXin Li # TODO(dhaddock): Find a more robust way to apply waivers. 216*9c5db199SXin Li waived = [] 217*9c5db199SXin Li for testname, fail_count in waived_count.items(): 218*9c5db199SXin Li if fail_count > len(abis): 219*9c5db199SXin Li # This should be an error.TestFail, but unfortunately 220*9c5db199SXin Li # tradefed has a bug that emits "fail" twice when a 221*9c5db199SXin Li # test failed during teardown. It will anyway causes 222*9c5db199SXin Li # a test count inconsistency and visible on the dashboard. 223*9c5db199SXin Li logging.error('Found %d failures for %s but there are only %d ' 224*9c5db199SXin Li 'abis: %s', fail_count, testname, len(abis), abis) 225*9c5db199SXin Li fail_count = len(abis) 226*9c5db199SXin Li waived += [testname] * fail_count 227*9c5db199SXin Li logging.info('Waived failure for %s %d time(s)', testname, fail_count) 228*9c5db199SXin Li logging.info('Total waived = %s', waived) 229*9c5db199SXin Li return waived, accurate 230*9c5db199SXin Li 231*9c5db199SXin Li 232*9c5db199SXin Li# A similar implementation in Java can be found at 233*9c5db199SXin Li# https://android.googlesource.com/platform/test/suite_harness/+/refs/heads/\ 234*9c5db199SXin Li# pie-cts-dev/common/util/src/com/android/compatibility/common/util/\ 235*9c5db199SXin Li# ResultHandler.java 236*9c5db199SXin Lidef get_test_result_xml_path(results_destination): 237*9c5db199SXin Li """Get the path of test_result.xml from the last session. 238*9c5db199SXin Li Raises: 239*9c5db199SXin Li Should never raise! 240*9c5db199SXin Li """ 241*9c5db199SXin Li try: 242*9c5db199SXin Li last_result_path = None 243*9c5db199SXin Li for dir in os.listdir(results_destination): 244*9c5db199SXin Li result_dir = os.path.join(results_destination, dir) 245*9c5db199SXin Li result_path = os.path.join(result_dir, 'test_result.xml') 246*9c5db199SXin Li # We use the lexicographically largest path, because |dir| are 247*9c5db199SXin Li # of format YYYY.MM.DD_HH.MM.SS. The last session will always 248*9c5db199SXin Li # have the latest date which leads to the lexicographically 249*9c5db199SXin Li # largest path. 250*9c5db199SXin Li if last_result_path and last_result_path > result_path: 251*9c5db199SXin Li continue 252*9c5db199SXin Li # We need to check for `islink` as `isdir` returns true if 253*9c5db199SXin Li # |result_dir| is a symbolic link to a directory. 254*9c5db199SXin Li if not os.path.isdir(result_dir) or os.path.islink(result_dir): 255*9c5db199SXin Li continue 256*9c5db199SXin Li if not os.path.exists(result_path): 257*9c5db199SXin Li continue 258*9c5db199SXin Li last_result_path = result_path 259*9c5db199SXin Li return last_result_path 260*9c5db199SXin Li except Exception as e: 261*9c5db199SXin Li logging.warning( 262*9c5db199SXin Li 'Exception raised in ' 263*9c5db199SXin Li '|tradefed_utils.get_test_result_xml_path|: {' 264*9c5db199SXin Li '0}'.format(e)) 265*9c5db199SXin Li 266*9c5db199SXin Li 267*9c5db199SXin Lidef get_perf_metrics_from_test_result_xml(result_path, resultsdir): 268*9c5db199SXin Li """Parse test_result.xml and each <Metric /> is mapped to a dict that 269*9c5db199SXin Li can be used as kwargs of |TradefedTest.output_perf_value|. 270*9c5db199SXin Li Raises: 271*9c5db199SXin Li Should never raise! 272*9c5db199SXin Li """ 273*9c5db199SXin Li try: 274*9c5db199SXin Li root = ElementTree.parse(result_path) 275*9c5db199SXin Li for module in root.iter('Module'): 276*9c5db199SXin Li module_name = module.get('name') 277*9c5db199SXin Li for testcase in module.iter('TestCase'): 278*9c5db199SXin Li testcase_name = testcase.get('name') 279*9c5db199SXin Li for test in testcase.iter('Test'): 280*9c5db199SXin Li test_name = test.get('name') 281*9c5db199SXin Li for metric in test.iter('Metric'): 282*9c5db199SXin Li score_type = metric.get('score_type') 283*9c5db199SXin Li if score_type == None: 284*9c5db199SXin Li continue 285*9c5db199SXin Li if score_type not in ['higher_better', 'lower_better']: 286*9c5db199SXin Li logging.warning( 287*9c5db199SXin Li 'Unsupported score_type in %s/%s/%s', 288*9c5db199SXin Li module_name, testcase_name, test_name) 289*9c5db199SXin Li continue 290*9c5db199SXin Li higher_is_better = (score_type == 'higher_better') 291*9c5db199SXin Li units = metric.get('score_unit') 292*9c5db199SXin Li yield dict( 293*9c5db199SXin Li description=testcase_name + '#' + test_name, 294*9c5db199SXin Li value=metric[0].text, 295*9c5db199SXin Li units=units, 296*9c5db199SXin Li higher_is_better=higher_is_better, 297*9c5db199SXin Li resultsdir=os.path.join(resultsdir, 'tests', 298*9c5db199SXin Li PERF_MODULE_NAME_PREFIX + module_name) 299*9c5db199SXin Li ) 300*9c5db199SXin Li except Exception as e: 301*9c5db199SXin Li logging.warning( 302*9c5db199SXin Li 'Exception raised in ' 303*9c5db199SXin Li '|tradefed_utils.get_perf_metrics_from_test_result_xml|: {' 304*9c5db199SXin Li '0}'.format(e)) 305