# Lint as: python2, python3
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import datetime
import logging
import time
import six
import warnings

import common

from autotest_lib.client.common_lib import base_job
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import priorities
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite.suite import ProvisionSuite
from autotest_lib.server.cros.dynamic_suite.suite import Suite
from autotest_lib.tko import utils as tko_utils


"""CrOS dynamic test suite generation and execution module.

This module implements runtime-generated test suites for CrOS.
Design doc: http://goto.google.com/suitesv2

Individual tests can declare themselves as a part of one or more
suites, and the code here enables control files to be written
that can refer to these "dynamic suites" by name. We also provide
support for reimaging devices with a given build and running a
dynamic suite across all reimaged devices.

The public API for defining a suite includes one method: reimage_and_run().
A suite control file can be written by importing this module and making
an appropriate call to this single method. In normal usage, this control
file will be run in a 'hostless' server-side autotest job, scheduling
sub-jobs to do the needed reimaging and test running.

Example control file:

import common
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import dynamic_suite

dynamic_suite.reimage_and_run(
    builds={provision.CROS_VERSION_PREFIX: build}, board=board, name='bvt',
    job=job, pool=pool, check_hosts=check_hosts, add_experimental=True,
    devserver_url=devserver_url)

This will -- at runtime -- find all control files that contain "bvt" in their
"SUITE=" clause, schedule jobs to reimage devices in the
specified pool of the specified board with the specified build and, upon
completion of those jobs, schedule and wait for jobs that run all the tests it
discovered.

Suites can be run by using the atest command-line tool:
  atest suite create -b <board> -i <build/name> <suite>
e.g.
  atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt

-------------------------------------------------------------------------
Implementation details

A Suite instance represents a single test suite, defined by some predicate
run over all known control files. The simplest example is creating a Suite
by 'name'.

create_suite_job() takes the parameters needed to define a suite run (board,
build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
schedules the hostless job that will do the rest of the work.

Note that we have more than one Dev server in our test lab architecture.
We currently load balance per-build being tested, so one and only one dev
server is used by any given run through the reimaging/testing flow.

- create_suite_job()
The primary role of create_suite_job() is to ensure that the required
artifacts for the build to be tested are staged on the dev server. This
includes payloads required to autoupdate machines to the desired build, as
well as the autotest control files appropriate for that build. Then, the
RPC pulls the control file for the suite to be run from the dev server and
uses it to create the suite job with the autotest frontend.

                 +----------------+
                 | Google Storage |                         Client
                 +----------------+                            |
                         | ^                                   | create_suite_job()
           payloads/     | |                                   |
           control files | | request                           |
                         V |                                   V
       +-------------+   download request    +--------------------------+
       |             |<----------------------|                          |
       | Dev Server  |                       | Autotest Frontend (AFE)  |
       |             |---------------------->|                          |
       +-------------+  suite control file   +--------------------------+
                                                          |
                                                          V
                                                  Suite Job (hostless)

- Reimage and Run
The overall process is to schedule all the tests, and then wait for the tests
to complete.

- The Reimaging Process

As an artifact of an old implementation, the number of machines to use
is called the 'sharding_factor', and the default is defined in the [CROS]
section of global_config.ini.

There used to be a 'num' parameter to control the maximum number of
machines, but it does not do anything any more.

A test control file can specify a list of DEPENDENCIES, which are really just
the set of labels a host needs to have in order for that test to be scheduled
on it. In the case of a dynamic_suite, many tests in the suite may have
DEPENDENCIES specified. All tests are scheduled with the DEPENDENCIES that
they specify, along with any suite dependencies that were specified, and the
scheduler will find and provision a host capable of running the test.

- Scheduling Suites
A Suite instance uses the labels specified in the suite dependencies to
schedule tests across all the hosts in the pool. It then waits for all these
jobs. As an optimization, the Dev server stages the payloads necessary to
run a suite in the background _after_ it has completed all the things
necessary for reimaging. Before running a suite, reimage_and_run() calls out
to the Dev server and blocks until it's completed staging all build artifacts
needed to run test suites.

Step by step:
0) At instantiation time, find all appropriate control files for this suite
   that were included in the build to be tested. To do this, we consult the
   Dev Server, where all these control files are staged.

       +------------+    control files?     +--------------------------+
       |            |<----------------------|                          |
       | Dev Server |                       | Autotest Frontend (AFE)  |
       |            |---------------------->|       [Suite Job]        |
       +------------+    control files!     +--------------------------+

1) Now that the Suite instance exists, it schedules jobs for every control
   file it deemed appropriate, to be run on the hosts that were labeled
   by the provisioning. We stuff keyvals into these jobs, indicating what
   build they were testing and which suite they were for.

   +--------------------------+ Job for VersLabel       +--------+
   |                          |------------------------>| Host 1 | VersLabel
   | Autotest Frontend (AFE)  |            +--------+   +--------+
   |       [Suite Job]        |----------->| Host 2 |
   +--------------------------+ Job for    +--------+
       |                ^       VersLabel   VersLabel
       |                |
       +----------------+
        One job per test
        {'build': build/name,
         'suite': suite_name}

2) Now that all jobs are scheduled, they'll be doled out as labeled hosts
   finish their assigned work and become available again.

- Waiting on Suites
0) As we clean up each test job, we check to see if any crashes occurred. If
   they did, we look at the 'build' keyval in the job to see which build's debug
   symbols we'll need to symbolicate the crash dump we just found.

1) Using this info, we tell a special Crash Server to stage the required debug
   symbols. Once that's done, we ask the Crash Server to use those symbols to
   symbolicate the crash dump in question.

                 +----------------+
                 | Google Storage |
                 +----------------+
                         | ^
                symbols! | | symbols?
                         V |
       +------------+  stage symbols for build  +--------------------------+
       |            |<--------------------------|                          |
       |   Crash    |                           |                          |
       |   Server   |    dump to symbolicate    | Autotest Frontend (AFE)  |
       |            |<--------------------------|       [Suite Job]        |
       |            |-------------------------->|                          |
       +------------+     symbolicated dump     +--------------------------+

2) As jobs finish, we record their success or failure in the status of the suite
   job. We also record a 'job keyval' in the suite job for each test, noting
   the job ID and job owner. This can be used to refer to test logs later.
3) Once all jobs are complete, status is recorded for the suite job, and the
   job_repo_url host attribute is removed from all hosts used by the suite.

"""


# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.

class _SuiteSpec(object):
    """This class contains the info that defines a suite run."""

    # Keyword arguments that must be supplied, non-empty, and of the given
    # type; enforced by _check_init_params().
    _REQUIRED_KEYWORDS = {
            'board': str,
            'builds': dict,
            'name': str,
            'job': base_job.base_job,
            'devserver_url': str,
    }

    # Keys of `builds` whose values are passed through the devserver's
    # translate() by _translate_builds().
    _VERSION_PREFIXES = frozenset((
            provision.CROS_VERSION_PREFIX,
            provision.CROS_ANDROID_VERSION_PREFIX,
    ))

    def __init__(
            self,
            builds=None,
            board=None,
            name=None,
            job=None,
            devserver_url=None,
            pool=None,
            check_hosts=True,
            add_experimental=True,
            file_bugs=False,
            max_runtime_mins=24*60,
            timeout_mins=24*60,
            suite_dependencies=None,
            bug_template=None,
            priority=priorities.Priority.DEFAULT,
            predicate=None,
            wait_for_results=True,
            job_retry=False,
            max_retries=None,
            offload_failures_only=False,
            test_source_build=None,
            run_prod_code=False,
            delay_minutes=0,
            job_keyvals=None,
            test_args=None,
            child_dependencies=(),
            test_names=None,
            **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        Currently required args:
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}
                       The dict is copied; the caller's dict is not modified.
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.

        Currently supported optional args:
        @param pool: the pool of machines to use for scheduling purposes.
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
        @param file_bugs: File bugs when tests in this suite fail.
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs
                             that this suite runs.
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
                                   The sequence is copied; the caller's object
                                   is not modified.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
        @param priority: Integer priority level. Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish.
        @param job_retry: Set to True to enable job-level retry.
        @param max_retries: Maximum retry limit at suite level if not None.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed max_retries.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. None uses the server-side test code from
                builds['cros-version:'].
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param delay_minutes: Delay the creation of test jobs for a given number
                              of minutes.
        @param job_keyvals: General job keyvals to be inserted into keyval file
        @param test_args: A dict of args passed all the way to each individual
                          test that will be actually ran.
        @param child_dependencies: (optional) list of dependency strings
                to be added as dependencies to child jobs.
        @param test_names: (optional) if provided, Suite will consist of the
                tests named in this list.
        @param **dargs: these arguments will be ignored. This allows us to
                        deprecate and remove arguments in ToT while not
                        breaking branch builds.

        @raises SuiteArgumentException: if a required argument is missing,
                                        empty, or of the wrong type.
        """
        self._check_init_params(
                board=board,
                builds=builds,
                name=name,
                job=job,
                devserver_url=devserver_url)

        self.board = 'board:%s' % board
        # Shallow copy: _translate_builds() rewrites entries in place and must
        # not reach back into the dict owned by the caller.
        self.builds = dict(builds)
        self.name = name
        self.job = job
        self.pool = ('pool:%s' % pool) if pool else pool
        self.check_hosts = check_hosts
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout_mins = timeout_mins
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code
        self.delay_minutes = delay_minutes
        self.job_keyvals = job_keyvals
        self.test_args = test_args
        self.child_dependencies = child_dependencies

        # Order matters below: the devserver is needed to translate build
        # names, and the builds are appended to the suite dependencies only
        # after they have been translated.
        self._init_predicate(predicate, test_names)
        self._init_suite_dependencies(suite_dependencies)
        self._init_devserver(devserver_url)
        self._init_test_source_build(test_source_build)
        self._translate_builds()
        self._add_builds_to_suite_deps()

        for key, value in six.iteritems(dargs):
            warnings.warn('Ignored key %r was passed to suite with value %r'
                          % (key, value))

    def _check_init_params(self, **kwargs):
        """Verify that every required keyword is present and well typed.

        @param kwargs: candidate values, keyed by the names listed in
                       _REQUIRED_KEYWORDS.
        @raises SuiteArgumentException: if a value is falsy or is not an
                                        instance of the expected type.
        """
        for key, expected_type in six.iteritems(self._REQUIRED_KEYWORDS):
            value = kwargs.get(key)
            # TODO(ayatane): `not value` includes both the cases where value is
            # None and where value is the correct type, but empty (e.g., empty
            # dict). It looks like this is NOT the intended behavior, but I'm
            # hesitant to remove it in case something is actually relying on
            # this behavior.
            if not value or not isinstance(value, expected_type):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>'
                        % (key, expected_type))

    def _init_predicate(self, predicate, test_names):
        """Initialize predicate attribute.

        Precedence: an explicit list of test names, then a caller supplied
        predicate, then matching on the suite name.

        @param predicate: optional callable used to select control files.
        @param test_names: optional list of test names to select.
        """
        if test_names:
            self.predicate = Suite.test_name_in_list_predicate(test_names)
            return

        if predicate:
            self.predicate = predicate
            return

        self.predicate = Suite.name_in_tag_predicate(self.name)

    def _init_suite_dependencies(self, suite_dependencies):
        """Initialize suite dependencies attribute.

        The attribute is always a list owned by this object, so that
        _add_builds_to_suite_deps() can extend it without touching the
        caller's data.

        @param suite_dependencies: None, a comma separated string of labels
                                   (accepted for backwards compatibility), or
                                   an iterable of label strings.
        """
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies,
                        (six.string_types, six.binary_type)):
            # Normalize unicode (python2) and bytes (python3, e.g. values that
            # went through byteify()) to the native str type before splitting.
            labels = six.ensure_str(suite_dependencies).split(',')
            stripped = (label.strip(' ') for label in labels)
            # Drop empty entries so that '' or a trailing comma does not turn
            # into a dependency on an empty label.
            self.suite_dependencies = [label for label in stripped if label]
        else:
            self.suite_dependencies = list(suite_dependencies)

    def _init_devserver(self, devserver_url):
        """Initialize devserver attribute.

        @param devserver_url: url to the selected devserver.
        """
        self.devserver = dev_server.ImageServer(devserver_url)

    def _init_test_source_build(self, test_source_build):
        """Initialize test_source_build attribute.

        @param test_source_build: build providing the server-side test code,
                                  or a falsy value to let
                                  Suite.get_test_source_build() choose one
                                  from self.builds.
        """
        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

    def _translate_builds(self):
        """Translate build names if they are in LATEST format."""
        for prefix in self._VERSION_PREFIXES:
            if prefix in self.builds:
                translated_build = self.devserver.translate(
                        self.builds[prefix])
                self.builds[prefix] = translated_build

    def _add_builds_to_suite_deps(self):
        """Add builds to suite_dependencies.

        To support provision both CrOS and firmware, option builds are added to
        _SuiteSpec, e.g.,

        builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                  'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        version_prefix+build should make it into each test as a DEPENDENCY.
        The easiest way to do this is to tack it onto the suite_dependencies.
        """
        self.suite_dependencies.extend(
                provision.join(version_prefix, build)
                for version_prefix, build in six.iteritems(self.builds)
        )


class _ProvisionSuiteSpec(_SuiteSpec):
    """A _SuiteSpec that additionally records `num_required`."""

    def __init__(self, num_required, **kwargs):
        """
        @param num_required: stored as-is and handed to ProvisionSuite by
                             run_provision_suite().
        @param kwargs: forwarded unchanged to _SuiteSpec.__init__().
        """
        self.num_required = num_required
        super(_ProvisionSuiteSpec, self).__init__(**kwargs)


def _create_rpc_proxies(user):
    """Create the retrying AFE and TKO proxies used by a suite job.

    @param user: user name the RPCs are issued as.
    @return: an (afe, tko) tuple.
    """
    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=user, debug=False)
    return afe, tko


def _determine_own_job_id(job):
    """Work out the AFE job id of the currently running suite job.

    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @return: the job id as an int, or None if it cannot be derived from the
             job's tag.
    """
    try:
        my_job_id = int(tko_utils.get_afe_job_id(job.tag))
    except (TypeError, ValueError):
        logging.warning('Could not determine own job id.')
        return None
    logging.debug('Determined own job id: %d', my_job_id)
    return my_job_id


def run_provision_suite(**dargs):
    """
    Run a provision suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds by scheduling stub_Pass.

    @param dargs: Dictionary containing the arguments passed to
                  _ProvisionSuiteSpec(), i.e. `num_required` plus the
                  arguments of _SuiteSpec(), including:
                  job: an instance of client.common_lib.base_job representing
                       the currently running suite job.

    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    spec = _ProvisionSuiteSpec(**dargs)

    afe, tko = _create_rpc_proxies(spec.job.user)
    my_job_id = _determine_own_job_id(spec.job)

    suite = ProvisionSuite(
            tag=spec.name,
            builds=spec.builds,
            board=spec.board,
            devserver=spec.devserver,
            num_required=spec.num_required,
            afe=afe,
            tko=tko,
            pool=spec.pool,
            results_dir=spec.job.resultdir,
            max_runtime_mins=spec.max_runtime_mins,
            timeout_mins=spec.timeout_mins,
            file_bugs=spec.file_bugs,
            suite_job_id=my_job_id,
            extra_deps=spec.suite_dependencies,
            priority=spec.priority,
            wait_for_results=spec.wait_for_results,
            job_retry=spec.job_retry,
            max_retries=spec.max_retries,
            offload_failures_only=spec.offload_failures_only,
            test_source_build=spec.test_source_build,
            run_prod_code=spec.run_prod_code,
            job_keyvals=spec.job_keyvals,
            test_args=spec.test_args,
            child_dependencies=spec.child_dependencies,
    )

    _run_suite_with_spec(suite, spec)

    logging.debug('Returning from dynamic_suite.run_provision_suite')


def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds, and then run the indicated test suite on them.
    Guaranteed to be compatible with any build from stable to dev.

    @param dargs: Dictionary containing the arguments passed to _SuiteSpec().
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    suite_spec = _SuiteSpec(**dargs)

    afe, tko = _create_rpc_proxies(suite_spec.job.user)
    # Use the vetted job held by the spec, as run_provision_suite() does,
    # rather than reaching back into the raw keyword dictionary.
    my_job_id = _determine_own_job_id(suite_spec.job)

    _perform_reimage_and_run(suite_spec, afe, tko, suite_job_id=my_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')


def _perform_reimage_and_run(spec, afe, tko, suite_job_id=None):
    """
    Do the work of reimaging hosts and running tests.

    @param spec: a populated _SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    @raises AsynchronousBuildFailure: if staging the control files on the
                                      devserver fails.
    """
    # We can't create the suite until the devserver has finished downloading
    # control_files and test_suites packages so that we can get the control
    # files to schedule.
    if not spec.run_prod_code:
        _stage_artifacts_for_build(spec.devserver, spec.test_source_build)
    suite = Suite.create_from_predicates(
            predicates=[spec.predicate],
            name=spec.name,
            builds=spec.builds,
            board=spec.board,
            devserver=spec.devserver,
            afe=afe,
            tko=tko,
            pool=spec.pool,
            results_dir=spec.job.resultdir,
            max_runtime_mins=spec.max_runtime_mins,
            timeout_mins=spec.timeout_mins,
            file_bugs=spec.file_bugs,
            suite_job_id=suite_job_id,
            extra_deps=spec.suite_dependencies,
            priority=spec.priority,
            wait_for_results=spec.wait_for_results,
            job_retry=spec.job_retry,
            max_retries=spec.max_retries,
            offload_failures_only=spec.offload_failures_only,
            test_source_build=spec.test_source_build,
            run_prod_code=spec.run_prod_code,
            job_keyvals=spec.job_keyvals,
            test_args=spec.test_args,
            child_dependencies=spec.child_dependencies,
    )
    _run_suite_with_spec(suite, spec)


def _run_suite_with_spec(suite, spec):
    """
    Run a suite using the job, delay and bug template held by a spec.

    @param suite: _BaseSuite instance to run.
    @param spec: a populated _SuiteSpec object.
    """
    _run_suite(
            suite=suite,
            job=spec.job,
            delay_minutes=spec.delay_minutes,
            bug_template=spec.bug_template)


def _run_suite(
        suite,
        job,
        delay_minutes,
        bug_template):
    """
    Run a suite.

    @param suite: _BaseSuite instance.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param delay_minutes: Delay the creation of test jobs for a given number
                          of minutes.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.
                         Accepted for interface compatibility; not used here.
    """
    timestamp = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    utils.write_keyval(
            job.resultdir,
            {constants.ARTIFACT_FINISHED_TIME: timestamp})

    if delay_minutes:
        logging.debug('delay_minutes is set. Sleeping %d minutes before '
                      'creating test jobs.', delay_minutes)
        time.sleep(delay_minutes*60)
        logging.debug('Finished waiting for %d minutes before creating test '
                      'jobs.', delay_minutes)

    # Now we get to asynchronously schedule tests.
    suite.schedule(job.record_entry)

    if suite.wait_for_results:
        logging.debug('Waiting on suite.')
        suite.wait(job.record_entry)
        # This function serves both reimage_and_run() and
        # run_provision_suite(), so name the function we are really leaving.
        logging.debug('Finished waiting on suite. '
                      'Returning from _run_suite.')
    else:
        logging.info('wait_for_results is set to False, suite job will exit '
                     'without waiting for test jobs to finish.')


def _stage_artifacts_for_build(devserver, build):
    """Stage artifacts for a suite job.

    @param devserver: devserver to stage artifacts with.
    @param build: image to stage artifacts for.
    @raises AsynchronousBuildFailure: if the devserver reports a failure
                                      while staging.
    """
    try:
        devserver.stage_artifacts(
                image=build,
                artifacts=['control_files', 'test_suites'])
    except dev_server.DevServerException as e:
        # If we can't get the control files, there's nothing to run.
        raise error.AsynchronousBuildFailure(e)


# This function is used by the cros_test_platform suite, to unwrap json-decoded
# arguments from the cros_test_platform recipe and convert them to byte string.
#
# It should not be used for other purposes. It exists in this module simply
# to limit the number of necessary module imports in cros_test_platform.
def byteify(input_):
    """Return a copy of a json object with text strings encoded as bytes.

    Dicts (keys and values) and lists are walked recursively; text strings
    are encoded as UTF-8; every other value is returned unchanged.

    @param input_: a json-decoded value.
    @return: the converted value.
    """
    if isinstance(input_, six.text_type):
        return six.ensure_binary(input_, 'utf-8')

    if isinstance(input_, list):
        return [byteify(item) for item in input_]

    if isinstance(input_, dict):
        converted = {}
        for raw_key, raw_value in six.iteritems(input_):
            converted[byteify(raw_key)] = byteify(raw_value)
        return converted

    # Numbers, booleans, None and anything else pass through untouched.
    return input_