xref: /aosp_15_r20/external/autotest/server/cros/dynamic_suite/dynamic_suite.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Lint as: python2, python3
2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6from __future__ import print_function
7from __future__ import division
8from __future__ import absolute_import
9
10import datetime
11import logging
12import time
13import six
14import warnings
15
16import common
17
18from autotest_lib.client.common_lib import base_job
19from autotest_lib.client.common_lib import error
20from autotest_lib.client.common_lib import priorities
21from autotest_lib.client.common_lib import time_utils
22from autotest_lib.client.common_lib import utils
23from autotest_lib.client.common_lib.cros import dev_server
24from autotest_lib.server.cros import provision
25from autotest_lib.server.cros.dynamic_suite import constants
26from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
27from autotest_lib.server.cros.dynamic_suite.suite import ProvisionSuite
28from autotest_lib.server.cros.dynamic_suite.suite import Suite
29from autotest_lib.tko import utils as tko_utils
30
31
32"""CrOS dynamic test suite generation and execution module.
33
34This module implements runtime-generated test suites for CrOS.
35Design doc: http://goto.google.com/suitesv2
36
37Individual tests can declare themselves as a part of one or more
38suites, and the code here enables control files to be written
39that can refer to these "dynamic suites" by name.  We also provide
40support for reimaging devices with a given build and running a
41dynamic suite across all reimaged devices.
42
43The public API for defining a suite includes one method: reimage_and_run().
44A suite control file can be written by importing this module and making
45an appropriate call to this single method.  In normal usage, this control
46file will be run in a 'hostless' server-side autotest job, scheduling
47sub-jobs to do the needed reimaging and test running.
48
49Example control file:
50
51import common
52from autotest_lib.server.cros import provision
53from autotest_lib.server.cros.dynamic_suite import dynamic_suite
54
55dynamic_suite.reimage_and_run(
56    builds={provision.CROS_VERSION_PREFIX: build}, board=board, name='bvt',
57    job=job, pool=pool, check_hosts=check_hosts, add_experimental=True,
58    devserver_url=devserver_url)
59
60This will -- at runtime -- find all control files that contain "bvt" in their
61"SUITE=" clause, schedule jobs to reimage devices in the
62specified pool of the specified board with the specified build and, upon
63completion of those jobs, schedule and wait for jobs that run all the tests it
64discovered.
65
66Suites can be run by using the atest command-line tool:
67  atest suite create -b <board> -i <build/name> <suite>
68e.g.
69  atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt
70
71-------------------------------------------------------------------------
72Implementation details
73
74A Suite instance represents a single test suite, defined by some predicate
75run over all known control files.  The simplest example is creating a Suite
76by 'name'.
77
78create_suite_job() takes the parameters needed to define a suite run (board,
79build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
81schedules the hostless job that will do the rest of the work.
82
83Note that we have more than one Dev server in our test lab architecture.
84We currently load balance per-build being tested, so one and only one dev
85server is used by any given run through the reimaging/testing flow.
86
87- create_suite_job()
88The primary role of create_suite_job() is to ensure that the required
89artifacts for the build to be tested are staged on the dev server.  This
90includes payloads required to autoupdate machines to the desired build, as
91well as the autotest control files appropriate for that build.  Then, the
92RPC pulls the control file for the suite to be run from the dev server and
93uses it to create the suite job with the autotest frontend.
94
95     +----------------+
96     | Google Storage |                                Client
97     +----------------+                                   |
98               | ^                                        | create_suite_job()
99 payloads/     | |                                        |
100 control files | | request                                |
101               V |                                        V
102       +-------------+   download request    +--------------------------+
103       |             |<----------------------|                          |
104       | Dev Server  |                       | Autotest Frontend (AFE)  |
105       |             |---------------------->|                          |
106       +-------------+  suite control file   +--------------------------+
107                                                          |
108                                                          V
109                                                      Suite Job (hostless)
110
111- Reimage and Run
112The overall process is to schedule all the tests, and then wait for the tests
113to complete.
114
115- The Reimaging Process
116
117As an artifact of an old implementation, the number of machines to use
118is called the 'sharding_factor', and the default is defined in the [CROS]
119section of global_config.ini.
120
121There used to be a 'num' parameter to control the maximum number of
122machines, but it does not do anything any more.
123
124A test control file can specify a list of DEPENDENCIES, which are really just
125the set of labels a host needs to have in order for that test to be scheduled
126on it.  In the case of a dynamic_suite, many tests in the suite may have
127DEPENDENCIES specified.  All tests are scheduled with the DEPENDENCIES that
128they specify, along with any suite dependencies that were specified, and the
129scheduler will find and provision a host capable of running the test.
130
131- Scheduling Suites
132A Suite instance uses the labels specified in the suite dependencies to
133schedule tests across all the hosts in the pool.  It then waits for all these
134jobs.  As an optimization, the Dev server stages the payloads necessary to
135run a suite in the background _after_ it has completed all the things
136necessary for reimaging.  Before running a suite, reimage_and_run() calls out
137to the Dev server and blocks until it's completed staging all build artifacts
138needed to run test suites.
139
140Step by step:
1410) At instantiation time, find all appropriate control files for this suite
142   that were included in the build to be tested.  To do this, we consult the
143   Dev Server, where all these control files are staged.
144
145          +------------+    control files?     +--------------------------+
146          |            |<----------------------|                          |
147          | Dev Server |                       | Autotest Frontend (AFE)  |
148          |            |---------------------->|       [Suite Job]        |
149          +------------+    control files!     +--------------------------+
150
1511) Now that the Suite instance exists, it schedules jobs for every control
152   file it deemed appropriate, to be run on the hosts that were labeled
153   by the provisioning.  We stuff keyvals into these jobs, indicating what
154   build they were testing and which suite they were for.
155
156   +--------------------------+ Job for VersLabel       +--------+
157   |                          |------------------------>| Host 1 | VersLabel
158   | Autotest Frontend (AFE)  |            +--------+   +--------+
159   |       [Suite Job]        |----------->| Host 2 |
160   +--------------------------+ Job for    +--------+
161       |                ^       VersLabel        VersLabel
162       |                |
163       +----------------+
164        One job per test
165        {'build': build/name,
166         'suite': suite_name}
167
1682) Now that all jobs are scheduled, they'll be doled out as labeled hosts
169   finish their assigned work and become available again.
170
171- Waiting on Suites
1720) As we clean up each test job, we check to see if any crashes occurred.  If
173   they did, we look at the 'build' keyval in the job to see which build's debug
174   symbols we'll need to symbolicate the crash dump we just found.
175
1761) Using this info, we tell a special Crash Server to stage the required debug
177   symbols. Once that's done, we ask the Crash Server to use those symbols to
178   symbolicate the crash dump in question.
179
180     +----------------+
181     | Google Storage |
182     +----------------+
183          |     ^
184 symbols! |     | symbols?
185          V     |
186      +------------+  stage symbols for build  +--------------------------+
187      |            |<--------------------------|                          |
188      |   Crash    |                           |                          |
189      |   Server   |   dump to symbolicate     | Autotest Frontend (AFE)  |
190      |            |<--------------------------|       [Suite Job]        |
191      |            |-------------------------->|                          |
192      +------------+    symbolicated dump      +--------------------------+
193
1942) As jobs finish, we record their success or failure in the status of the suite
195   job.  We also record a 'job keyval' in the suite job for each test, noting
196   the job ID and job owner.  This can be used to refer to test logs later.
1973) Once all jobs are complete, status is recorded for the suite job, and the
198   job_repo_url host attribute is removed from all hosts used by the suite.
199
200"""
201
202
203# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
204
class _SuiteSpec(object):
    """This class contains the info that defines a suite run.

    Construction validates the required arguments, stores the remaining ones
    as attributes, and then derives the predicate, the devserver handle, the
    test source build and the suite dependencies from them.
    """

    # Arguments that must be supplied with a truthy value, mapped to the type
    # each value has to be an instance of.
    _REQUIRED_KEYWORDS = {
            'board': str,
            'builds': dict,
            'name': str,
            'job': base_job.base_job,
            'devserver_url': str,
    }

    # Keys of `builds` whose values are passed through devserver.translate().
    _VERSION_PREFIXES = frozenset((
            provision.CROS_VERSION_PREFIX,
            provision.CROS_ANDROID_VERSION_PREFIX,
    ))

    def __init__(
            self,
            builds=None,
            board=None,
            name=None,
            job=None,
            devserver_url=None,
            pool=None,
            check_hosts=True,
            add_experimental=True,
            file_bugs=False,
            max_runtime_mins=24*60,
            timeout_mins=24*60,
            suite_dependencies=None,
            bug_template=None,
            priority=priorities.Priority.DEFAULT,
            predicate=None,
            wait_for_results=True,
            job_retry=False,
            max_retries=None,
            offload_failures_only=False,
            test_source_build=None,
            run_prod_code=False,
            delay_minutes=0,
            job_keyvals=None,
            test_args=None,
            child_dependencies=(),
            test_names=None,
            **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        Currently required args:
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.

        Currently supported optional args:
        @param pool: the pool of machines to use for scheduling purposes.
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
        @param file_bugs: File bugs when tests in this suite fail.
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs
                             that this suite runs.
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
                                   The value is copied; the caller's object is
                                   never modified.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
        @param priority: Integer priority level.  Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite of based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish.
        @param job_retry: Set to True to enable job-level retry.
        @param max_retries: Maximum retry limit at suite level if not None.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed max_retries.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. None uses the server-side test code from
                builds['cros-version:'].
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param delay_minutes: Delay the creation of test jobs for a given number
                              of minutes.
        @param job_keyvals: General job keyvals to be inserted into keyval file
        @param test_args: A dict of args passed all the way to each individual
                          test that will be actually ran.
        @param child_dependencies: (optional) list of dependency strings
                to be added as dependencies to child jobs.
        @param test_names: (optional) if provided, Suite will consist of the
                tests named in this list.
        @param **dargs: these arguments will be ignored.  This allows us to
                        deprecate and remove arguments in ToT while not
                        breaking branch builds.

        @raises SuiteArgumentException: if a required argument is missing,
                                        empty, or of the wrong type.
        """
        self._check_init_params(
                board=board,
                builds=builds,
                name=name,
                job=job,
                devserver_url=devserver_url)

        # board and pool are stored as labels ('board:<x>' / 'pool:<x>'); a
        # falsy pool is kept as given.
        self.board = 'board:%s' % board
        self.builds = builds
        self.name = name
        self.job = job
        self.pool = ('pool:%s' % pool) if pool else pool
        self.check_hosts = check_hosts
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout_mins = timeout_mins
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code
        self.delay_minutes = delay_minutes
        self.job_keyvals = job_keyvals
        self.test_args = test_args
        self.child_dependencies = child_dependencies

        # Order matters: the devserver must exist before builds can be
        # translated, and suite_dependencies must exist before the builds are
        # appended to it.
        self._init_predicate(predicate, test_names)
        self._init_suite_dependencies(suite_dependencies)
        self._init_devserver(devserver_url)
        self._init_test_source_build(test_source_build)
        self._translate_builds()
        self._add_builds_to_suite_deps()

        for key, value in six.iteritems(dargs):
            warnings.warn('Ignored key %r was passed to suite with value %r'
                          % (key, value))

    def _check_init_params(self, **kwargs):
        """Verify that every required keyword is present and well typed.

        @param kwargs: the candidate values, keyed by argument name.

        @raises SuiteArgumentException: if a value is falsy or is not an
                                        instance of the expected type.
        """
        for key, expected_type in six.iteritems(self._REQUIRED_KEYWORDS):
            value = kwargs.get(key)
            # TODO(ayatane): `not value` includes both the cases where value is
            # None and where value is the correct type, but empty (e.g., empty
            # dict).  It looks like this is NOT the intended behavior, but I'm
            # hesitant to remove it in case something is actually relying on
            # this behavior.
            if not value or not isinstance(value, expected_type):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>'
                        % (key, expected_type))

    def _init_predicate(self, predicate, test_names):
        """Initialize predicate attribute.

        Precedence: an explicit list of test names, then a caller supplied
        predicate, then matching on the suite name.
        """
        if test_names:
            self.predicate = Suite.test_name_in_list_predicate(test_names)
            return

        if predicate:
            self.predicate = predicate
            return

        self.predicate = Suite.name_in_tag_predicate(self.name)

    def _init_suite_dependencies(self, suite_dependencies):
        """Initialize suite dependencies attribute.

        The stored value is always a new list.  _add_builds_to_suite_deps()
        extends it in place, and that must neither modify the object the
        caller handed in nor fail because the caller passed a tuple.

        @param suite_dependencies: None, a string of comma separated labels,
                                   or an iterable of label strings.
        """
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies, six.string_types):
            # six.string_types also matches unicode strings on python2.
            self.suite_dependencies = [dep.strip(' ') for dep
                                       in suite_dependencies.split(',')]
        else:
            self.suite_dependencies = list(suite_dependencies)

    def _init_devserver(self, devserver_url):
        """Initialize devserver attribute."""
        self.devserver = dev_server.ImageServer(devserver_url)

    def _init_test_source_build(self, test_source_build):
        """Initialize test_source_build attribute."""
        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

    def _translate_builds(self):
        """Translate build names if they are in LATEST format."""
        for prefix in self._VERSION_PREFIXES:
            if prefix in self.builds:
                translated_build = self.devserver.translate(
                        self.builds[prefix])
                self.builds[prefix] = translated_build

    def _add_builds_to_suite_deps(self):
        """Add builds to suite_dependencies.

        To support provision both CrOS and firmware, option builds are added to
        _SuiteSpec, e.g.,

        builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                  'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        version_prefix+build should make it into each test as a DEPENDENCY.
        The easiest way to do this is to tack it onto the suite_dependencies.
        """
        self.suite_dependencies.extend(
                provision.join(version_prefix, build)
                for version_prefix, build in six.iteritems(self.builds)
        )
428
429
class _ProvisionSuiteSpec(_SuiteSpec):
    """A _SuiteSpec that additionally records num_required."""

    def __init__(self, num_required, **spec_kwargs):
        """
        Store num_required, then initialize the rest through _SuiteSpec.

        @param num_required: value kept on the spec as self.num_required.
        @param spec_kwargs: keyword arguments forwarded unchanged to
                            _SuiteSpec.__init__().
        """
        # Set before delegating, exactly as the base initializer expects
        # nothing about this attribute.
        self.num_required = num_required
        super(_ProvisionSuiteSpec, self).__init__(**spec_kwargs)
435
436
def run_provision_suite(**dargs):
    """
    Run a provision suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds by scheduling stub_Pass.

    @param dargs: keyword arguments used to build a _ProvisionSuiteSpec.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.

    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    spec = _ProvisionSuiteSpec(**dargs)
    suite_job = spec.job

    # Both RPC clients run as the suite job's user with identical retry
    # settings.
    afe = frontend_wrappers.RetryingAFE(
            timeout_min=30, delay_sec=10, user=suite_job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(
            timeout_min=30, delay_sec=10, user=suite_job.user, debug=False)

    # The suite's own job id is derived from the job tag; when that cannot be
    # converted to an int the suite is created with suite_job_id=None.
    try:
        my_job_id = int(tko_utils.get_afe_job_id(suite_job.tag))
        logging.debug('Determined own job id: %d', my_job_id)
    except (TypeError, ValueError):
        my_job_id = None
        logging.warning('Could not determine own job id.')

    suite_kwargs = {
            'tag': spec.name,
            'builds': spec.builds,
            'board': spec.board,
            'devserver': spec.devserver,
            'num_required': spec.num_required,
            'afe': afe,
            'tko': tko,
            'pool': spec.pool,
            'results_dir': suite_job.resultdir,
            'max_runtime_mins': spec.max_runtime_mins,
            'timeout_mins': spec.timeout_mins,
            'file_bugs': spec.file_bugs,
            'suite_job_id': my_job_id,
            'extra_deps': spec.suite_dependencies,
            'priority': spec.priority,
            'wait_for_results': spec.wait_for_results,
            'job_retry': spec.job_retry,
            'max_retries': spec.max_retries,
            'offload_failures_only': spec.offload_failures_only,
            'test_source_build': spec.test_source_build,
            'run_prod_code': spec.run_prod_code,
            'job_keyvals': spec.job_keyvals,
            'test_args': spec.test_args,
            'child_dependencies': spec.child_dependencies,
    }
    provision_suite = ProvisionSuite(**suite_kwargs)

    _run_suite_with_spec(provision_suite, spec)

    logging.debug('Returning from dynamic_suite.run_provision_suite')
496
497
def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds, and then run the indicated test suite on them.
    Guaranteed to be compatible with any build from stable to dev.

    @param dargs: Dictionary containing the arguments passed to _SuiteSpec().
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    spec = _SuiteSpec(**dargs)
    job_user = spec.job.user

    # Both RPC clients run as the suite job's user with identical retry
    # settings.
    afe = frontend_wrappers.RetryingAFE(
            timeout_min=30, delay_sec=10, user=job_user, debug=False)
    tko = frontend_wrappers.RetryingTKO(
            timeout_min=30, delay_sec=10, user=job_user, debug=False)

    # Derive this suite job's id from its tag; fall back to None when the
    # value cannot be converted to an int.
    try:
        own_job_id = int(tko_utils.get_afe_job_id(dargs['job'].tag))
        logging.debug('Determined own job id: %d', own_job_id)
    except (TypeError, ValueError):
        own_job_id = None
        logging.warning('Could not determine own job id.')

    _perform_reimage_and_run(spec, afe, tko, suite_job_id=own_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')
529
530
def _perform_reimage_and_run(spec, afe, tko, suite_job_id=None):
    """
    Create the Suite described by spec and run it.

    @param spec: a populated _SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    """
    # The suite cannot be created before the devserver has the control_files
    # and test_suites packages, because the control files to schedule come
    # from there.  Staging is skipped when running prod code.
    if not spec.run_prod_code:
        _stage_artifacts_for_build(spec.devserver, spec.test_source_build)

    suite_kwargs = {
            'predicates': [spec.predicate],
            'name': spec.name,
            'builds': spec.builds,
            'board': spec.board,
            'devserver': spec.devserver,
            'afe': afe,
            'tko': tko,
            'pool': spec.pool,
            'results_dir': spec.job.resultdir,
            'max_runtime_mins': spec.max_runtime_mins,
            'timeout_mins': spec.timeout_mins,
            'file_bugs': spec.file_bugs,
            'suite_job_id': suite_job_id,
            'extra_deps': spec.suite_dependencies,
            'priority': spec.priority,
            'wait_for_results': spec.wait_for_results,
            'job_retry': spec.job_retry,
            'max_retries': spec.max_retries,
            'offload_failures_only': spec.offload_failures_only,
            'test_source_build': spec.test_source_build,
            'run_prod_code': spec.run_prod_code,
            'job_keyvals': spec.job_keyvals,
            'test_args': spec.test_args,
            'child_dependencies': spec.child_dependencies,
    }
    test_suite = Suite.create_from_predicates(**suite_kwargs)
    _run_suite_with_spec(test_suite, spec)
573
574
def _run_suite_with_spec(suite, spec):
    """
    Run a suite using the job, delay and bug template stored on spec.

    @param suite: _BaseSuite instance to run.
    @param spec: a populated _SuiteSpec object.
    """
    # Arguments are passed in _run_suite()'s parameter order:
    # suite, job, delay_minutes, bug_template.
    _run_suite(suite, spec.job, spec.delay_minutes, spec.bug_template)
587
588
def _run_suite(
        suite,
        job,
        delay_minutes,
        bug_template):
    """
    Record the artifact-finished time, schedule the suite and optionally wait.

    @param suite: _BaseSuite instance.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param delay_minutes: Delay the creation of test jobs for a given number
                          of minutes.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.  It is
                         accepted but not referenced in this function's body.
    """
    finished_at = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    utils.write_keyval(
        job.resultdir,
        {constants.ARTIFACT_FINISHED_TIME: finished_at})

    if delay_minutes:
        logging.debug(
            'delay_minutes is set. Sleeping %d minutes before creating test '
            'jobs.', delay_minutes)
        time.sleep(delay_minutes * 60)
        logging.debug(
            'Finished waiting for %d minutes before creating test jobs.',
            delay_minutes)

    # Scheduling is asynchronous; results are only collected by wait() below.
    suite.schedule(job.record_entry)

    if not suite.wait_for_results:
        logging.info(
            'wait_for_results is set to False, suite job will exit without '
            'waiting for test jobs to finish.')
        return

    logging.debug('Waiting on suite.')
    suite.wait(job.record_entry)
    # NOTE(review): the message below names _perform_reimage_and_run although
    # it is emitted from _run_suite; the text is kept unchanged.
    logging.debug(
        'Finished waiting on suite. Returning from _perform_reimage_and_run.')
628
629
630def _stage_artifacts_for_build(devserver, build):
631    """Stage artifacts for a suite job.
632
633    @param devserver: devserver to stage artifacts with.
634    @param build: image to stage artifacts for.
635    """
636    try:
637        devserver.stage_artifacts(
638                image=build,
639                artifacts=['control_files', 'test_suites'])
640    except dev_server.DevServerException as e:
641        # If we can't get the control files, there's nothing to run.
642        raise error.AsynchronousBuildFailure(e)
643
644
645# This function is used by the cros_test_platform suite, to unwrap json-decoded
646# arguments from the cros_test_platform recipe and convert them to byte string.
647#
648# It should not be used for other purposes. It exists in this module simply
649# to limit the number of necessary module imports in cros_test_platform.
def byteify(input_):
    """Return a copy of a json-style value with text strings utf-8 encoded.

    Dicts (keys and values) and lists are rebuilt recursively; text strings
    are converted to byte strings; any other value is returned as is.

    @param input_: the decoded json value to convert.
    """
    if isinstance(input_, dict):
        return dict((byteify(name), byteify(item))
                    for name, item in six.iteritems(input_))
    if isinstance(input_, list):
        return list(map(byteify, input_))
    if isinstance(input_, six.text_type):
        return six.ensure_binary(input_, 'utf-8')
    return input_
661