# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Test to generate the AFDO profile for a set of ChromeOS benchmarks.

This will run a pre-determined set of benchmarks on the DUT under
the monitoring of the linux "perf" tool. The resulting perf.data
file will then be copied to Google Storage (GS) where it can be
used by the AFDO optimized build.

Given that the telemetry benchmarks are quite unstable on ChromeOS at
this point, this test also supports a mode where the benchmarks are
executed outside of the telemetry framework. It is not the same as
executing the benchmarks under telemetry because there is no telemetry
measurement taken but, for the purposes of profiling Chrome, it should
be pretty close.

Example invocation:
/usr/bin/test_that --debug --board=lumpy <DUT IP>
  --args="ignore_failures=True local=True gs_test_location=True"
  telemetry_AFDOGenerate
"""


import bz2
import logging
import os
import sys
import time

# TODO(b/206008069): remove this when migrated to the new environment.
sys.path.insert(0,
                '/usr/local/lib/python2.7/dist-packages/six-1.16.0-py2.7.egg')
try:
    # This is odd, but something seems to bring in six earlier, so force
    # a reload after the egg is inserted.
    import six
    if six.PY2:
        reload(six)
    else:
        import importlib
        importlib.reload(six)
    logging.debug("six version is {}".format(six.__version__))
    if six.__version__ != '1.16.0':
        logging.debug(sys.path)
except ImportError as e:
    logging.warning("Could not import six due to %s", e)

from contextlib import contextmanager

from autotest_lib.client.common_lib import error
from autotest_lib.server import autotest
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import filesystem_util
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.site_utils import test_runner_utils

# These are arguments to the linux "perf" tool.
# The -e value is processor specific and comes from the Intel SDM vol 3b.
# TODO(b:229298221): Revert to -c 50000 when fixed.
PROFILER_ARGS = 'record -a -e r20c4 -c 200003 -b'

# In practice, it takes >2min to copy the perf.data back from the DUT, so set
# this timeout to 600 secs to be safe.
WAIT_FOR_CMD_TIMEOUT_SECS = 600

# Reuse ssh and scp settings from telemetry_Crosperf
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_SCP_OPTIONS = ' '.join([
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
])
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf'
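
# For reference, with the constants above, perf_on_dut() below launches
# roughly the following command on the DUT (a sketch, not a literal log line):
#   nohup perf record -a -e r20c4 -c 200003 -b \
#       -o /usr/local/telemetry/src/tools/perf/perf.data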

_WAIT_CMD_TEMPLATE = """\
for _ in {1..%(timeout)d}; do \
  ps %(pid)d >/dev/null || break; \
  sleep 1; \
done; \
! ps %(pid)d >/dev/null \
"""
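
# For example, substituting pid=1234 and timeout=600 yields a single shell
# command equivalent to (illustrative values):
#   for _ in {1..600}; do ps 1234 >/dev/null || break; sleep 1; done; \
#   ! ps 1234 >/dev/null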


def _wait_for_process(host, pid, timeout=-1):
    """Waits for a process on the DUT to terminate.

    @param host: A host object representing the DUT.
    @param pid: The process ID (integer).
    @param timeout: Maximum number of seconds to wait for the process to exit.
    @returns The exit status of the wait command: 0 if the process exited
             within the timeout, non-zero otherwise.
    """
    wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout}
    return host.run(wait_cmd, ignore_status=True).exit_status


# List of benchmarks to run to capture profile information. This is
# based on the "superhero" list and other telemetry benchmarks. The goal is
# to have a short list that is as representative as possible and takes a
# short time to execute. At this point the list of benchmarks is in flux.
TELEMETRY_AFDO_BENCHMARKS = (
        # page_cycler tests are deprecated. Replace them with loading.desktop.
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=typical')),
        # TODO(b:229298221): Re-enable when fixed.
        # ('loading.desktop', ('--pageset-repeat=1',
        #                      '--story-tag-filter=intl_ja_zh')),
        ('rendering.desktop', ('--pageset-repeat=1',
                               '--story-tag-filter=tough_canvas')),
        ('octane', ),
        ('kraken', ),
        ('speedometer2', ),
)
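
# Each entry above is either (benchmark_name,) or
# (benchmark_name, (extra_arg, ...)); run_once() unpacks the optional argument
# tuple and forwards it to run_telemetry_benchmark().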

# Temporarily disable this benchmark because it is failing a
# lot. Filed chromium:590127.
# ('smoothness.tough_webgl_cases',)

# Some benchmarks removed from the profile set:
# 'page_cycler.morejs' -> uninteresting, seems to fail frequently.
# 'page_cycler.moz' -> seems very old.
# 'media.tough_video_cases' -> removed because it does not bring
#                              any benefit and takes more than 12 mins.

# List of boards where this test can be run.  Currently, it needs
# machines with at least 4GB of memory or 2GB of /tmp.
# This must be consistent with chromite.
GCC_BOARDS = ['lumpy']

# Should be disjoint with GCC_BOARDS.
LLVM_BOARDS = ['chell']

# FIXME(tcwang): only used for testing Async AFDO generation builders.
# Remove this after testing is done.
# Due to crbug.com/991299 and crbug.com/992539, AFDO profiles generated
# by samus are not suitable for production on either main or branch.
# So samus is suitable for testing profile generation, but the resulting
# profiles should not actually be used.
LLVM_BOARDS_ASYNC = ['samus']


class telemetry_AFDOGenerate(test.test):
    """
    Run one or more telemetry benchmarks under the "perf" monitoring
    tool, generate a "perf.data" file and upload it to GS for consumption
    by the AFDO optimized build.
    """
    version = 1

    def scp_perf_data(self, dut, host_dir):
        """Copy perf data from the DUT.

        @param dut: The autotest host object representing the DUT.
        @param host_dir: The directory on the host where the file is placed.

        @returns status code of the scp command.
        """
        cmd = []
        src = ('root@%s:%s/%s' % (dut.hostname, DUT_CHROME_RESULTS_DIR,
                                  'perf.data'))
        cmd.extend([
                'scp', DUT_SCP_OPTIONS, RSA_KEY,
                '-P %s' % str(dut.port) if dut.port else '', '-v', src,
                host_dir
        ])
        command = ' '.join(cmd)
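        # The assembled command looks roughly like (a sketch; values vary):
        #   scp <DUT_SCP_OPTIONS> -i <test key> -P <port> -v \
        #       root@<dut>:<DUT_CHROME_RESULTS_DIR>/perf.data <host_dir>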

        logging.debug('Retrieving Perf Data: %s', command)
        try:
            result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        logging.debug('command return value: %d', exit_code)
        return exit_code

    @contextmanager
    def perf_on_dut(self):
        """Profile with perf on the DUT.

        Starts perf in the background on the DUT, stops it when the context
        exits, and then copies the resulting perf.data file back to the host.
        """
        logging.info('Starting perf process in background.')
        perf_cmd = 'nohup perf %s -o %s/perf.data' \
                    % (PROFILER_ARGS, DUT_CHROME_RESULTS_DIR)
        perf_pid = self._host.run_background(perf_cmd)

        try:
            # Use `kill -0` to check whether the perf process is alive.
            verify_cmd = 'kill -0 %s' % perf_pid
            if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process not started correctly on DUT')
                raise RuntimeError
            logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
            yield
        finally:
            # Check if the process is still alive after the benchmark run;
            # if so, kill it with -2 (SIGINT).
            kill_cmd = 'kill -0 %s && killall -2 perf' % perf_pid
            if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process is not killed correctly on DUT.')
                raise RuntimeError
            # The perf process may not terminate right after the kill command;
            # wait until it finishes.
            status = _wait_for_process(self._host, int(perf_pid),
                                       WAIT_FOR_CMD_TIMEOUT_SECS)
            if status != 0:
                logging.error('Error waiting for perf process to be killed.')
                raise RuntimeError
            logging.info('Perf has been killed on DUT.')

        status = self.scp_perf_data(self._host, self.profdir)
        if status != 0:
            logging.error('Cannot copy perf.data file to host.')
            raise RuntimeError

    def run_once(self, host, args):
        """Run a set of telemetry benchmarks.

        @param host: Host machine where the test is run.
        @param args: A dictionary of the arguments that were passed
                to this test.
        @returns None.
        """
        self._host = host
        host_board = host.get_board().split(':')[1]

        if not (host_board in LLVM_BOARDS or host_board in GCC_BOARDS
                or host_board in LLVM_BOARDS_ASYNC):
            raise error.TestFail(
                    'This test cannot be run on board %s' % host_board)

        self._parse_args(args)

        # Remove write protection on the host now; otherwise the telemetry
        # code will try to remove it during run_benchmark, which causes the
        # machine to reboot and remount. We want to avoid that.
        filesystem_util.make_rootfs_writable(self._host)

        with self.perf_on_dut():
            if self._minimal_telemetry:
                self._run_tests_minimal_telemetry()
            else:
                with telemetry_runner.TelemetryRunnerFactory().get_runner(
                        self._host, self._local, telemetry_on_dut=False) as tr:
                    for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
                        benchmark = benchmark_info[0]
                        args = (() if len(benchmark_info) == 1
                                else benchmark_info[1])
                        try:
                            self._run_test_with_retry(tr, benchmark, *args)
                        except error.TestBaseException:
                            if not self._ignore_failures:
                                raise
                            logging.info('Ignoring failure from benchmark %s.',
                                         benchmark)

    def after_run_once(self):
        """After the profile information has been collected, compress it
        and upload it to GS.
        """
        PERF_FILE = 'perf.data'
        COMP_PERF_FILE = 'chromeos-chrome-%s-%s.perf.data'
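        # The uploaded name is the pattern above filled in with the arch and
        # Chrome version, plus the .bz2 suffix added by _compress_file(),
        # e.g. (illustrative): chromeos-chrome-amd64-<version>.perf.data.bz2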
        perf_data = os.path.join(self.profdir, PERF_FILE)
        comp_data = os.path.join(self.profdir,
                                 COMP_PERF_FILE % (self._arch, self._version))
        compressed = self._compress_file(perf_data, comp_data)
        self._gs_upload(compressed, os.path.basename(compressed))

        # Also create a copy of this file using "LATEST" as the version so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is ok to use a slightly old
        # version of this file for the optimized build.
        latest_data = COMP_PERF_FILE % (self._arch, 'LATEST')
        latest_compressed = self._get_compressed_name(latest_data)
        self._gs_upload(compressed, latest_compressed)

        # Remove the local files so they are not uploaded along with the logs.
        os.remove(compressed)
        os.remove(perf_data)

    def _parse_args(self, args):
        """Parses input arguments to this autotest.

        @param args: Options->values dictionary.
        @raises error.TestFail if a bad option is passed.
        """

        # Set default values for the options.
        # Architecture for which we are collecting afdo data.
        self._arch = 'amd64'
        # Use an alternate GS location where everyone can write.
        # The default depends on whether this is executing in
        # the lab environment or not.
        self._gs_test_location = not utils.host_is_in_lab_zone(
                self._host.hostname)
        # Ignore individual test failures.
        self._ignore_failures = False
        # Use the local copy of telemetry instead of the dev server copy.
        self._local = False
        # Chrome version to which the AFDO data corresponds.
        self._version, _ = self._host.get_chrome_version()
        # Use only the minimal support from Telemetry, bypassing the
        # Telemetry benchmark framework (which has been flaky on ChromeOS).
        self._minimal_telemetry = False

        # Ignored servo arguments.
        ignored_options = ('servo_host', 'servo_port')

        for option_name, value in args.items():
            if option_name == 'arch':
                self._arch = value
            elif option_name == 'gs_test_location':
                self._gs_test_location = (value == 'True')
            elif option_name == 'ignore_failures':
                self._ignore_failures = (value == 'True')
            elif option_name == 'local':
                self._local = (value == 'True')
            elif option_name == 'minimal_telemetry':
                self._minimal_telemetry = (value == 'True')
            elif option_name == 'version':
                self._version = value
            elif option_name in ignored_options:
                continue
            else:
                raise error.TestFail('Unknown option passed: %s' % option_name)

    def _run_test(self, tr, benchmark, *args):
        """Run the benchmark using Telemetry.

        @param tr: Instance of the TelemetryRunner subclass.
        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                     script.
        @raises error.TestFail if execution of the test failed. Also
                re-raises any exceptions thrown by run_telemetry_benchmark.
        """
        try:
            logging.info('Starting run for Telemetry benchmark %s', benchmark)
            start_time = time.time()
            result = tr.run_telemetry_benchmark(benchmark, None, *args)
            end_time = time.time()
            logging.info('Completed Telemetry benchmark %s in %f seconds',
                         benchmark, end_time - start_time)
        except error.TestBaseException as e:
            end_time = time.time()
            logging.info(
                    'Got exception from Telemetry benchmark %s '
                    'after %f seconds. Exception: %s', benchmark,
                    end_time - start_time, str(e))
            raise

        # We don't generate any keyvals for this run. This is not
        # an official run of the benchmark. We are just running it to get
        # a profile from it.

        if result.status is telemetry_runner.SUCCESS_STATUS:
            logging.info('Benchmark %s succeeded', benchmark)
        else:
            raise error.TestFail('An error occurred while executing'
                                 ' benchmark: %s' % benchmark)

    def _run_test_with_retry(self, tr, benchmark, *args):
        """Run the benchmark using Telemetry. Retry once in case of failure.

        @param tr: An instance of the TelemetryRunner subclass.
        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                     script.
        @raises Re-raises any exceptions thrown by _run_test.
        """

        tried = False
        while True:
            try:
                self._run_test(tr, benchmark, *args)
                logging.info('Benchmark %s succeeded on %s try', benchmark,
                             'first' if not tried else 'second')
                break
            except error.TestBaseException:
                if not tried:
                    tried = True
                    logging.info('Benchmark %s failed. Retrying ...',
                                 benchmark)
                else:
                    logging.info('Benchmark %s failed twice. Not retrying.',
                                 benchmark)
                    raise

    def _run_tests_minimal_telemetry(self):
        """Run the benchmarks using the minimal support from Telemetry.

        The benchmarks are run using a client-side autotest test. That test
        controls Chrome directly through the chrome.Chrome support and asks
        Chrome to display the benchmark pages directly instead of using the
        "page sets" and "measurements" support from Telemetry. In this way
        we avoid the Telemetry benchmark support, which is not stable on
        ChromeOS yet.
        """
        AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'

        # Execute the client side test.
        client_at = autotest.Autotest(self._host)
        client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')

    @staticmethod
    def _get_compressed_name(name):
        """Given a file name, return the bz2-compressed name.

        @param name: Name of the uncompressed file.
        @returns Name of the compressed file.
        """
        return name + '.bz2'

    @staticmethod
    def _compress_file(unc_file, com_file):
        """Compresses the specified file with bz2.

        @param unc_file: name of the file to compress.
        @param com_file: prefix name of the compressed file.
        @raises error.TestFail if compression failed.
        @returns Name of the compressed file.
        """
        dest = ''
        with open(unc_file, 'rb') as inp:
            dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
            with bz2.BZ2File(dest, 'wb') as out:
                for data in inp:
                    out.write(data)
        if not dest or not os.path.isfile(dest):
            raise error.TestFail('Could not compress %s' % unc_file)
        return dest

    def _gs_upload(self, local_file, remote_basename):
        """Uploads a file to a board-specific Google Storage location.

        @param local_file: name of the file to upload.
        @param remote_basename: basename of the remote file.
        @raises error.TestFail if the upload failed.
        @returns nothing.
        """
        GS_GCC_DEST = 'gs://chromeos-prebuilt/afdo-job/canonicals/%s'
        GS_LLVM_DEST = 'gs://chromeos-toolchain-artifacts/afdo/unvetted/benchmark/%s'
        GS_LLVM_ASYNC_DEST = \
            'gs://chromeos-throw-away-bucket/afdo-job/llvm/benchmarks/%s'
        GS_TEST_DEST = 'gs://chromeos-throw-away-bucket/afdo-job/canonicals/%s'
        GS_ACL = 'project-private'
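        # For reference, the final remote_file below is one of the destination
        # patterns with the basename filled in, e.g. (illustrative):
        #   gs://chromeos-prebuilt/afdo-job/canonicals/chromeos-chrome-amd64-LATEST.perf.data.bz2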

        board = self._host.get_board().split(':')[1]

        if self._gs_test_location:
            gs_dest = GS_TEST_DEST
        elif board in GCC_BOARDS:
            gs_dest = GS_GCC_DEST
        elif board in LLVM_BOARDS:
            gs_dest = GS_LLVM_DEST
        elif board in LLVM_BOARDS_ASYNC:
            gs_dest = GS_LLVM_ASYNC_DEST
            GS_ACL = 'public-read'
        else:
            raise error.TestFail('This test cannot be run on board %s' % board)

        remote_file = gs_dest % remote_basename

        logging.info('About to upload to GS: %s', remote_file)
        if not utils.gs_upload(
                local_file, remote_file, GS_ACL, result_dir=self.resultsdir):
            logging.info('Failed upload to GS: %s', remote_file)
            raise error.TestFail(
                    'Unable to gs upload %s to %s' % (local_file, remote_file))

        logging.info('Successful upload to GS: %s', remote_file)