# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Test to generate the AFDO profile for a set of ChromeOS benchmarks.

This will run a pre-determined set of benchmarks on the DUT under
the monitoring of the linux "perf" tool. The resulting perf.data
file will then be copied to Google Storage (GS) where it can be
used by the AFDO optimized build.

Given that the telemetry benchmarks are quite unstable on ChromeOS at
this point, this test also supports a mode where the benchmarks are
executed outside of the telemetry framework. It is not the same as
executing the benchmarks under telemetry because no telemetry
measurements are taken, but, for the purposes of profiling Chrome, it
should be pretty close.

Example invocation:
/usr/bin/test_that --debug --board=lumpy <DUT IP>
  --args="ignore_failures=True local=True gs_test_location=True"
  telemetry_AFDOGenerate
"""


import bz2
import logging
import os
import sys
import time

# TODO(b/206008069): remove this when migrated to the new environment.
sys.path.insert(0,
                '/usr/local/lib/python2.7/dist-packages/six-1.16.0-py2.7.egg')
try:
    # This is weird, but it seems something is bringing in six earlier.
    # Force a reload after the egg is inserted.
    import six
    if six.PY2:
        reload(six)
    else:
        import importlib
        importlib.reload(six)
    logging.debug("six version is {}".format(six.__version__))
    if six.__version__ != '1.16.0':
        logging.debug(sys.path)
except ImportError as e:
    logging.warning("Could not import six due to %s", e)

from contextlib import contextmanager

from autotest_lib.client.common_lib import error
from autotest_lib.server import autotest
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import filesystem_util
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.site_utils import test_runner_utils

# These are arguments to the linux "perf" tool.
# The -e value is processor specific and comes from the Intel SDM vol. 3B.
# TODO(b:229298221): Revert to -c 50000 when fixed.
PROFILER_ARGS = 'record -a -e r20c4 -c 200003 -b'

# In practice, it takes >2min to copy the perf.data back from the DUT, so set
# this timeout to 600 secs to be safe.
WAIT_FOR_CMD_TIMEOUT_SECS = 600

# Reuse ssh and scp settings from telemetry_Crosperf.
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_SCP_OPTIONS = ' '.join([
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
])
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf'

_WAIT_CMD_TEMPLATE = """\
for _ in {1..%(timeout)d}; do \
    ps %(pid)d >/dev/null || break; \
    sleep 1; \
done; \
! ps %(pid)d >/dev/null \
"""


def _wait_for_process(host, pid, timeout=-1):
    """Waits for a process on the DUT to terminate.

    @param host: A host object representing the DUT.
    @param pid: The process ID (integer).
    @param timeout: Number of seconds to wait; default is wait forever.
    """
    wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout}
    return host.run(wait_cmd, ignore_status=True).exit_status
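
# For illustration, with pid=1234 and timeout=600 (example values), the
# template above renders to roughly this single shell command:
#
#   for _ in {1..600}; do ps 1234 >/dev/null || break; sleep 1; done; \
#   ! ps 1234 >/dev/null
#
# The trailing negated `ps` makes the command exit 0 only once the process
# is gone, so _wait_for_process() returns 0 on success and non-zero if the
# process is still running when the timeout expires.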


# List of benchmarks to run to capture profile information. This is
# based on the "superhero" list and other telemetry benchmarks. The goal is
# to have a short list that is as representative as possible and takes a
# short time to execute. At this point the list of benchmarks is in flux.
TELEMETRY_AFDO_BENCHMARKS = (
        # page_cycler tests are deprecated. Replace them with loading.desktop.
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=typical')),
        # TODO(b:229298221): Re-enable when fixed.
        # ('loading.desktop', ('--pageset-repeat=1',
        #                      '--story-tag-filter=intl_ja_zh')),
        ('rendering.desktop', ('--pageset-repeat=1',
                               '--story-tag-filter=tough_canvas')),
        ('octane', ),
        ('kraken', ),
        ('speedometer2', ),
)

# Temporarily disabled because it was failing a lot (filed as chromium:590127):
# ('smoothness.tough_webgl_cases', )

# Some benchmarks removed from the profile set:
# 'page_cycler.morejs' -> uninteresting, seems to fail frequently.
# 'page_cycler.moz' -> seems very old.
# 'media.tough_video_cases' -> removed because it does not bring any
#     benefit and takes more than 12 mins.

# List of boards where this test can be run. Currently, it needs
# machines with at least 4GB of memory or 2GB of /tmp.
# This must be consistent with chromite.
GCC_BOARDS = ['lumpy']

# Should be disjoint with GCC_BOARDS.
LLVM_BOARDS = ['chell']

# FIXME(tcwang): only used for testing Async AFDO generation builders.
# Remove this after testing is done.
# Due to crbug.com/991299 and crbug.com/992539, AFDO profiles generated
# by samus are not suitable for production on either main or branch,
# so samus is suitable for testing profile generation but its profiles
# should not actually be used.
LLVM_BOARDS_ASYNC = ['samus']


class telemetry_AFDOGenerate(test.test):
    """
    Run one or more telemetry benchmarks under the "perf" monitoring
    tool, generate a "perf.data" file and upload it to GS for consumption
    by the AFDO optimized build.
    """
    version = 1

    def scp_perf_data(self, dut, host_dir):
        """Copy perf data from the DUT.

        @param dut: The autotest host object representing the DUT.
        @param host_dir: The directory on the host in which to put the file.

        @returns status code of the scp command.
        """
        cmd = []
        src = ('root@%s:%s/%s' % (dut.hostname, DUT_CHROME_RESULTS_DIR,
                                  'perf.data'))
        cmd.extend([
                'scp', DUT_SCP_OPTIONS, RSA_KEY,
                '-P %s' % str(dut.port) if dut.port else '', '-v', src,
                host_dir
        ])
        command = ' '.join(cmd)

        logging.debug('Retrieving Perf Data: %s', command)
        try:
            result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        logging.debug('command return value: %d', exit_code)
        return exit_code
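
    # For reference, perf_on_dut() below runs the following command on the
    # DUT (built from PROFILER_ARGS and DUT_CHROME_RESULTS_DIR above):
    #
    #   nohup perf record -a -e r20c4 -c 200003 -b \
    #       -o /usr/local/telemetry/src/tools/perf/perf.data
    #
    # Here -a samples all CPUs system-wide, -e r20c4 selects the raw Intel
    # event noted in the PROFILER_ARGS comment, -c 200003 takes one sample
    # every 200003 occurrences of that event, and -b records branch stacks
    # (LBR), which the AFDO profile generation relies on.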

    @contextmanager
    def perf_on_dut(self):
        """Start and kill the perf process on the DUT."""
        logging.info('Starting perf process in background.')
        perf_cmd = 'nohup perf %s -o %s/perf.data' \
                % (PROFILER_ARGS, DUT_CHROME_RESULTS_DIR)
        perf_pid = self._host.run_background(perf_cmd)

        try:
            # Use `kill -0` to check whether the perf process is alive.
            verify_cmd = 'kill -0 %s' % perf_pid
            if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process not started correctly on DUT.')
                raise RuntimeError
            logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
            yield
        finally:
            # Check if the perf process is still alive after the benchmark
            # run; if so, kill it with -2 (SIGINT).
            kill_cmd = 'kill -0 %s && killall -2 perf' % perf_pid
            if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process was not killed correctly on DUT.')
                raise RuntimeError
            # The perf process may not terminate right after the kill
            # command; wait until it finishes.
            status = _wait_for_process(self._host, int(perf_pid),
                                       WAIT_FOR_CMD_TIMEOUT_SECS)
            if status != 0:
                logging.error('Error waiting for perf process to be killed.')
                raise RuntimeError
            logging.info('Perf has been killed on DUT.')

        status = self.scp_perf_data(self._host, self.profdir)
        if status != 0:
            logging.error('Cannot copy perf.data file to host.')
            raise RuntimeError

    def run_once(self, host, args):
        """Run a set of telemetry benchmarks.

        @param host: Host machine where the test is run.
        @param args: A dictionary of the arguments that were passed
                to this test.
        @returns None.
        """
        self._host = host
        host_board = host.get_board().split(':')[1]

        if not (host_board in LLVM_BOARDS or host_board in GCC_BOARDS
                or host_board in LLVM_BOARDS_ASYNC):
            raise error.TestFail(
                    'This test cannot be run on board %s' % host_board)

        self._parse_args(args)

        # Remove write protection on the host now; otherwise the telemetry
        # code will try to remove it itself, which causes the machine to
        # reboot and remount during run_benchmark. We want to avoid that.
        filesystem_util.make_rootfs_writable(self._host)

        with self.perf_on_dut():
            if self._minimal_telemetry:
                self._run_tests_minimal_telemetry()
            else:
                with telemetry_runner.TelemetryRunnerFactory().get_runner(
                        self._host, self._local, telemetry_on_dut=False) as tr:
                    for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
                        benchmark = benchmark_info[0]
                        args = (() if len(benchmark_info) == 1 else
                                benchmark_info[1])
                        try:
                            self._run_test_with_retry(tr, benchmark, *args)
                        except error.TestBaseException:
                            if not self._ignore_failures:
                                raise
                            logging.info('Ignoring failure from benchmark %s.',
                                         benchmark)
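
    # The profile uploaded by after_run_once() below ends up with a name of
    # the form chromeos-chrome-<arch>-<chrome version>.perf.data.bz2, e.g.
    # chromeos-chrome-amd64-96.0.4664.45.perf.data.bz2 (the version number
    # here is made up). A second copy whose version field is the literal
    # string "LATEST" is also uploaded.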

    def after_run_once(self):
        """After the profile information has been collected, compress it
        and upload it to GS.
        """
        PERF_FILE = 'perf.data'
        COMP_PERF_FILE = 'chromeos-chrome-%s-%s.perf.data'
        perf_data = os.path.join(self.profdir, PERF_FILE)
        comp_data = os.path.join(self.profdir,
                                 COMP_PERF_FILE % (self._arch, self._version))
        compressed = self._compress_file(perf_data, comp_data)
        self._gs_upload(compressed, os.path.basename(compressed))

        # Also create a copy of this file using "LATEST" as the version, so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is OK to use a slightly older
        # version of this file for the optimized build.
        latest_data = COMP_PERF_FILE % (self._arch, 'LATEST')
        latest_compressed = self._get_compressed_name(latest_data)
        self._gs_upload(compressed, latest_compressed)

        # Remove the local files so that they are not uploaded along with
        # the logs.
        os.remove(compressed)
        os.remove(perf_data)

    def _parse_args(self, args):
        """Parses input arguments to this autotest.

        @param args: Options->values dictionary.
        @raises error.TestFail if a bad option is passed.
        """

        # Set default values for the options.
        # Architecture for which we are collecting AFDO data.
        self._arch = 'amd64'
        # Use an alternate GS location where everyone can write.
        # The default depends on whether this is executing in the lab
        # environment or not.
        self._gs_test_location = not utils.host_is_in_lab_zone(
                self._host.hostname)
        # Ignore individual test failures.
        self._ignore_failures = False
        # Use a local copy of telemetry instead of the dev server copy.
        self._local = False
        # Chrome version to which the AFDO data corresponds.
        self._version, _ = self._host.get_chrome_version()
        # Use only the minimal support from Telemetry as a fallback, since
        # the Telemetry benchmarks in ChromeOS have been flaky.
        self._minimal_telemetry = False

        # Ignored servo arguments.
        ignored_options = ('servo_host', 'servo_port')

        for option_name, value in args.items():
            if option_name == 'arch':
                self._arch = value
            elif option_name == 'gs_test_location':
                self._gs_test_location = (value == 'True')
            elif option_name == 'ignore_failures':
                self._ignore_failures = (value == 'True')
            elif option_name == 'local':
                self._local = (value == 'True')
            elif option_name == 'minimal_telemetry':
                self._minimal_telemetry = (value == 'True')
            elif option_name == 'version':
                self._version = value
            elif option_name in ignored_options:
                continue
            else:
                raise error.TestFail('Unknown option passed: %s' % option_name)
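
    # As a concrete example (matching the invocation in the module docstring),
    # passing --args="ignore_failures=True local=True gs_test_location=True"
    # to test_that makes _parse_args() set self._ignore_failures, self._local
    # and self._gs_test_location to True and leaves the remaining options at
    # their defaults.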

    def _run_test(self, tr, benchmark, *args):
        """Run the benchmark using Telemetry.

        @param tr: Instance of the TelemetryRunner subclass.
        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                script.
        @raises error.TestFail if execution of the test failed. Also
                re-raises any exceptions thrown by run_telemetry_benchmark.
        """
        try:
            logging.info('Starting run for Telemetry benchmark %s', benchmark)
            start_time = time.time()
            result = tr.run_telemetry_benchmark(benchmark, None, *args)
            end_time = time.time()
            logging.info('Completed Telemetry benchmark %s in %f seconds',
                         benchmark, end_time - start_time)
        except error.TestBaseException as e:
            end_time = time.time()
            logging.info(
                    'Got exception from Telemetry benchmark %s '
                    'after %f seconds. Exception: %s', benchmark,
                    end_time - start_time, str(e))
            raise

        # We don't generate any keyvals for this run. This is not an
        # official run of the benchmark; we are just running it to get
        # a profile from it.

        if result.status is telemetry_runner.SUCCESS_STATUS:
            logging.info('Benchmark %s succeeded', benchmark)
        else:
            raise error.TestFail('An error occurred while executing'
                                 ' benchmark: %s' % benchmark)

    def _run_test_with_retry(self, tr, benchmark, *args):
        """Run the benchmark using Telemetry. Retry once in case of failure.

        @param tr: An instance of the TelemetryRunner subclass.
        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                script.
        @raises Re-raises any exceptions thrown by _run_test.
        """

        tried = False
        while True:
            try:
                self._run_test(tr, benchmark, *args)
                logging.info('Benchmark %s succeeded on %s try', benchmark,
                             'first' if not tried else 'second')
                break
            except error.TestBaseException:
                if not tried:
                    tried = True
                    logging.info('Benchmark %s failed. Retrying ...',
                                 benchmark)
                else:
                    logging.info('Benchmark %s failed twice. Not retrying',
                                 benchmark)
                    raise

    def _run_tests_minimal_telemetry(self):
        """Run the benchmarks using the minimal support from Telemetry.

        The benchmarks are run using a client-side autotest test. That test
        controls Chrome directly through the chrome.Chrome support and asks
        Chrome to display the benchmark pages directly instead of using the
        "page sets" and "measurements" support from Telemetry. In this way
        we avoid the Telemetry benchmark support, which is not yet stable
        on ChromeOS.
        """
        AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'

        # Execute the client side test.
        client_at = autotest.Autotest(self._host)
        client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')

    @staticmethod
    def _get_compressed_name(name):
        """Given a file name, return the bz2 compressed name.

        @param name: Name of the uncompressed file.
        @returns Name of the compressed file.
        """
        return name + '.bz2'

    @staticmethod
    def _compress_file(unc_file, com_file):
        """Compresses the specified file with bz2.

        @param unc_file: name of the file to compress.
        @param com_file: prefix name of the compressed file.
        @raises error.TestFail if compression failed.
        @returns Name of the compressed file.
        """
        dest = ''
        with open(unc_file, 'rb') as inp:
            dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
            with bz2.BZ2File(dest, 'wb') as out:
                for data in inp:
                    out.write(data)
        if not dest or not os.path.isfile(dest):
            raise error.TestFail('Could not compress %s' % unc_file)
        return dest
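
    # For example, on the 'chell' board (an LLVM board) a profile named
    # chromeos-chrome-amd64-<version>.perf.data.bz2 is uploaded by
    # _gs_upload() below to
    # gs://chromeos-toolchain-artifacts/afdo/unvetted/benchmark/ with the
    # default 'project-private' ACL.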

    def _gs_upload(self, local_file, remote_basename):
        """Uploads a file to a specific Google Storage location.

        @param local_file: name of the file to upload.
        @param remote_basename: basename of the remote file.
        @raises error.TestFail if the upload failed.
        @returns nothing.
        """
        GS_GCC_DEST = 'gs://chromeos-prebuilt/afdo-job/canonicals/%s'
        GS_LLVM_DEST = 'gs://chromeos-toolchain-artifacts/afdo/unvetted/benchmark/%s'
        GS_LLVM_ASYNC_DEST = \
                'gs://chromeos-throw-away-bucket/afdo-job/llvm/benchmarks/%s'
        GS_TEST_DEST = 'gs://chromeos-throw-away-bucket/afdo-job/canonicals/%s'
        GS_ACL = 'project-private'

        board = self._host.get_board().split(':')[1]

        if self._gs_test_location:
            gs_dest = GS_TEST_DEST
        elif board in GCC_BOARDS:
            gs_dest = GS_GCC_DEST
        elif board in LLVM_BOARDS:
            gs_dest = GS_LLVM_DEST
        elif board in LLVM_BOARDS_ASYNC:
            gs_dest = GS_LLVM_ASYNC_DEST
            GS_ACL = 'public-read'
        else:
            raise error.TestFail('This test cannot be run on board %s' % board)

        remote_file = gs_dest % remote_basename

        logging.info('About to upload to GS: %s', remote_file)
        if not utils.gs_upload(
                local_file, remote_file, GS_ACL, result_dir=self.resultsdir):
            logging.info('Failed upload to GS: %s', remote_file)
            raise error.TestFail(
                    'Unable to gs upload %s to %s' % (local_file, remote_file))

        logging.info('Successful upload to GS: %s', remote_file)