# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


import json
import logging
import os
import re
import shlex
import shutil
import sys
import tempfile


# TODO (b/206008069), remove this when migrated to new env
sys.path.insert(0,
                '/usr/local/lib/python2.7/dist-packages/six-1.16.0-py2.7.egg')
try:
    # This is weird. But it seems something is bringing in six earlier.
    # Going to force a reload after the egg is inserted so that the pinned
    # 1.16.0 version wins over whatever was imported first.
    import six
    if six.PY2:
        reload(six)
    else:
        import importlib
        importlib.reload(six)
    logging.debug("six version is {}".format(six.__version__))
    if six.__version__ != '1.16.0':
        logging.debug(sys.path)
except ImportError as e:
    logging.warning("Could not import six due to %s", e)

from contextlib import contextmanager

from autotest_lib.client.common_lib import error
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.server.cros.crosperf import device_setup_utils
from autotest_lib.site_utils import test_runner_utils

WAIT_FOR_CMD_TIMEOUT_SECS = 60
DUT_COMMON_SSH_OPTIONS = [
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
]
DUT_SCP_OPTIONS = ' '.join(DUT_COMMON_SSH_OPTIONS)

RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src'

TURBOSTAT_LOG = 'turbostat.log'
CPUSTATS_LOG = 'cpustats.log'
CPUINFO_LOG = 'cpuinfo.log'
TOP_LOG = 'top.log'
WAIT_TIME_LOG = 'wait_time.log'

# Result Statuses
SUCCESS_STATUS = 'SUCCESS'
WARNING_STATUS = 'WARNING'
FAILED_STATUS = 'FAILED'

# Regex for the RESULT output lines understood by chrome buildbot.
# TODO b:169251326 terms below are set outside of this codebase and
# should be updated when possible ("slave"). # nocheck
# Keep in sync with
# chromium/tools/build/scripts/slave/performance_log_processor.py. # nocheck
RESULTS_REGEX = re.compile(r'(?P<IMPORTANT>\*)?RESULT '
                           r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                           r'(?P<VALUE>[\{\[]?[-\d\., ]+[\}\]]?)('
                           r' ?(?P<UNITS>.+))?')
HISTOGRAM_REGEX = re.compile(r'(?P<IMPORTANT>\*)?HISTOGRAM '
                             r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                             r'(?P<VALUE_JSON>{.*})(?P<UNITS>.+)?')


class telemetry_Crosperf(test.test):
    """
    Run one or more telemetry benchmarks under the crosperf script.

    """
    version = 1

    def scp_telemetry_results(self, client_ip, dut, file_path, host_dir,
                              ignore_status=False):
        """
        Copy telemetry results from dut.

        @param client_ip: The ip address of the DUT
        @param dut: The autotest host object representing DUT.
        @param file_path: The file to copy from DUT.
        @param host_dir: The directory on host to put the file .
        @param ignore_status: If True, a non-zero scp exit status is
                returned instead of raising.

        @returns status code for scp command.

        """
        cmd = ['scp', DUT_SCP_OPTIONS, RSA_KEY, '-v']
        port = ''

        if dut:
            port = dut.port
            ip = dut.hostname
        else:
            # client_ip may carry an explicit ssh port as "ip:port".
            ip_and_port = client_ip.split(':')
            ip = ip_and_port[0]
            if len(ip_and_port) > 1:
                port = ip_and_port[1]

        if port:
            cmd.extend(['-P', str(port)])

        src = 'root@%s:%s' % (ip, file_path)
        cmd.extend([src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Results: %s', command)
        try:
            result = utils.run(
                    command,
                    timeout=WAIT_FOR_CMD_TIMEOUT_SECS,
                    ignore_status=ignore_status)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        if exit_code:
            logging.error('Command "%s" returned non-zero status: %d', command,
                          exit_code)
        return exit_code

    @contextmanager
    def no_background(self, *_args):
        """
        Background stub.

        No-op context manager used when a background collector is disabled.

        """
        yield

    @contextmanager
    def run_in_background_with_log(self, cmd, dut, log_path):
        """
        Get cpustats periodically in background.

        Starts |cmd| on |dut| in the background, yields to the caller, then
        kills the whole process group and copies |log_path| back to the
        results directory.

        NOTE: No error handling, exception or error report if command fails
        to run in background. Command failure is silenced.

        """
        logging.info('Running in background:\n%s', cmd)
        pid = dut.run_background(cmd)
        try:
            # TODO(denik http://crbug.com/966514): replace with more reliable
            # way to check run success/failure in background.
            # Wait some time before checking the process.
            check = dut.run('sleep 5; kill -0 %s' % pid, ignore_status=True)
            if check.exit_status != 0:
                # command wasn't started correctly
                logging.error(
                        "Background command wasn't started correctly.\n"
                        'Command:\n%s', cmd)
                # Clear pid so the finally-block skips the kill/scp steps.
                pid = ''
                yield
                return

            logging.info('Background process started successfully, pid %s',
                         pid)
            yield

        finally:
            if pid:
                # Stop background processes.
                logging.info('Killing background process, pid %s', pid)
                # Kill the process blindly. OK if it's already gone.
                # There is an issue when underlying child processes stay alive
                # while the parent main process is killed.
                # The solution is to kill the chain of processes via process
                # group id.
                dut.run('pgid=$(cat /proc/%s/stat | cut -d")" -f2 | '
                        'cut -d" " -f4) && kill -- -$pgid 2>/dev/null' % pid,
                        ignore_status=True)

                # Copy the results to results directory with silenced failure.
                scp_res = self.scp_telemetry_results(
                        '', dut, log_path, self.resultsdir, ignore_status=True)
                if scp_res:
                    logging.error(
                            'scp of cpuinfo logs failed '
                            'with error %d.', scp_res)

    def run_cpustats_in_background(self, dut, log_name):
        """
        Run command to collect CPU stats in background.

        @param dut: The autotest host object representing DUT.
        @param log_name: Name of the log file created in /tmp on the DUT.

        @returns a context manager wrapping the background collection.

        """
        log_path = '/tmp/%s' % log_name
        cpu_stats_cmd = (
                'cpulog=%s; '
                'rm -f ${cpulog}; '
                # Stop after 720*0.5min=6hours if not killed at clean-up phase.
                'for i in {1..720} ; do '
                # Collect current CPU frequency on all cores and thermal data.
                ' paste -d" " '
                ' <(ls /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                ' <(cat /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                ' >> ${cpulog} || break; '  # exit loop if fails.
                ' paste -d" "'
                ' <(cat /sys/class/thermal/thermal_zone*/type)'
                ' <(cat /sys/class/thermal/thermal_zone*/temp)'
                # Filter in thermal data from only CPU-related sources.
                ' | grep -iE "soc|cpu|pkg|big|little" >> ${cpulog} || break; '
                # Repeat every 30 sec.
                ' sleep 30; '
                'done;') % log_path

        return self.run_in_background_with_log(cpu_stats_cmd, dut, log_path)

    def run_top_in_background(self, dut, log_name, interval_in_sec):
        """
        Run top in background.

        @param dut: The autotest host object representing DUT.
        @param log_name: Name of the log file created in /tmp on the DUT.
        @param interval_in_sec: Refresh interval passed to top -d.

        @returns a context manager wrapping the background collection.

        """
        log_path = '/tmp/%s' % log_name
        top_cmd = (
                # Run top in batch mode with specified interval and filter out
                # top system summary and processes not consuming %CPU.
                # Output of each iteration is separated by a blank line.
                'HOME=/usr/local COLUMNS=128 top -bi -d%.1f'
                ' | grep -E "^[ 0-9]|^$" > %s;') % (interval_in_sec, log_path)

        return self.run_in_background_with_log(top_cmd, dut, log_path)

    def run_turbostat_in_background(self, dut, log_name):
        """
        Run turbostat in background.

        @param dut: The autotest host object representing DUT.
        @param log_name: Name of the log file created in /tmp on the DUT.

        @returns a context manager wrapping the background collection.

        """
        log_path = '/tmp/%s' % log_name
        turbostat_cmd = (
                'nohup turbostat --quiet --interval 10 '
                '--show=CPU,Bzy_MHz,Avg_MHz,TSC_MHz,Busy%%,IRQ,CoreTmp '
                '1> %s') % log_path

        return self.run_in_background_with_log(turbostat_cmd, dut, log_path)

    def run_cpuinfo(self, dut, log_name):
        """
        Collect CPU info of "dut" into "log_name" file.

        Gathers per-core cpufreq settings, pstate/online state and thermal
        zone readings in a single remote shell invocation.

        """
        cpuinfo_cmd = (
                'for cpunum in '
                " $(awk '/^processor/ { print $NF ; }' /proc/cpuinfo ) ; do "
                ' for i in'
                ' `ls -d /sys/devices/system/cpu/cpu"${cpunum}"/cpufreq/'
                '{cpuinfo_cur_freq,scaling_*_freq,scaling_governor} '
                ' 2>/dev/null` ; do '
                '  paste -d" " <(echo "${i}") <(cat "${i}"); '
                ' done;'
                'done;'
                'for cpudata in'
                '  /sys/devices/system/cpu/intel_pstate/no_turbo'
                '  /sys/devices/system/cpu/online; do '
                ' if [[ -e "${cpudata}" ]] ; then '
                '   paste <(echo "${cpudata}") <(cat "${cpudata}"); '
                ' fi; '
                'done; '
                # Adding thermal data to the log.
                'paste -d" "'
                ' <(cat /sys/class/thermal/thermal_zone*/type)'
                ' <(cat /sys/class/thermal/thermal_zone*/temp)')

        # Get CPUInfo at the end of the test.
        logging.info('Get cpuinfo: %s', cpuinfo_cmd)
        with open(os.path.join(self.resultsdir, log_name),
                  'w') as cpu_log_file:
            # Handle command failure gracefully.
            res = dut.run(
                    cpuinfo_cmd, stdout_tee=cpu_log_file, ignore_status=True)
            if res.exit_status:
                logging.error('Get cpuinfo command failed with %d.',
                              res.exit_status)

    def run_once(self, args, client_ip='', dut=None):
        """
        Run a single telemetry test.

        @param args: A dictionary of the arguments that were passed
                to this test.
        @param client_ip: The ip address of the DUT
        @param dut: The autotest host object representing DUT.

        @returns A TelemetryResult instance with the results of this execution.
        """
        test_name = args.get('test', '')
        test_args = args.get('test_args', '')
        profiler_args = args.get('profiler_args', '')

        dut_config_str = args.get('dut_config', '{}')
        dut_config = json.loads(dut_config_str)
        # Setup device with dut_config arguments before running test
        if dut_config:
            wait_time = device_setup_utils.setup_device(dut, dut_config)
            # Wait time can be used to accumulate cooldown time in Crosperf.
            with open(os.path.join(self.resultsdir, WAIT_TIME_LOG), 'w') as f:
                f.write(str(wait_time))

        output_format = 'histograms'

        # For local runs, we set local=True and use local chrome source to run
        # tests; for lab runs, we use the drone instead.
        # By default to be True.
        local = args.get('local', 'true').lower() == 'true'

        # If run_local=true, telemetry benchmark will run on DUT, otherwise
        # run remotely from host.
        # By default to be False.
        # TODO(zhizhouy): It is better to change the field name from
        # "run_local" to "telemetry_on_dut" in crosperf experiment files for
        # consistency.
        telemetry_on_dut = args.get('run_local', '').lower() == 'true'

        # Run the test. And collect profile if needed.
        profiler_opts = []
        try:
            # If profiler_args specified, we want to add several more options
            # to the command so that run_benchmark will collect system wide
            # profiles.
            if profiler_args:
                profiler_opts = [
                        '--interval-profiling-period=story_run',
                        '--interval-profiling-target=system_wide',
                        '--interval-profiler-options="%s"' % profiler_args
                ]

            top_interval = dut_config.get('top_interval', 0)
            turbostat = dut_config.get('turbostat')

            # Each collector degrades to the no-op stub when it cannot or
            # should not run (no DUT object, or feature not requested).
            run_cpustats = self.run_cpustats_in_background if dut \
                else self.no_background
            run_turbostat = self.run_turbostat_in_background if (
                    dut and turbostat) else self.no_background
            run_top = self.run_top_in_background if (
                    dut and top_interval > 0) else self.no_background

            # FIXME(denik): replace with ExitStack.
            with run_cpustats(dut, CPUSTATS_LOG) as _cpu_cm, \
                run_turbostat(dut, TURBOSTAT_LOG) as _turbo_cm, \
                run_top(dut, TOP_LOG, top_interval) as _top_cm:

                arguments = []
                if test_args:
                    arguments.extend(shlex.split(test_args))
                if profiler_args:
                    arguments.extend(profiler_opts)
                logging.debug('Telemetry Arguments: %s', arguments)
                perf_value_writer = self
                artifacts = bool(profiler_args)
                with telemetry_runner.TelemetryRunnerFactory().get_runner(
                        dut, local=local,
                        telemetry_on_dut=telemetry_on_dut) as tr:
                    result = tr.run_telemetry_benchmark(
                            test_name,
                            perf_value_writer,
                            *arguments,
                            ex_output_format=output_format,
                            results_dir=self.resultsdir,
                            no_verbose=True,
                            artifacts=artifacts)
                    logging.info('Telemetry completed with exit status: %s.',
                                 result.status)
                    logging.info('output: %s\n', result.output)

        except (error.TestFail, error.TestWarn):
            logging.debug(
                    'Test did not succeed while executing telemetry test.')
            raise
        except Exception:
            logging.debug('Unexpected failure on telemetry_Crosperf.')
            raise
        finally:
            # Always snapshot final CPU state, even on failure.
            if dut:
                self.run_cpuinfo(dut, CPUINFO_LOG)

        # Checking whether result file exists.
        filepath = os.path.join(self.resultsdir, 'histograms.json')
        if not os.path.exists(filepath):
            raise RuntimeError('Missing results file: %s' % filepath)

        # Copy the perf data file into the test_that profiling directory,
        # if necessary. It always comes from DUT.
        if profiler_args:
            filepath = os.path.join(self.resultsdir, 'artifacts')
            if not os.path.isabs(filepath):
                raise RuntimeError('Expected absolute path of '
                                   'artifacts: %s' % filepath)
            perf_exist = False
            for root, _dirs, files in os.walk(filepath):
                for f in files:
                    if f.endswith('.perf.data'):
                        perf_exist = True
                        src_file = os.path.join(root, f)
                        # results-cache.py in crosperf supports multiple
                        # perf.data files, but only if they are named exactly
                        # so. Therefore, create a subdir for each perf.data
                        # file.
                        # Use mkdtemp to make sure a directory name is unique.
                        dst_dir = tempfile.mkdtemp(dir=self.profdir,
                                                   prefix=''.join(
                                                           f.split('.')[:-2]))
                        dst_file = os.path.join(dst_dir, 'perf.data')
                        shutil.copyfile(src_file, dst_file)
            if not perf_exist:
                raise error.TestFail('Error: No profiles collected, test may '
                                     'not run correctly.')

        # In skylab run, environment variable SYNCHRONOUS_OFFLOAD_DIR is set
        # to be a directory; In test_that run, it will not be set.
        synchronous_dir = os.getenv('SYNCHRONOUS_OFFLOAD_DIR', '')
        # For skylab run, push logs and json results to the synchronous
        # offload directory for instant access for report.
        if synchronous_dir and os.path.isdir(synchronous_dir):
            try:
                dst = os.path.join(synchronous_dir, 'results')
                shutil.copytree(self.resultsdir, dst)
            except Exception as e:
                logging.error('Results copy failed: %s', e)
                raise RuntimeError(
                        'Failed to copy results in %s to synchronous offload'
                        ' directory %s' % (self.resultsdir, synchronous_dir))

        return result