xref: /aosp_15_r20/external/autotest/server/site_tests/telemetry_Crosperf/telemetry_Crosperf.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5
6import json
7import logging
8import os
9import re
10import shlex
11import shutil
12import sys
13import tempfile
14
15
# TODO (b/206008069), remove this when migrated to new env
# Force the pinned six-1.16.0 egg to the front of sys.path so it wins
# over any older six already installed on the test drone.
sys.path.insert(0,
                '/usr/local/lib/python2.7/dist-packages/six-1.16.0-py2.7.egg')
try:
    # This is weird. But it seems something is bringing in six earlier.
    # Going to force a reload after the egg is inserted.
    import six
    if six.PY2:
        # Python 2 only: reload() is a builtin there.
        reload(six)
    else:
        import importlib
        importlib.reload(six)
    logging.debug("six version is {}".format(six.__version__))
    if six.__version__ != '1.16.0':
        # The wrong six won the import race; dump sys.path for debugging.
        logging.debug(sys.path)
except ImportError as e:
    # Best-effort: proceed without six rather than failing the test here.
    logging.warning("Could not import six due to %s", e)
33
34from contextlib import contextmanager
35
36from autotest_lib.client.common_lib import error
37from autotest_lib.server import test
38from autotest_lib.server import utils
39from autotest_lib.server.cros import telemetry_runner
40from autotest_lib.server.cros.crosperf import device_setup_utils
41from autotest_lib.site_utils import test_runner_utils
42
# Timeout for host-side scp commands retrieving logs from the DUT.
WAIT_FOR_CMD_TIMEOUT_SECS = 60
# SSH options applied to every scp invocation against the DUT; batch
# mode plus disabled host-key checks so no interactive prompt can hang.
DUT_COMMON_SSH_OPTIONS = [
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
]
DUT_SCP_OPTIONS = ' '.join(DUT_COMMON_SSH_OPTIONS)

# '-i <key>' argument pointing at the shared autotest testing RSA key.
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
# Location of the telemetry source tree / results on the DUT.
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src'

# File names for the system-stat logs collected while the benchmark runs.
TURBOSTAT_LOG = 'turbostat.log'
CPUSTATS_LOG = 'cpustats.log'
CPUINFO_LOG = 'cpuinfo.log'
TOP_LOG = 'top.log'
# Accumulated device-setup cooldown time, consumed by Crosperf.
WAIT_TIME_LOG = 'wait_time.log'

# Result Statuses
SUCCESS_STATUS = 'SUCCESS'
WARNING_STATUS = 'WARNING'
FAILED_STATUS = 'FAILED'

# Regex for the RESULT output lines understood by chrome buildbot.
# TODO b:169251326 terms below are set outside of this codebase and
# should be updated when possible ("slave"). # nocheck
# Keep in sync with
# chromium/tools/build/scripts/slave/performance_log_processor.py. # nocheck
RESULTS_REGEX = re.compile(r'(?P<IMPORTANT>\*)?RESULT '
                           r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                           r'(?P<VALUE>[\{\[]?[-\d\., ]+[\}\]]?)('
                           r' ?(?P<UNITS>.+))?')
HISTOGRAM_REGEX = re.compile(r'(?P<IMPORTANT>\*)?HISTOGRAM '
                             r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                             r'(?P<VALUE_JSON>{.*})(?P<UNITS>.+)?')
78
79
class telemetry_Crosperf(test.test):
    """
    Run one or more telemetry benchmarks under the crosperf script.

    Optionally collects system statistics (cpufreq/thermal samples,
    turbostat, top) in the background on the DUT while the benchmark
    runs, copies the logs into self.resultsdir, and verifies that a
    histograms.json result file — and perf.data profiles when profiling
    was requested — were produced.
    """
    version = 1

    def scp_telemetry_results(self, client_ip, dut, file_path, host_dir,
                              ignore_status=False):
        """
        Copy telemetry results from dut.

        @param client_ip: The ip address of the DUT, optionally suffixed
                with ':port'. Only consulted when 'dut' is None.
        @param dut: The autotest host object representing DUT.
        @param file_path: The file to copy from DUT.
        @param host_dir: The directory on host to put the file.
        @param ignore_status: Passed through to utils.run; when True a
                non-zero scp exit does not raise, it is only returned.

        @returns status code for scp command.

        """
        cmd = ['scp', DUT_SCP_OPTIONS, RSA_KEY, '-v']
        port = ''

        # Prefer the host object for address/port; otherwise parse the
        # 'ip[:port]' string given by the caller.
        if dut:
            port = dut.port
            ip = dut.hostname
        else:
            ip_and_port = client_ip.split(':')
            ip = ip_and_port[0]
            if len(ip_and_port) > 1:
                port = ip_and_port[1]

        if port:
            cmd.extend(['-P', str(port)])

        src = 'root@%s:%s' % (ip, file_path)
        cmd.extend([src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Results: %s', command)
        try:
            result = utils.run(
                    command,
                    timeout=WAIT_FOR_CMD_TIMEOUT_SECS,
                    ignore_status=ignore_status)
            exit_code = result.exit_status
        except Exception as e:
            # Log for context, then let the caller see the original error.
            logging.error('Failed to retrieve results: %s', e)
            raise

        if exit_code:
            logging.error('Command "%s" returned non-zero status: %d', command,
                          exit_code)
        return exit_code

    @contextmanager
    def no_background(self, *_args):
        """
        Background stub.

        No-op context manager used in place of the run_*_in_background
        helpers when a collector is disabled or no DUT is available;
        accepts and ignores the same positional arguments.
        """
        yield

    @contextmanager
    def run_in_background_with_log(self, cmd, dut, log_path):
        """
        Get cpustats periodically in background.

        Starts 'cmd' on the DUT, yields to the caller while it runs, then
        kills the command's whole process group and copies 'log_path'
        from the DUT into self.resultsdir.

        NOTE: No error handling, exception or error report if command fails
        to run in background. Command failure is silenced.

        @param cmd: Shell command line to run on the DUT.
        @param dut: The autotest host object representing DUT.
        @param log_path: Path on the DUT where 'cmd' writes its log.
        """
        logging.info('Running in background:\n%s', cmd)
        pid = dut.run_background(cmd)
        try:
            # TODO(denik http://crbug.com/966514): replace with more reliable
            # way to check run success/failure in background.
            # Wait some time before checking the process.
            check = dut.run('sleep 5; kill -0 %s' % pid, ignore_status=True)
            if check.exit_status != 0:
                # command wasn't started correctly
                logging.error(
                        "Background command wasn't started correctly.\n"
                        'Command:\n%s', cmd)
                # Clear pid so the finally-block skips kill/scp cleanup.
                pid = ''
                yield
                return

            logging.info('Background process started successfully, pid %s',
                         pid)
            yield

        finally:
            if pid:
                # Stop background processes.
                logging.info('Killing background process, pid %s', pid)
                # Kill the process blindly. OK if it's already gone.
                # There is an issue when underlying child processes stay alive
                # while the parent main process is killed.
                # The solution is to kill the chain of processes via process
                # group id (field 5 of /proc/<pid>/stat, read past the
                # parenthesized comm field).
                dut.run('pgid=$(cat /proc/%s/stat | cut -d")" -f2 | '
                        'cut -d" " -f4) && kill -- -$pgid 2>/dev/null' % pid,
                        ignore_status=True)

                # Copy the results to results directory with silenced failure.
                scp_res = self.scp_telemetry_results(
                        '', dut, log_path, self.resultsdir, ignore_status=True)
                if scp_res:
                    logging.error(
                            'scp of cpuinfo logs failed '
                            'with error %d.', scp_res)

    def run_cpustats_in_background(self, dut, log_name):
        """
        Run command to collect CPU stats in background.

        Samples per-core cpufreq and CPU-related thermal-zone readings
        every 30 seconds into /tmp/<log_name> on the DUT.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under /tmp on the DUT.

        @returns A context manager wrapping the background collection.
        """
        log_path = '/tmp/%s' % log_name
        cpu_stats_cmd = (
                'cpulog=%s; '
                'rm -f ${cpulog}; '
                # Stop after 720*0.5min=6hours if not killed at clean-up phase.
                'for i in {1..720} ; do '
                # Collect current CPU frequency on all cores and thermal data.
                ' paste -d" " '
                '  <(ls /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                '  <(cat /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                '  >> ${cpulog} || break; '  # exit loop if fails.
                ' paste -d" "'
                '   <(cat /sys/class/thermal/thermal_zone*/type)'
                '   <(cat /sys/class/thermal/thermal_zone*/temp)'
                # Filter in thermal data from only CPU-related sources.
                '  | grep -iE "soc|cpu|pkg|big|little" >> ${cpulog} || break; '
                # Repeat every 30 sec.
                ' sleep 30; '
                'done;') % log_path

        return self.run_in_background_with_log(cpu_stats_cmd, dut, log_path)

    def run_top_in_background(self, dut, log_name, interval_in_sec):
        """
        Run top in background.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under /tmp on the DUT.
        @param interval_in_sec: Delay between top snapshots, in seconds.

        @returns A context manager wrapping the background collection.
        """
        log_path = '/tmp/%s' % log_name
        top_cmd = (
                # Run top in batch mode with specified interval and filter out
                # top system summary and processes not consuming %CPU.
                # Output of each iteration is separated by a blank line.
                'HOME=/usr/local COLUMNS=128 top -bi -d%.1f'
                ' | grep -E "^[ 0-9]|^$" > %s;') % (interval_in_sec, log_path)

        return self.run_in_background_with_log(top_cmd, dut, log_path)

    def run_turbostat_in_background(self, dut, log_name):
        """
        Run turbostat in background.

        Collects frequency/busy/IRQ/temperature counters every 10 seconds
        into /tmp/<log_name> on the DUT.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under /tmp on the DUT.

        @returns A context manager wrapping the background collection.
        """
        log_path = '/tmp/%s' % log_name
        turbostat_cmd = (
                'nohup turbostat --quiet --interval 10 '
                '--show=CPU,Bzy_MHz,Avg_MHz,TSC_MHz,Busy%%,IRQ,CoreTmp '
                '1> %s') % log_path

        return self.run_in_background_with_log(turbostat_cmd, dut, log_path)

    def run_cpuinfo(self, dut, log_name):
        """
        Collect CPU info of "dut" into "log_name" file.

        Dumps per-core cpufreq settings, intel_pstate/online status and
        thermal-zone data into <self.resultsdir>/<log_name> on the host.
        Command failure is logged but never raises.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under self.resultsdir.
        """
        cpuinfo_cmd = (
                'for cpunum in '
                "   $(awk '/^processor/ { print $NF ; }' /proc/cpuinfo ) ; do "
                ' for i in'
                ' `ls -d /sys/devices/system/cpu/cpu"${cpunum}"/cpufreq/'
                '{cpuinfo_cur_freq,scaling_*_freq,scaling_governor} '
                '     2>/dev/null` ; do '
                '  paste -d" " <(echo "${i}") <(cat "${i}"); '
                ' done;'
                'done;'
                'for cpudata in'
                '  /sys/devices/system/cpu/intel_pstate/no_turbo'
                '  /sys/devices/system/cpu/online; do '
                ' if [[ -e "${cpudata}" ]] ; then '
                '  paste <(echo "${cpudata}") <(cat "${cpudata}"); '
                ' fi; '
                'done; '
                # Adding thermal data to the log.
                'paste -d" "'
                '  <(cat /sys/class/thermal/thermal_zone*/type)'
                '  <(cat /sys/class/thermal/thermal_zone*/temp)')

        # Get CPUInfo at the end of the test.
        logging.info('Get cpuinfo: %s', cpuinfo_cmd)
        with open(os.path.join(self.resultsdir, log_name),
                  'w') as cpu_log_file:
            # Handle command failure gracefully.
            res = dut.run(
                    cpuinfo_cmd, stdout_tee=cpu_log_file, ignore_status=True)
            if res.exit_status:
                logging.error('Get cpuinfo command failed with %d.',
                              res.exit_status)

    def run_once(self, args, client_ip='', dut=None):
        """
        Run a single telemetry test.

        @param args: A dictionary of the arguments that were passed
                to this test. Recognized keys: 'test', 'test_args',
                'profiler_args', 'dut_config' (JSON string), 'local',
                'run_local'.
        @param client_ip: The ip address of the DUT
        @param dut: The autotest host object representing DUT.

        @returns A TelemetryResult instance with the results of this execution.
        """
        test_name = args.get('test', '')
        test_args = args.get('test_args', '')
        profiler_args = args.get('profiler_args', '')

        dut_config_str = args.get('dut_config', '{}')
        dut_config = json.loads(dut_config_str)
        # Setup device with dut_config arguments before running test
        if dut_config:
            wait_time = device_setup_utils.setup_device(dut, dut_config)
            # Wait time can be used to accumulate cooldown time in Crosperf.
            with open(os.path.join(self.resultsdir, WAIT_TIME_LOG), 'w') as f:
                f.write(str(wait_time))

        output_format = 'histograms'

        # For local runs, we set local=True and use local chrome source to run
        # tests; for lab runs, we use the drone instead.
        # By default to be True.
        local = args.get('local', 'true').lower() == 'true'

        # If run_local=true, telemetry benchmark will run on DUT, otherwise
        # run remotely from host.
        # By default to be False.
        # TODO(zhizhouy): It is better to change the field name from "run_local"
        # to "telemetry_on_dut" in crosperf experiment files for consistency.
        telemetry_on_dut = args.get('run_local', '').lower() == 'true'

        # Run the test. And collect profile if needed.
        try:
            # If profiler_args specified, we want to add several more options
            # to the command so that run_benchmark will collect system wide
            # profiles.
            # NOTE: profiler_opts is only defined here; it is used further
            # down under the same 'if profiler_args' guard.
            if profiler_args:
                profiler_opts = [
                        '--interval-profiling-period=story_run',
                        '--interval-profiling-target=system_wide',
                        '--interval-profiler-options="%s"' % profiler_args
                ]

            top_interval = dut_config.get('top_interval', 0)
            turbostat = dut_config.get('turbostat')

            # Pick the real background collector or the no-op stub,
            # depending on whether a DUT is available and the feature is
            # enabled in dut_config.
            run_cpuinfo = self.run_cpustats_in_background if dut \
                else self.no_background
            run_turbostat = self.run_turbostat_in_background if (
                    dut and turbostat) else self.no_background
            run_top = self.run_top_in_background if (
                    dut and top_interval > 0) else self.no_background

            # FIXME(denik): replace with ExitStack.
            with run_cpuinfo(dut, CPUSTATS_LOG) as _cpu_cm, \
                run_turbostat(dut, TURBOSTAT_LOG) as _turbo_cm, \
                run_top(dut, TOP_LOG, top_interval) as _top_cm:

                arguments = []
                if test_args:
                    arguments.extend(shlex.split(test_args))
                if profiler_args:
                    arguments.extend(profiler_opts)
                logging.debug('Telemetry Arguments: %s', arguments)
                perf_value_writer = self
                # Artifacts (perf.data profiles) are only produced when
                # profiling was requested.
                artifacts = True if profiler_args else False
                with telemetry_runner.TelemetryRunnerFactory().get_runner(
                        dut, local=local,
                        telemetry_on_dut=telemetry_on_dut) as tr:
                    result = tr.run_telemetry_benchmark(
                            test_name,
                            perf_value_writer,
                            *arguments,
                            ex_output_format=output_format,
                            results_dir=self.resultsdir,
                            no_verbose=True,
                            artifacts=artifacts)
                    logging.info('Telemetry completed with exit status: %s.',
                                 result.status)
                    logging.info('output: %s\n', result.output)

        except (error.TestFail, error.TestWarn):
            logging.debug(
                    'Test did not succeed while executing telemetry test.')
            raise
        # NOTE(review): bare except re-raises, so nothing is swallowed,
        # but 'except Exception:' would be the conventional spelling.
        except:
            logging.debug('Unexpected failure on telemetry_Crosperf.')
            raise
        finally:
            # Always snapshot final CPU state, even on failure.
            if dut:
                self.run_cpuinfo(dut, CPUINFO_LOG)

        # Checking whether result file exists.
        filepath = os.path.join(self.resultsdir, 'histograms.json')
        if not os.path.exists(filepath):
            raise RuntimeError('Missing results file: %s' % filepath)

        # Copy the perf data file into the test_that profiling directory,
        # if necessary. It always comes from DUT.
        if profiler_args:
            filepath = os.path.join(self.resultsdir, 'artifacts')
            # NOTE(review): 'arfifacts' in the message below is a typo for
            # 'artifacts' (runtime string, left unchanged here).
            if not os.path.isabs(filepath):
                raise RuntimeError('Expected absolute path of '
                                   'arfifacts: %s' % filepath)
            perf_exist = False
            for root, _dirs, files in os.walk(filepath):
                for f in files:
                    if f.endswith('.perf.data'):
                        perf_exist = True
                        src_file = os.path.join(root, f)
                        # results-cache.py in crosperf supports multiple
                        # perf.data files, but only if they are named exactly
                        # so. Therefore, create a subdir for each perf.data
                        # file.
                        # Use mkdtemp to make sure a directory name is unique.
                        dst_dir = tempfile.mkdtemp(dir=self.profdir,
                                                   prefix=''.join(
                                                           f.split('.')[:-2]))
                        dst_file = os.path.join(dst_dir, 'perf.data')
                        shutil.copyfile(src_file, dst_file)
            if not perf_exist:
                raise error.TestFail('Error: No profiles collected, test may '
                                     'not run correctly.')

        # In skylab run, environment variable SYNCHRONOUS_OFFLOAD_DIR is set to
        # be a directory; In test_that run, it will not be set.
        synchronous_dir = os.getenv('SYNCHRONOUS_OFFLOAD_DIR', '')
        # For skylab run, push logs and json results to the synchronous offload
        # directory for instant access for report.
        if synchronous_dir and os.path.isdir(synchronous_dir):
            try:
                dst = os.path.join(synchronous_dir, 'results')
                shutil.copytree(self.resultsdir, dst)
            # NOTE(review): bare except drops the original traceback; the
            # RuntimeError is raised without 'from' chaining.
            except:
                raise RuntimeError(
                    'Failed to copy results in %s to synchronous offload'
                    ' directory %s' % (self.resultsdir, synchronous_dir))

        return result
432