xref: /aosp_15_r20/external/autotest/server/site_tests/telemetry_Crosperf/telemetry_Crosperf.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5
6import json
7import logging
8import os
9import re
10import shlex
11import shutil
12import sys
13import tempfile
14
15
# TODO (b/206008069), remove this when migrated to new env
# Force the pinned six-1.16.0 egg to the front of sys.path so it wins
# over any older six already installed on the test drone.
sys.path.insert(0,
                '/usr/local/lib/python2.7/dist-packages/six-1.16.0-py2.7.egg')
try:
    # This is weird. But it seems something is bringing in six earlier.
    # Going to force a reload after the egg is inserted.
    import six
    if six.PY2:
        # Python 2 only: reload() is a builtin there.
        reload(six)
    else:
        import importlib
        importlib.reload(six)
    logging.debug("six version is {}".format(six.__version__))
    if six.__version__ != '1.16.0':
        # The wrong six won the import race; dump sys.path for debugging.
        logging.debug(sys.path)
except ImportError as e:
    # Best-effort: proceed without six rather than failing the test here.
    logging.warning("Could not import six due to %s", e)
33
34from contextlib import contextmanager
35
36from autotest_lib.client.common_lib import error
37from autotest_lib.server import test
38from autotest_lib.server import utils
39from autotest_lib.server.cros import telemetry_runner
40from autotest_lib.server.cros.crosperf import device_setup_utils
41from autotest_lib.site_utils import test_runner_utils
42
# Timeout for host-side scp commands retrieving logs from the DUT.
WAIT_FOR_CMD_TIMEOUT_SECS = 60
# SSH options applied to every scp invocation against the DUT; batch
# mode plus disabled host-key checks so no interactive prompt can hang.
DUT_COMMON_SSH_OPTIONS = [
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
]
DUT_SCP_OPTIONS = ' '.join(DUT_COMMON_SSH_OPTIONS)

# '-i <key>' argument pointing at the shared autotest testing RSA key.
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
# Location of the telemetry source tree / results on the DUT.
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src'

# File names for the system-stat logs collected while the benchmark runs.
TURBOSTAT_LOG = 'turbostat.log'
CPUSTATS_LOG = 'cpustats.log'
CPUINFO_LOG = 'cpuinfo.log'
TOP_LOG = 'top.log'
# Accumulated device-setup cooldown time, consumed by Crosperf.
WAIT_TIME_LOG = 'wait_time.log'

# Result Statuses
SUCCESS_STATUS = 'SUCCESS'
WARNING_STATUS = 'WARNING'
FAILED_STATUS = 'FAILED'

# Regex for the RESULT output lines understood by chrome buildbot.
# TODO b:169251326 terms below are set outside of this codebase and
# should be updated when possible ("slave"). # nocheck
# Keep in sync with
# chromium/tools/build/scripts/slave/performance_log_processor.py. # nocheck
RESULTS_REGEX = re.compile(r'(?P<IMPORTANT>\*)?RESULT '
                           r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                           r'(?P<VALUE>[\{\[]?[-\d\., ]+[\}\]]?)('
                           r' ?(?P<UNITS>.+))?')
HISTOGRAM_REGEX = re.compile(r'(?P<IMPORTANT>\*)?HISTOGRAM '
                             r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                             r'(?P<VALUE_JSON>{.*})(?P<UNITS>.+)?')
78
79
class telemetry_Crosperf(test.test):
    """
    Run one or more telemetry benchmarks under the crosperf script.

    Optionally collects system statistics (cpufreq/thermal samples,
    turbostat, top) in the background on the DUT while the benchmark
    runs, copies the logs into self.resultsdir, and verifies that a
    histograms.json result file — and perf.data profiles when profiling
    was requested — were produced.
    """
    version = 1

    def scp_telemetry_results(self, client_ip, dut, file_path, host_dir,
                              ignore_status=False):
        """
        Copy telemetry results from dut.

        @param client_ip: The ip address of the DUT, optionally suffixed
                with ':port'. Only consulted when 'dut' is None.
        @param dut: The autotest host object representing DUT.
        @param file_path: The file to copy from DUT.
        @param host_dir: The directory on host to put the file.
        @param ignore_status: Passed through to utils.run; when True a
                non-zero scp exit does not raise, it is only returned.

        @returns status code for scp command.

        """
        cmd = ['scp', DUT_SCP_OPTIONS, RSA_KEY, '-v']
        port = ''

        # Prefer the host object for address/port; otherwise parse the
        # 'ip[:port]' string given by the caller.
        if dut:
            port = dut.port
            ip = dut.hostname
        else:
            ip_and_port = client_ip.split(':')
            ip = ip_and_port[0]
            if len(ip_and_port) > 1:
                port = ip_and_port[1]

        if port:
            cmd.extend(['-P', str(port)])

        src = 'root@%s:%s' % (ip, file_path)
        cmd.extend([src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Results: %s', command)
        try:
            result = utils.run(
                    command,
                    timeout=WAIT_FOR_CMD_TIMEOUT_SECS,
                    ignore_status=ignore_status)
            exit_code = result.exit_status
        except Exception as e:
            # Log for context, then let the caller see the original error.
            logging.error('Failed to retrieve results: %s', e)
            raise

        if exit_code:
            logging.error('Command "%s" returned non-zero status: %d', command,
                          exit_code)
        return exit_code

    @contextmanager
    def no_background(self, *_args):
        """
        Background stub.

        No-op context manager used in place of the run_*_in_background
        helpers when a collector is disabled or no DUT is available;
        accepts and ignores the same positional arguments.
        """
        yield

    @contextmanager
    def run_in_background_with_log(self, cmd, dut, log_path):
        """
        Get cpustats periodically in background.

        Starts 'cmd' on the DUT, yields to the caller while it runs, then
        kills the command's whole process group and copies 'log_path'
        from the DUT into self.resultsdir.

        NOTE: No error handling, exception or error report if command fails
        to run in background. Command failure is silenced.

        @param cmd: Shell command line to run on the DUT.
        @param dut: The autotest host object representing DUT.
        @param log_path: Path on the DUT where 'cmd' writes its log.
        """
        logging.info('Running in background:\n%s', cmd)
        pid = dut.run_background(cmd)
        try:
            # TODO(denik http://crbug.com/966514): replace with more reliable
            # way to check run success/failure in background.
            # Wait some time before checking the process.
            check = dut.run('sleep 5; kill -0 %s' % pid, ignore_status=True)
            if check.exit_status != 0:
                # command wasn't started correctly
                logging.error(
                        "Background command wasn't started correctly.\n"
                        'Command:\n%s', cmd)
                # Clear pid so the finally-block skips kill/scp cleanup.
                pid = ''
                yield
                return

            logging.info('Background process started successfully, pid %s',
                         pid)
            yield

        finally:
            if pid:
                # Stop background processes.
                logging.info('Killing background process, pid %s', pid)
                # Kill the process blindly. OK if it's already gone.
                # There is an issue when underlying child processes stay alive
                # while the parent main process is killed.
                # The solution is to kill the chain of processes via process
                # group id (field 5 of /proc/<pid>/stat, read past the
                # parenthesized comm field).
                dut.run('pgid=$(cat /proc/%s/stat | cut -d")" -f2 | '
                        'cut -d" " -f4) && kill -- -$pgid 2>/dev/null' % pid,
                        ignore_status=True)

                # Copy the results to results directory with silenced failure.
                scp_res = self.scp_telemetry_results(
                        '', dut, log_path, self.resultsdir, ignore_status=True)
                if scp_res:
                    logging.error(
                            'scp of cpuinfo logs failed '
                            'with error %d.', scp_res)

    def run_cpustats_in_background(self, dut, log_name):
        """
        Run command to collect CPU stats in background.

        Samples per-core cpufreq and CPU-related thermal-zone readings
        every 30 seconds into /tmp/<log_name> on the DUT.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under /tmp on the DUT.

        @returns A context manager wrapping the background collection.
        """
        log_path = '/tmp/%s' % log_name
        cpu_stats_cmd = (
                'cpulog=%s; '
                'rm -f ${cpulog}; '
                # Stop after 720*0.5min=6hours if not killed at clean-up phase.
                'for i in {1..720} ; do '
                # Collect current CPU frequency on all cores and thermal data.
                ' paste -d" " '
                '  <(ls /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                '  <(cat /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                '  >> ${cpulog} || break; '  # exit loop if fails.
                ' paste -d" "'
                '   <(cat /sys/class/thermal/thermal_zone*/type)'
                '   <(cat /sys/class/thermal/thermal_zone*/temp)'
                # Filter in thermal data from only CPU-related sources.
                '  | grep -iE "soc|cpu|pkg|big|little" >> ${cpulog} || break; '
                # Repeat every 30 sec.
                ' sleep 30; '
                'done;') % log_path

        return self.run_in_background_with_log(cpu_stats_cmd, dut, log_path)

    def run_top_in_background(self, dut, log_name, interval_in_sec):
        """
        Run top in background.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under /tmp on the DUT.
        @param interval_in_sec: Delay between top snapshots, in seconds.

        @returns A context manager wrapping the background collection.
        """
        log_path = '/tmp/%s' % log_name
        top_cmd = (
                # Run top in batch mode with specified interval and filter out
                # top system summary and processes not consuming %CPU.
                # Output of each iteration is separated by a blank line.
                'HOME=/usr/local COLUMNS=128 top -bi -d%.1f'
                ' | grep -E "^[ 0-9]|^$" > %s;') % (interval_in_sec, log_path)

        return self.run_in_background_with_log(top_cmd, dut, log_path)

    def run_turbostat_in_background(self, dut, log_name):
        """
        Run turbostat in background.

        Collects frequency/busy/IRQ/temperature counters every 10 seconds
        into /tmp/<log_name> on the DUT.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under /tmp on the DUT.

        @returns A context manager wrapping the background collection.
        """
        log_path = '/tmp/%s' % log_name
        turbostat_cmd = (
                'nohup turbostat --quiet --interval 10 '
                '--show=CPU,Bzy_MHz,Avg_MHz,TSC_MHz,Busy%%,IRQ,CoreTmp '
                '1> %s') % log_path

        return self.run_in_background_with_log(turbostat_cmd, dut, log_path)

    def run_cpuinfo(self, dut, log_name):
        """
        Collect CPU info of "dut" into "log_name" file.

        Dumps per-core cpufreq settings, intel_pstate/online status and
        thermal-zone data into <self.resultsdir>/<log_name> on the host.
        Command failure is logged but never raises.

        @param dut: The autotest host object representing DUT.
        @param log_name: Log file name created under self.resultsdir.
        """
        cpuinfo_cmd = (
                'for cpunum in '
                "   $(awk '/^processor/ { print $NF ; }' /proc/cpuinfo ) ; do "
                ' for i in'
                ' `ls -d /sys/devices/system/cpu/cpu"${cpunum}"/cpufreq/'
                '{cpuinfo_cur_freq,scaling_*_freq,scaling_governor} '
                '     2>/dev/null` ; do '
                '  paste -d" " <(echo "${i}") <(cat "${i}"); '
                ' done;'
                'done;'
                'for cpudata in'
                '  /sys/devices/system/cpu/intel_pstate/no_turbo'
                '  /sys/devices/system/cpu/online; do '
                ' if [[ -e "${cpudata}" ]] ; then '
                '  paste <(echo "${cpudata}") <(cat "${cpudata}"); '
                ' fi; '
                'done; '
                # Adding thermal data to the log.
                'paste -d" "'
                '  <(cat /sys/class/thermal/thermal_zone*/type)'
                '  <(cat /sys/class/thermal/thermal_zone*/temp)')

        # Get CPUInfo at the end of the test.
        logging.info('Get cpuinfo: %s', cpuinfo_cmd)
        with open(os.path.join(self.resultsdir, log_name),
                  'w') as cpu_log_file:
            # Handle command failure gracefully.
            res = dut.run(
                    cpuinfo_cmd, stdout_tee=cpu_log_file, ignore_status=True)
            if res.exit_status:
                logging.error('Get cpuinfo command failed with %d.',
                              res.exit_status)

    def run_once(self, args, client_ip='', dut=None):
        """
        Run a single telemetry test.

        @param args: A dictionary of the arguments that were passed
                to this test. Recognized keys: 'test', 'test_args',
                'profiler_args', 'dut_config' (JSON string), 'local',
                'run_local'.
        @param client_ip: The ip address of the DUT
        @param dut: The autotest host object representing DUT.

        @returns A TelemetryResult instance with the results of this execution.
        """
        test_name = args.get('test', '')
        test_args = args.get('test_args', '')
        profiler_args = args.get('profiler_args', '')

        dut_config_str = args.get('dut_config', '{}')
        dut_config = json.loads(dut_config_str)
        # Setup device with dut_config arguments before running test
        if dut_config:
            wait_time = device_setup_utils.setup_device(dut, dut_config)
            # Wait time can be used to accumulate cooldown time in Crosperf.
            with open(os.path.join(self.resultsdir, WAIT_TIME_LOG), 'w') as f:
                f.write(str(wait_time))

        output_format = 'histograms'

        # For local runs, we set local=True and use local chrome source to run
        # tests; for lab runs, we use the drone instead.
        # By default to be True.
        local = args.get('local', 'true').lower() == 'true'

        # If run_local=true, telemetry benchmark will run on DUT, otherwise
        # run remotely from host.
        # By default to be False.
        # TODO(zhizhouy): It is better to change the field name from "run_local"
        # to "telemetry_on_dut" in crosperf experiment files for consistency.
        telemetry_on_dut = args.get('run_local', '').lower() == 'true'

        # Run the test. And collect profile if needed.
        try:
            # If profiler_args specified, we want to add several more options
            # to the command so that run_benchmark will collect system wide
            # profiles.
            # NOTE: profiler_opts is only defined here; it is used further
            # down under the same 'if profiler_args' guard.
            if profiler_args:
                profiler_opts = [
                        '--interval-profiling-period=story_run',
                        '--interval-profiling-target=system_wide',
                        '--interval-profiler-options="%s"' % profiler_args
                ]

            top_interval = dut_config.get('top_interval', 0)
            turbostat = dut_config.get('turbostat')

            # Pick the real background collector or the no-op stub,
            # depending on whether a DUT is available and the feature is
            # enabled in dut_config.
            run_cpuinfo = self.run_cpustats_in_background if dut \
                else self.no_background
            run_turbostat = self.run_turbostat_in_background if (
                    dut and turbostat) else self.no_background
            run_top = self.run_top_in_background if (
                    dut and top_interval > 0) else self.no_background

            # FIXME(denik): replace with ExitStack.
            with run_cpuinfo(dut, CPUSTATS_LOG) as _cpu_cm, \
                run_turbostat(dut, TURBOSTAT_LOG) as _turbo_cm, \
                run_top(dut, TOP_LOG, top_interval) as _top_cm:

                arguments = []
                if test_args:
                    arguments.extend(shlex.split(test_args))
                if profiler_args:
                    arguments.extend(profiler_opts)
                logging.debug('Telemetry Arguments: %s', arguments)
                perf_value_writer = self
                # Artifacts (perf.data profiles) are only produced when
                # profiling was requested.
                artifacts = True if profiler_args else False
                with telemetry_runner.TelemetryRunnerFactory().get_runner(
                        dut, local=local,
                        telemetry_on_dut=telemetry_on_dut) as tr:
                    result = tr.run_telemetry_benchmark(
                            test_name,
                            perf_value_writer,
                            *arguments,
                            ex_output_format=output_format,
                            results_dir=self.resultsdir,
                            no_verbose=True,
                            artifacts=artifacts)
                    logging.info('Telemetry completed with exit status: %s.',
                                 result.status)
                    logging.info('output: %s\n', result.output)

        except (error.TestFail, error.TestWarn):
            logging.debug(
                    'Test did not succeed while executing telemetry test.')
            raise
        # NOTE(review): bare except re-raises, so nothing is swallowed,
        # but 'except Exception:' would be the conventional spelling.
        except:
            logging.debug('Unexpected failure on telemetry_Crosperf.')
            raise
        finally:
            # Always snapshot final CPU state, even on failure.
            if dut:
                self.run_cpuinfo(dut, CPUINFO_LOG)

        # Checking whether result file exists.
        filepath = os.path.join(self.resultsdir, 'histograms.json')
        if not os.path.exists(filepath):
            raise RuntimeError('Missing results file: %s' % filepath)

        # Copy the perf data file into the test_that profiling directory,
        # if necessary. It always comes from DUT.
        if profiler_args:
            filepath = os.path.join(self.resultsdir, 'artifacts')
            # NOTE(review): 'arfifacts' in the message below is a typo for
            # 'artifacts' (runtime string, left unchanged here).
            if not os.path.isabs(filepath):
                raise RuntimeError('Expected absolute path of '
                                   'arfifacts: %s' % filepath)
            perf_exist = False
            for root, _dirs, files in os.walk(filepath):
                for f in files:
                    if f.endswith('.perf.data'):
                        perf_exist = True
                        src_file = os.path.join(root, f)
                        # results-cache.py in crosperf supports multiple
                        # perf.data files, but only if they are named exactly
                        # so. Therefore, create a subdir for each perf.data
                        # file.
                        # Use mkdtemp to make sure a directory name is unique.
                        dst_dir = tempfile.mkdtemp(dir=self.profdir,
                                                   prefix=''.join(
                                                           f.split('.')[:-2]))
                        dst_file = os.path.join(dst_dir, 'perf.data')
                        shutil.copyfile(src_file, dst_file)
            if not perf_exist:
                raise error.TestFail('Error: No profiles collected, test may '
                                     'not run correctly.')

        # In skylab run, environment variable SYNCHRONOUS_OFFLOAD_DIR is set to
        # be a directory; In test_that run, it will not be set.
        synchronous_dir = os.getenv('SYNCHRONOUS_OFFLOAD_DIR', '')
        # For skylab run, push logs and json results to the synchronous offload
        # directory for instant access for report.
        if synchronous_dir and os.path.isdir(synchronous_dir):
            try:
                dst = os.path.join(synchronous_dir, 'results')
                shutil.copytree(self.resultsdir, dst)
            # NOTE(review): bare except drops the original traceback; the
            # RuntimeError is raised without 'from' chaining.
            except:
                raise RuntimeError(
                    'Failed to copy results in %s to synchronous offload'
                    ' directory %s' % (self.resultsdir, synchronous_dir))

        return result
432