xref: /aosp_15_r20/external/chromium-trace/catapult/common/py_utils/py_utils/webpagereplay_go_server.py (revision 1fa4b3da657c0e9ad43c0220bacf9731820715a5)
1# Copyright 2017 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Start and stop Web Page Replay."""
6
7import logging
8import os
9import re
10import signal
11import subprocess
12import sys
13import tempfile
14import urllib
15
16import py_utils
17from py_utils import atexit_with_log
18from py_utils import binary_manager
19
# Directory holding the Go implementation of Web Page Replay together with
# its key, certificate and injected script.
_WPR_DIR = os.path.abspath(
    os.path.join(py_utils.GetCatapultDir(), 'web_page_replay_go'))

# binary_manager configs consulted when the wpr_go binary has to be fetched.
TELEMETRY_PROJECT_CONFIG = os.path.join(
    py_utils.GetCatapultDir(),
    'telemetry',
    'telemetry',
    'binary_dependencies.json')

CHROME_BINARY_CONFIG = os.path.join(
    py_utils.GetCatapultDir(),
    'common',
    'py_utils',
    'py_utils',
    'chrome_binaries.json')

# The only option strings accepted in ReplayServer's replay_options.
RECORD = '--record'
INJECT_SCRIPTS = '--inject_scripts='
USE_LOCAL_WPR = '--use-local-wpr'
DISABLE_FUZZY_URL_MATCHING = '--disable-fuzzy-url-matching'
35
class ReplayError(Exception):
  """Base class for all errors raised by this module."""
39
40
class ReplayNotFoundError(ReplayError):
  """Raised when a path required by replay does not exist."""

  def __init__(self, label, path):
    """Create a ReplayNotFoundError instance.

    Args:
      label: A string naming what the missing path was supposed to be.
      path: A string of the path that could not be found.
    """
    # Passing both values to the base class stores them in self.args.
    super(ReplayNotFoundError, self).__init__(label, path)

  def __str__(self):
    return 'Path does not exist for %s: %s' % self.args
58
59
class ReplayNotStartedError(ReplayError):
  """Raised by ReplayServer.StartServer when replay fails to come up."""
62
63
class ReplayServer(object):
  """Start and Stop Web Page Replay.

  Web Page Replay is a proxy that can record and "replay" web pages with
  simulated network characteristics -- without having to edit the pages
  by hand. With WPR, tests can use "real" web content, and catch
  performance issues that may result from introducing network delays and
  bandwidth throttling.

  This class could be used as a context manager.

  Example:
     with ReplayServer(archive_path, replay_host, http_port, https_port,
                       replay_options):
       self.NavigateToURL(start_url)
       self.WaitUntil(...)
  """

  # Process-wide cache of the wpr_go binary path, shared by all instances.
  _go_binary_path = None

  # Matches the "Starting server on <protocol>://<host>:<port>" log lines.
  # Compiled once because _IsStarted() re-parses the log on every poll.
  _PORT_RE = re.compile(
      r'.*Starting server on '
      r'(?P<protocol>http|https)://'
      r'(?P<host>[^:]*):'
      r'(?P<port>\d+)')

  def __init__(self, archive_path, replay_host, http_port, https_port,
               replay_options, binary_downloader=None):
    """Initialize ReplayServer.

    Args:
      archive_path: a path to a specific WPR archive.
      replay_host: the hostname to serve traffic.
      http_port: an integer port on which to serve HTTP traffic. May be zero
          to let the OS choose an available port.
      https_port: an integer port on which to serve HTTPS traffic. May be zero
          to let the OS choose an available port.
      replay_options: a list of option strings; each must be one of RECORD,
          INJECT_SCRIPTS, USE_LOCAL_WPR or DISABLE_FUZZY_URL_MATCHING.
      binary_downloader: a function to be used to fetch binary. May be None to
          use py_utils.binary_manager.FetchPath as default downloader.

    Raises:
      ValueError: if replay_options contains an unsupported option.
      ReplayNotFoundError: if the archive (or, when recording, the directory
          that will contain it) does not exist.
    """
    self.archive_path = archive_path
    self.replay_process = None
    self._replay_host = replay_host
    self._started_ports = {}  # a dict such as {'http': 80, 'https': 443}

    # A temporary path for storing stdout & stderr of the webpagereplay
    # subprocess.
    self._temp_log_file_path = None

    self._downloader = binary_downloader
    self._replay_options = replay_options
    self._cmd_line = self._GetCommandLine(
        self._GetGoBinaryPath(replay_options), http_port, https_port,
        replay_options, archive_path)

    if RECORD in replay_options or 'record' in replay_options:
      # When recording, only the containing directory has to exist. A bare
      # file name has an empty dirname, which means the current directory.
      self._AssertPathExists(
          'archive directory',
          os.path.dirname(self.archive_path) or os.curdir)
    else:
      self._AssertPathExists('archive file', self.archive_path)

  def _GetDownloader(self):
    """Gets the downloader used to download wpr_go binary from GCS."""
    if ReplayServer._go_binary_path:
      # If the _go_binary_path was already set, then no need to use downloader
      # to download via binary_manager.
      self._downloader = None
    elif not self._downloader:
      configs = [CHROME_BINARY_CONFIG, TELEMETRY_PROJECT_CONFIG]
      self._downloader = binary_manager.BinaryManager(configs).FetchPath
    return self._downloader

  def _GetGoBinaryPath(self, replay_options):
    """Gets the _go_binary_path if it already set, or downloads it.

    When USE_LOCAL_WPR is requested, wpr.go is built from _WPR_DIR/src with
    the local Go toolchain and the freshly built binary path is returned
    instead (the class-level cache is neither read nor written).

    Args:
      replay_options: the option strings passed to the constructor.

    Returns:
      A string path to the wpr binary.

    Raises:
      RuntimeError: if no binary path is cached and no downloader is
          available.
      SystemExit: if building the local binary fails.
    """
    if USE_LOCAL_WPR in replay_options:
      go_folder = os.path.join(_WPR_DIR, 'src')
      try:
        # Run the build with cwd=go_folder rather than os.chdir() so this
        # process's working directory is never left changed on failure.
        print(subprocess.check_output(
            ['go', 'build', os.path.join(go_folder, 'wpr.go')],
            cwd=go_folder))
      except subprocess.CalledProcessError as e:
        logging.error('Building local WPR failed with exit code %s: %s',
                      e.returncode, e.output)
        sys.exit(1)
      return os.path.join(go_folder, 'wpr')

    if not ReplayServer._go_binary_path:
      downloader = self._GetDownloader()
      if not downloader:
        raise RuntimeError('downloader should not be None '
                           'while _go_binary_path is None')
      ReplayServer._go_binary_path = downloader(
          'wpr_go', py_utils.GetHostOsName(), py_utils.GetHostArchName())
    return ReplayServer._go_binary_path

  @classmethod
  def SetGoBinaryPath(cls, go_binary_path):
    """Overrides the _go_binary_path.

    This allows the server to use WPRGO files retrieved from somewhere
    other than GCS via binary_manager, such as test isolation.

    For chromium project to use WPR, it is encouraged to use test isolation,
    and therefore should call SetGoBinaryPath to set _go_binary_path.

    For Catapult/Telemetry project, the tradition is to download wpr_go
    binary via binary_manager. So do not call SetGoBinaryPath.

    Args:
      go_binary_path: a string path to an existing wpr_go binary.

    Raises:
      ValueError: if go_binary_path does not exist.
    """
    if not os.path.exists(go_binary_path):
      raise ValueError('SetGoBinaryPath could not set {} as it does not exist'
                       .format(go_binary_path))
    cls._go_binary_path = go_binary_path

  @property
  def http_port(self):
    """The HTTP port reported by replay; KeyError if it is not yet known."""
    return self._started_ports['http']

  @property
  def https_port(self):
    """The HTTPS port reported by replay; KeyError if it is not yet known."""
    return self._started_ports['https']

  @staticmethod
  def _GetCommandLine(go_binary_path, http_port, https_port,
                      options, archive_path):
    """Set WPR command-line arguments. Can be overridden if needed.

    Keyword arguments:

    * go_binary_path: A string of the path to the wpr.go binary.
    * http_port: A decimal of the port that handles http requests.
    * https_port: A decimal of the port that handles https requests.
    * options: A list of options, such as '--record',
        '--inject_scripts=', etc.
    * archive_path: A string of the path to the archive file.

    Returns:
      The command line as a list of strings, starting with go_binary_path
      and ending with archive_path.

    Raises:
      ValueError: if options contains an unsupported option.
    """
    allowed_options = (RECORD, INJECT_SCRIPTS,
                       USE_LOCAL_WPR, DISABLE_FUZZY_URL_MATCHING)
    bad_options = [opt for opt in options if opt not in allowed_options]
    if bad_options:
      raise ValueError("Invalid replay options %s" % bad_options)

    cmd_line = [go_binary_path, 'record' if RECORD in options else 'replay']
    if DISABLE_FUZZY_URL_MATCHING in options:
      cmd_line.append('--disable_fuzzy_url_matching')
    key_file = os.path.join(_WPR_DIR, 'wpr_key.pem')
    cert_file = os.path.join(_WPR_DIR, 'wpr_cert.pem')
    inject_script = os.path.join(_WPR_DIR, 'deterministic.js')
    cmd_line.extend([
        '--http_port=%s' % http_port,
        '--https_port=%s' % https_port,
        '--https_key_file=%s' % key_file,
        '--https_cert_file=%s' % cert_file])
    if INJECT_SCRIPTS in options:
      # The bare flag (empty value) replaces the default injected script.
      cmd_line.append(INJECT_SCRIPTS)
    else:
      cmd_line.append('--inject_scripts=%s' % inject_script)
    cmd_line.append(archive_path)
    return cmd_line

  def _AssertPathExists(self, label, path):
    """Raises ReplayNotFoundError(label, path) unless path exists."""
    if not os.path.exists(path):
      raise ReplayNotFoundError(label, path)

  def _OpenLogFile(self):
    """Opens the log file for writing, creating its directory if needed."""
    log_dir = os.path.dirname(self._temp_log_file_path)
    if not os.path.isdir(log_dir):
      os.makedirs(log_dir)
    return open(self._temp_log_file_path, 'w')

  def _LogLines(self):
    """Yields any log lines that have been written to disk.

    Yields the single placeholder '(N/A)' when there is no log file.
    """
    if (not self._temp_log_file_path or
        not os.path.isfile(self._temp_log_file_path)):
      yield '(N/A)'
      return
    with open(self._temp_log_file_path) as f:
      for line in f:
        yield line

  def _IsStarted(self):
    """Returns true if the server is up and running."""
    if not self._IsReplayProcessStarted():
      return False

    def HasIncompleteStartedPorts():
      return ('http' not in self._started_ports or
              'https' not in self._started_ports)

    if HasIncompleteStartedPorts():
      self._started_ports = self._ParseLogFilePorts(self._LogLines())
    if HasIncompleteStartedPorts():
      return False

    try:
      # HTTPS may require SNI (which urllib does not speak), so only check
      # that HTTP responds.
      return self._UrlOpen('web-page-replay-generate-200').getcode() == 200
    except IOError:
      return False

  @staticmethod
  def _ParseLogFilePorts(log_lines):
    """Returns the ports on which replay listens as reported in its log file.

    Only matches HTTP and HTTPS. One call may return only some of the ports
    depending on what has been written to the log file. If a protocol is
    reported more than once, the last occurrence wins.

    Example log lines:
      2014-09-03 17:04:27,978 Starting server on http://:51673
      2014-09-03 17:04:27,978 Starting server on https://:35270

    Args:
      log_lines: an iterable of log line strings.

    Returns:
      a dict with ports available in log_lines. For example,
         {}  # no ports found
         {'http': 1234, 'https': 2345}
    """
    ports = {}
    for line in log_lines:
      m = ReplayServer._PORT_RE.match(line.strip())
      if m:
        ports[m.group('protocol')] = int(m.group('port'))
    return ports

  def StartServer(self):
    """Start Web Page Replay and verify that it started.

    Returns:
      A dictionary mapping the keys 'http' and 'https' to the respective
      ports of the replay server.
    Raises:
      ReplayNotStartedError: if Replay start-up fails.
      Exception: whatever launching the subprocess raised (e.g. OSError when
          the binary cannot be executed); the temporary log file is cleaned
          up before the exception propagates.
    """
    is_posix = sys.platform.startswith('linux') or sys.platform == 'darwin'
    logging.info('Starting Web-Page-Replay: %s', self._cmd_line)
    self._CreateTempLogFilePath()
    try:
      with self._OpenLogFile() as log_fh:
        self.replay_process = subprocess.Popen(
            self._cmd_line, stdout=log_fh, stderr=subprocess.STDOUT,
            preexec_fn=(_ResetInterruptHandler if is_posix else None))
    except Exception:  # pylint: disable=broad-except
      # The process never launched. Drop the temp log file so it is not
      # leaked and so a later StartServer() does not trip the assertion in
      # _CreateTempLogFilePath(); then let the original error propagate.
      self.StopServer(logging.ERROR)
      raise
    try:
      # TODO(crbug.com/805418): consider changing this to wait with I/O timeout.
      # The 120s timeout is based on past failures (e.g: crbug.com/812639).
      py_utils.WaitFor(self._IsStarted, timeout=120)
      logging.info('WPR ports: %s', self._started_ports)
      atexit_with_log.Register(self.StopServer)
      return dict(self._started_ports)
    except Exception:  # pylint: disable=broad-except
      # Record the underlying cause before it is replaced by the
      # module-specific error below.
      logging.exception('Web Page Replay did not start')
      self.StopServer(logging.ERROR)
      raise ReplayNotStartedError('Web Page Replay failed to start.')

  def _IsReplayProcessStarted(self):
    """Returns True if the replay subprocess exists and has not exited."""
    if not self.replay_process:
      return False
    return self.replay_process.poll() is None

  def StopServer(self, log_level=logging.DEBUG):
    """Stop Web Page Replay.

    This also logs the stdout/stderr output of the wpr process at log_level
    (see _CleanUpTempLogFilePath()) and removes the temporary log file.

    Args:
      log_level: the logging level at which the WPR log is emitted.
    """
    if self._IsReplayProcessStarted():
      self._StopReplayProcess()
    self._CleanUpTempLogFilePath(log_level)
    self._started_ports = {}

  def _StopReplayProcess(self):
    """Stops the replay subprocess, as gracefully as possible.

    Asks replay to exit over HTTP, waits up to 10 seconds, then falls back
    to SIGINT and finally terminate().
    """
    if not self.replay_process:
      return
    logging.debug('Trying to stop Web-Page-Replay gracefully')
    try:
      if self._started_ports:
        self._UrlOpen('web-page-replay-command-exit').close()
    except IOError:
      # IOError is possible because the server might exit without response.
      pass
    try:
      py_utils.WaitFor(lambda: self.replay_process.poll() is not None, 10)
    except py_utils.TimeoutException:
      try:
        # Use a SIGINT so that it can do graceful cleanup.
        self.replay_process.send_signal(signal.SIGINT)
      except Exception:  # pylint: disable=broad-except
        # On Windows, we are left with no other option than terminate().
        is_primary_nameserver_changed_by_replay = (
            self._replay_host == '127.0.0.1')
        if is_primary_nameserver_changed_by_replay:
          # Replay changes the DNS nameserver configuration so that DNS
          # requests are resolved by replay's own DNS server. It resolves
          # all DNS requests to its own IP address so it can serve the
          # HTTP and HTTPS requests.
          # If the replay host is not '127.0.0.1', then replay skips the
          # nameserver change because it assumes a different mechanism
          # will be used to route DNS requests to replay's DNS server.
          logging.warning(
              'Unable to stop Web-Page-Replay gracefully.\n'
              'Replay changed the DNS nameserver configuration to make replay '
              'the primary nameserver. That might not be restored!')
        self.replay_process.terminate()
      self.replay_process.communicate()
    finally:
      self.replay_process = None

  def _CreateTempLogFilePath(self):
    """Creates an empty temporary file to receive the replay output."""
    assert self._temp_log_file_path is None
    handle, self._temp_log_file_path = tempfile.mkstemp()
    os.close(handle)

  def _CleanUpTempLogFilePath(self, log_level):
    """Emits the WPR log (if wanted) and deletes the temporary log file.

    The log is sent to the root logger when it is enabled for log_level.
    Otherwise it is printed if USE_LOCAL_WPR was requested, and dropped in
    every other case. The file is removed even if emitting the log fails.

    Args:
      log_level: the logging level at which to emit the WPR log.

    Returns:
      '' when there is no log file to clean up, otherwise None.
    """
    if not self._temp_log_file_path:
      return ''
    log_path = self._temp_log_file_path
    try:
      log_enabled = logging.getLogger('').isEnabledFor(log_level)
      if log_enabled or USE_LOCAL_WPR in self._replay_options:
        with open(log_path, 'r') as f:
          wpr_log_output = f.read()
        output = ('************************** WPR LOG *****************************\n' +
                  wpr_log_output +
                  '************************** END OF WPR LOG **********************')
        if log_enabled:
          logging.log(log_level, output)
        else:
          print(output)
    finally:
      # Forget the path first so that a failed removal cannot leave this
      # object permanently unable to start again.
      self._temp_log_file_path = None
      os.remove(log_path)

  def __enter__(self):
    """Add support for with-statement."""
    self.StartServer()
    return self

  def __exit__(self, unused_exc_type, unused_exc_val, unused_exc_tb):
    """Add support for with-statement."""
    self.StopServer()

  def _UrlOpen(self, url_path, protocol='http'):
    """Open a Replay URL.

    For matching requests in the archive, Replay relies on the "Host:" header.
    For Replay command URLs, the "Host:" header is not needed.

    Args:
      url_path: WPR server request path.
      protocol: 'http' or 'https'
    Returns:
      a file-like object from urllib.urlopen
    Raises:
      KeyError: if the port for protocol has not been discovered yet.
    """
    url = '%s://%s:%s/%s' % (
        protocol, self._replay_host, self._started_ports[protocol], url_path)
    # proxies={} makes the request go straight to replay, bypassing any
    # proxy configured in the environment.
    return urllib.urlopen(url, proxies={})
424
def _ResetInterruptHandler():
  """Restore the default SIGINT disposition in the current process.

  ReplayServer passes this as the subprocess preexec_fn on posix platforms.
  Replay is normally stopped with an HTTP request
  ('web-page-replay-command-exit'); that graceful exit matters because it
  lets replay restore the DNS configuration. When the request fails, the
  fallback is to send SIGINT to the process.

  Signal dispositions on Linux are inherited from the parent process. If
  whatever invoked us ignores SIGINT (e.g. a shell script that put the
  command in the background), the SIGINT fallback would fail to terminate
  the child. Resetting the handler here avoids that; see
  https://crbug.com/254572.
  """
  default_action = signal.SIG_DFL
  signal.signal(signal.SIGINT, default_action)
443