# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Start and stop Web Page Replay."""

import logging
import os
import re
import signal
import subprocess
import sys
import tempfile
import urllib

import py_utils
from py_utils import atexit_with_log
from py_utils import binary_manager

_WPR_DIR = os.path.abspath(os.path.join(
    py_utils.GetCatapultDir(), 'web_page_replay_go'))

TELEMETRY_PROJECT_CONFIG = os.path.join(
    py_utils.GetCatapultDir(), 'telemetry', 'telemetry',
    'binary_dependencies.json')

CHROME_BINARY_CONFIG = os.path.join(
    py_utils.GetCatapultDir(), 'common', 'py_utils', 'py_utils',
    'chrome_binaries.json')

RECORD = '--record'
INJECT_SCRIPTS = '--inject_scripts='
USE_LOCAL_WPR = '--use-local-wpr'
DISABLE_FUZZY_URL_MATCHING = '--disable-fuzzy-url-matching'

# The only option strings accepted by ReplayServer._GetCommandLine.
_VALID_REPLAY_OPTIONS = (
    RECORD, INJECT_SCRIPTS, USE_LOCAL_WPR, DISABLE_FUZZY_URL_MATCHING)

# Matches the "Starting server on <protocol>://<host>:<port>" log lines.
_PORT_RE = re.compile(
    r'.*Starting server on '
    r'(?P<protocol>http|https)://'
    r'(?P<host>[^:]*):'
    r'(?P<port>\d+)')


class ReplayError(Exception):
  """Catch-all exception for the module."""
  pass


class ReplayNotFoundError(ReplayError):
  """Raised when a path required by the replay server does not exist."""

  def __init__(self, label, path):
    """Create a ReplayNotFoundError instance.

    Args:
      label: A string of label of this error.
      path: A string of the path in this error.
    """
    # Exception.__init__ stores its positional arguments in self.args.
    super(ReplayNotFoundError, self).__init__(label, path)

  def __str__(self):
    label, path = self.args
    return 'Path does not exist for %s: %s' % (label, path)


class ReplayNotStartedError(ReplayError):
  """Raised by ReplayServer.StartServer when start-up fails."""
  pass


class ReplayServer(object):
  """Start and Stop Web Page Replay.

  Web Page Replay is a proxy that can record and "replay" web pages with
  simulated network characteristics -- without having to edit the pages
  by hand. With WPR, tests can use "real" web content, and catch
  performance issues that may result from introducing network delays and
  bandwidth throttling.

  This class could be used as a context manager.

  Example:
    with ReplayServer(archive_path, replay_host, http_port, https_port,
                      replay_options):
      self.NavigateToURL(start_url)
      self.WaitUntil(...)
  """

  # Class-wide path of the wpr_go binary; shared by all instances once it
  # has been set via SetGoBinaryPath() or fetched by a downloader.
  _go_binary_path = None

  def __init__(self, archive_path, replay_host, http_port, https_port,
               replay_options, binary_downloader=None):
    """Initialize ReplayServer.

    Args:
      archive_path: a path to a specific WPR archive.
      replay_host: the hostname to serve traffic.
      http_port: an integer port on which to serve HTTP traffic. May be zero
          to let the OS choose an available port.
      https_port: an integer port on which to serve HTTPS traffic. May be zero
          to let the OS choose an available port.
      replay_options: an iterable of option strings. Each must be one of
          RECORD, INJECT_SCRIPTS, USE_LOCAL_WPR or DISABLE_FUZZY_URL_MATCHING.
      binary_downloader: a function to be used to fetch binary. May be None to
          use binary_manager.BinaryManager(...).FetchPath as default
          downloader.

    Raises:
      ValueError: if replay_options contains an unsupported option.
      ReplayNotFoundError: if the archive file (replay mode) or the archive
          directory (record mode) does not exist.
    """
    self.archive_path = archive_path
    self._replay_host = replay_host
    self._started_ports = {}  # a dict such as {'http': 80, 'https': 443}

    # A temporary path for storing stdout & stderr of the webpagereplay
    # subprocess.
    self._temp_log_file_path = None

    self._downloader = binary_downloader
    self._replay_options = replay_options
    self._cmd_line = self._GetCommandLine(
        self._GetGoBinaryPath(replay_options), http_port, https_port,
        replay_options, archive_path)

    if RECORD in replay_options or 'record' in replay_options:
      # When recording, the archive is created by WPR, so only its directory
      # has to exist. A bare file name has an empty dirname, which denotes
      # the current directory.
      self._AssertPathExists(
          'archive directory',
          os.path.dirname(self.archive_path) or os.curdir)
    else:
      self._AssertPathExists('archive file', self.archive_path)

    self.replay_process = None

  def _GetDownloader(self):
    """Gets the downloader used to download wpr_go binary from GCS."""
    if ReplayServer._go_binary_path:
      # If the _go_binary_path was already set, then no need to use downloader
      # to download via binary_manager.
      self._downloader = None
    elif not self._downloader:
      configs = [CHROME_BINARY_CONFIG, TELEMETRY_PROJECT_CONFIG]
      self._downloader = binary_manager.BinaryManager(configs).FetchPath
    return self._downloader

  def _GetGoBinaryPath(self, replay_options):
    """Gets the _go_binary_path if it is already set, or downloads it.

    When USE_LOCAL_WPR is in replay_options, wpr.go is built with `go build`
    inside <_WPR_DIR>/src and the path of the resulting binary is returned
    instead; the process exits with status 1 if that build fails.

    Args:
      replay_options: the option strings passed to the constructor.

    Returns:
      The path of the WPR binary to execute.

    Raises:
      RuntimeError: if no binary path is set and no downloader is available.
    """
    if USE_LOCAL_WPR in replay_options:
      # Build WPR. Run the build via cwd= rather than os.chdir() so that the
      # working directory of this process is never modified.
      go_folder = os.path.join(_WPR_DIR, 'src')
      try:
        print(subprocess.check_output(
            ['go', 'build', os.path.join(go_folder, 'wpr.go')],
            cwd=go_folder))
      except subprocess.CalledProcessError as e:
        logging.error('Building local WPR failed with exit code %s:\n%s',
                      e.returncode, e.output)
        sys.exit(1)
      return os.path.join(go_folder, 'wpr')

    if not ReplayServer._go_binary_path:
      downloader = self._GetDownloader()
      if not downloader:
        raise RuntimeError('downloader should not be None '
                           'while _go_binary_path is None')
      ReplayServer._go_binary_path = downloader(
          'wpr_go', py_utils.GetHostOsName(), py_utils.GetHostArchName())
    return ReplayServer._go_binary_path

  @classmethod
  def SetGoBinaryPath(cls, go_binary_path):
    """Overrides the _go_binary_path.

    This allows the server to use WPRGO files retrieved from somewhere
    other than GCS via binary_manager, such as test isolation.

    For chromium project to use WPR, it is encouraged to use test isolation,
    and therefore should call SetGoBinaryPath to set _go_binary_path.

    For Catapult/Telemetry project, the tradition is to download wpr_go
    binary via binary_manager. So do not call SetGoBinaryPath.

    Args:
      go_binary_path: path of an existing WPR binary.

    Raises:
      ValueError: if go_binary_path does not exist.
    """
    if not os.path.exists(go_binary_path):
      raise ValueError('SetGoBinaryPath could not set {} as it does not exist'
                       .format(go_binary_path))
    cls._go_binary_path = go_binary_path

  @property
  def http_port(self):
    """The HTTP port parsed from the WPR log (KeyError if not started)."""
    return self._started_ports['http']

  @property
  def https_port(self):
    """The HTTPS port parsed from the WPR log (KeyError if not started)."""
    return self._started_ports['https']

  @staticmethod
  def _GetCommandLine(go_binary_path, http_port, https_port,
                      options, archive_path):
    """Set WPR command-line arguments. Can be overridden if needed.

    Keyword arguments:

    * go_binary_path: A string of the path to the wpr.go binary.
    * http_port: A decimal of the port that handles http requests.
    * https_port: A decimal of the port that handles https requests.
    * options: A list of options, such as '--record',
               '--inject_scripts=', etc.
    * archive_path: A string of the path to the archive file.

    Returns:
      The command line as a list of strings.

    Raises:
      ValueError: if options contains an unsupported option.
    """
    bad_options = [
        option for option in options if option not in _VALID_REPLAY_OPTIONS]
    if bad_options:
      raise ValueError("Invalid replay options %s" % bad_options)

    cmd_line = [go_binary_path]
    if RECORD in options:
      cmd_line.append('record')
    else:
      cmd_line.append('replay')
      if DISABLE_FUZZY_URL_MATCHING in options:
        cmd_line.append('--disable_fuzzy_url_matching')
    key_file = os.path.join(_WPR_DIR, 'wpr_key.pem')
    cert_file = os.path.join(_WPR_DIR, 'wpr_cert.pem')
    inject_script = os.path.join(_WPR_DIR, 'deterministic.js')
    cmd_line.extend([
        '--http_port=%s' % http_port,
        '--https_port=%s' % https_port,
        '--https_key_file=%s' % key_file,
        '--https_cert_file=%s' % cert_file])
    if INJECT_SCRIPTS in options:
      # Forward the flag with an empty value instead of the default script.
      cmd_line.append(INJECT_SCRIPTS)
    else:
      cmd_line.append('--inject_scripts=%s' % inject_script)
    cmd_line.append(archive_path)
    return cmd_line

  def _AssertPathExists(self, label, path):
    """Raises ReplayNotFoundError(label, path) if path does not exist."""
    if not os.path.exists(path):
      raise ReplayNotFoundError(label, path)

  def _OpenLogFile(self):
    """Opens the log file for writing."""
    log_dir = os.path.dirname(self._temp_log_file_path)
    if not os.path.isdir(log_dir):
      os.makedirs(log_dir)
    return open(self._temp_log_file_path, 'w')

  def _LogLines(self):
    """Yields any log lines that have been written to disk."""
    if (not self._temp_log_file_path or
        not os.path.isfile(self._temp_log_file_path)):
      yield '(N/A)'
      return
    with open(self._temp_log_file_path) as f:
      for line in f:
        yield line

  def _IsStarted(self):
    """Returns true if the server is up and running."""
    if not self._IsReplayProcessStarted():
      return False

    def HasIncompleteStartedPorts():
      return ('http' not in self._started_ports or
              'https' not in self._started_ports)

    if HasIncompleteStartedPorts():
      self._started_ports = self._ParseLogFilePorts(self._LogLines())
    if HasIncompleteStartedPorts():
      return False

    try:
      # HTTPS may require SNI (which urllib does not speak), so only check
      # that HTTP responds.
      return self._UrlOpen('web-page-replay-generate-200').getcode() == 200
    except IOError:
      return False

  @staticmethod
  def _ParseLogFilePorts(log_lines):
    """Returns the ports on which replay listens as reported in its log file.

    Only matches HTTP and HTTPS. One call may return only some
    of the ports depending on what has been written to the log file.

    Example log lines:
        2014-09-03 17:04:27,978 Starting server on http://:51673
        2014-09-03 17:04:27,978 Starting server on https://:35270

    Args:
      log_lines: an iterable of log line strings.

    Returns:
      a dict with ports available in log_lines. For example,
         {}  # no ports found
         {'http': 1234, 'https': 2345}
    """
    ports = {}
    for line in log_lines:
      m = _PORT_RE.match(line.strip())
      if m:
        protocol = m.group('protocol').lower()
        ports[protocol] = int(m.group('port'))
    return ports

  def StartServer(self):
    """Start Web Page Replay and verify that it started.

    Returns:
      A dictionary mapping the keys 'http' and 'https' to the respective
      ports of the replay server.
    Raises:
      ReplayNotStartedError: if Replay start-up fails.
    """
    is_posix = sys.platform.startswith('linux') or sys.platform == 'darwin'
    logging.info('Starting Web-Page-Replay: %s', self._cmd_line)
    self._CreateTempLogFilePath()
    with self._OpenLogFile() as log_fh:
      self.replay_process = subprocess.Popen(
          self._cmd_line, stdout=log_fh, stderr=subprocess.STDOUT,
          preexec_fn=(_ResetInterruptHandler if is_posix else None))
    try:
      # TODO(crbug.com/805418): consider changing this to wait with I/O timeout.
      # The 120s timeout is based on past failures (e.g: crbug.com/812639).
      py_utils.WaitFor(self._IsStarted, timeout=120)
      logging.info('WPR ports: %s', self._started_ports)
      atexit_with_log.Register(self.StopServer)
      return dict(self._started_ports)
    except Exception:
      # Record the underlying failure before it is replaced by the
      # ReplayNotStartedError below.
      logging.exception('Web Page Replay did not start; stopping it.')
      self.StopServer(logging.ERROR)
      raise ReplayNotStartedError('Web Page Replay failed to start.')

  def _IsReplayProcessStarted(self):
    """Returns True if the WPR subprocess was launched and has not exited."""
    if not self.replay_process:
      return False
    return self.replay_process.poll() is None

  def StopServer(self, log_level=logging.DEBUG):
    """Stop Web Page Replay.

    This also emits the stdout/stderr log of the wpr process (see
    _CleanUpTempLogFilePath()), removes the temporary log file and forgets
    the started ports.

    Args:
      log_level: the logging level at which the WPR log is emitted.
    """
    if self._IsReplayProcessStarted():
      self._StopReplayProcess()
    self._CleanUpTempLogFilePath(log_level)
    self._started_ports = {}

  def _StopReplayProcess(self):
    """Stops the WPR subprocess, as gracefully as possible.

    First asks WPR to exit through its command URL, then falls back to
    SIGINT and finally to terminate().
    """
    if not self.replay_process:
      return
    logging.debug('Trying to stop Web-Page-Replay gracefully')
    try:
      if self._started_ports:
        self._UrlOpen('web-page-replay-command-exit').close()
    except IOError:
      # IOError is possible because the server might exit without response.
      pass
    try:
      py_utils.WaitFor(lambda: self.replay_process.poll() is not None, 10)
    except py_utils.TimeoutException:
      try:
        # Use a SIGINT so that it can do graceful cleanup.
        self.replay_process.send_signal(signal.SIGINT)
      except Exception:  # pylint: disable=broad-except
        # On Windows, we are left with no other option than terminate().
        is_primary_nameserver_changed_by_replay = (
            self._replay_host == '127.0.0.1')
        if is_primary_nameserver_changed_by_replay:
          # Replay changes the DNS nameserver configuration so that DNS
          # requests are resolved by replay's own DNS server. It resolves
          # all DNS requests to its own IP address so it can serve the
          # HTTP and HTTPS requests.
          # If the replay host is not '127.0.0.1', then replay skips the
          # nameserver change because it assumes a different mechanism
          # will be used to route DNS requests to replay's DNS server.
          logging.warning(
              'Unable to stop Web-Page-Replay gracefully.\n'
              'Replay changed the DNS nameserver configuration to make replay '
              'the primary nameserver. That might not be restored!')
        self.replay_process.terminate()
      self.replay_process.communicate()
    finally:
      self.replay_process = None

  def _CreateTempLogFilePath(self):
    """Creates an empty temporary file to receive the WPR output."""
    assert self._temp_log_file_path is None
    handle, self._temp_log_file_path = tempfile.mkstemp()
    os.close(handle)

  def _CleanUpTempLogFilePath(self, log_level):
    """Emits the WPR log, then deletes the temporary log file.

    The log is sent to logging at log_level when the root logger is enabled
    for that level; otherwise it is printed if USE_LOCAL_WPR was requested,
    and dropped in all other cases.

    Args:
      log_level: the logging level at which the WPR log is emitted.
    """
    if not self._temp_log_file_path:
      return
    try:
      is_level_enabled = logging.getLogger('').isEnabledFor(log_level)
      if is_level_enabled or USE_LOCAL_WPR in self._replay_options:
        with open(self._temp_log_file_path, 'r') as f:
          wpr_log_output = f.read()
        output = (
            '************************** WPR LOG *****************************\n'
            + wpr_log_output +
            '************************** END OF WPR LOG **********************')
        if is_level_enabled:
          logging.log(log_level, output)
        else:
          print(output)
    finally:
      # Always remove the file, even if reading or logging it failed.
      os.remove(self._temp_log_file_path)
      self._temp_log_file_path = None

  def __enter__(self):
    """Add support for with-statement."""
    self.StartServer()
    return self

  def __exit__(self, unused_exc_type, unused_exc_val, unused_exc_tb):
    """Add support for with-statement."""
    self.StopServer()

  def _UrlOpen(self, url_path, protocol='http'):
    """Open a Replay URL.

    For matching requests in the archive, Replay relies on the "Host:" header.
    For Replay command URLs, the "Host:" header is not needed.

    Args:
      url_path: WPR server request path.
      protocol: 'http' or 'https'
    Returns:
      a file-like object from urllib.urlopen
    """
    url = '%s://%s:%s/%s' % (
        protocol, self._replay_host, self._started_ports[protocol], url_path)
    # NOTE: urllib.urlopen is the Python 2 API. The empty proxies dict makes
    # the request bypass any configured proxy.
    return urllib.urlopen(url, proxies={})


def _ResetInterruptHandler():
  """Reset the interrupt handler back to the default.

  The replay process is stopped gracefully by making an HTTP request
  ('web-page-replay-command-exit'). The graceful exit is important for
  restoring the DNS configuration. If the HTTP request fails, the fallback
  is to send SIGINT to the process.

  On posix system, running this function before starting replay fixes a
  bug that shows up when Telemetry is run as a background command from a
  script. https://crbug.com/254572.

  Background: Signal masks on Linux are inherited from parent
  processes. If anything invoking us accidentally masks SIGINT
  (e.g. by putting a process in the background from a shell script),
  sending a SIGINT to the child will fail to terminate it.
  """
  signal.signal(signal.SIGINT, signal.SIG_DFL)