1# -*- coding: utf-8 -*- 2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Basic infrastructure for implementing retries.""" 7 8from __future__ import print_function 9 10import functools 11import random 12import re 13import sys 14import time 15 16import six 17 18from autotest_lib.utils.frozen_chromite.lib import cros_build_lib 19from autotest_lib.utils.frozen_chromite.lib import cros_logging as logging 20 21 22# Match stderr of curl's --fail option to see HTTP status code. 23CURL_STATUS_RE = re.compile(br'The requested URL returned error: (\d+) ') 24 25 26def _CreateExceptionRetryHandler(exception): 27 """Returns a retry handler for given exception(s). 28 29 Please see WithRetry class document for details. 30 """ 31 if not (isinstance(exception, type) and issubclass(exception, Exception) or 32 (isinstance(exception, tuple) and 33 all(issubclass(e, Exception) for e in exception))): 34 raise TypeError('exceptions should be an exception (or tuple), not %r' % 35 exception) 36 return lambda exc: isinstance(exc, exception) 37 38 39class _RetryDelayStrategy(object): 40 """The strategy of the delay between each retry attempts. 41 42 Please see WithRetry class document for details. 43 """ 44 45 def __init__(self, sleep=0, backoff_factor=1, jitter=0): 46 if sleep < 0: 47 raise ValueError('sleep must be >= 0: %s' % sleep) 48 49 if backoff_factor < 1: 50 raise ValueError('backoff_factor must be 1 or greater: %s' 51 % backoff_factor) 52 53 if jitter < 0: 54 raise ValueError('jitter must be >= 0: %s' % jitter) 55 56 self._sleep = sleep 57 self._backoff_factor = backoff_factor 58 self._jitter = jitter 59 60 def Sleep(self, attempt): 61 """Sleep to delay the current retry.""" 62 assert attempt >= 1, 'Expect attempt is always positive: %s' % attempt 63 if self._backoff_factor > 1: 64 sleep_duration = self._sleep * self._backoff_factor ** (attempt - 1) 65 else: 66 sleep_duration = self._sleep * attempt 67 68 # If |jitter| is set, add a random jitter sleep. 69 jitter = random.uniform(.5 * self._jitter, 1.5 * self._jitter) 70 total = sleep_duration + jitter 71 if total: 72 logging.debug('Retrying in %f (%f + jitter %f) seconds ...', 73 total, sleep_duration, jitter) 74 time.sleep(total) 75 76 77class WithRetry(object): 78 """Decorator to handle retry on exception. 79 80 Examples: 81 @WithRetry(max_retry=3) 82 def _run(): 83 ... do something ... 84 _run() 85 86 If _run() raises an exception, it retries at most three times. 87 88 Retrying strategy. 89 90 If the decorated function throws an Exception instance, then this class 91 checks whether the retry should be continued or not based on the given 92 |handler| or |exception| as follows. 93 - If |handler| is given, which should be a callback which takes an exception 94 and returns bool, calls it with the thrown exception. 95 If the |handler| returns True, retry will be continued. Otherwise no 96 further retry will be made, and an exception will be raised. 97 - If |exception| is given, which is an exception class or a tuple of 98 exception classes, iff the thrown exception is a instance of the given 99 exception class(es) (or its subclass), continues to retry. Otherwise no 100 further retry will be made, and an exception will be raised. 101 - If neither is given, just continues to retry on any Exception instance. 102 - Note: it is not allowed to specify both |handler| and |exception| at once. 103 104 Delay strategy. 105 106 Between for each attempt, some delay can be set, as follows. 107 - If |sleep| is given, the delay between the first and second attempts is 108 |sleep| secs. 109 - The delay between the second and third attempts, and later, depends on 110 |sleep| and |backoff_factor|. 111 - If |backoff_factor| is not given, the delay will be linearly increased, 112 as |sleep| * (number of attempts). E.g., if |sleep| is 1, the delays 113 will be 1, 2, 3, 4, 5, ... and so on. 114 - If |backoff_factor| is given, the delay will be exponentially increased, 115 as |sleep| * |backoff_factor| ** (number of attempts - 1). E.g., if 116 |sleep| is 1, and |backoff_factor| is 2, the delay will be, 117 1, 2, 4, 8, 16, ... and so on 118 - Note: Keep in mind that, if |backoff_factor| is not given, the total 119 delay time will be triangular value of |max_retry| multiplied by the 120 |sleep| value. E.g., |max_retry| is 5, and |sleep| is 10, will be 121 T5 (i.e. 5 + 4 + 3 + 2 + 1) times 10 = 150 seconds total. Rather than 122 use a large sleep value, you should lean more towards large retries 123 and lower sleep intervals, or by utilizing |backoff_factor|. 124 - In addition, for each delay, random duration of the delay can be added, 125 as 'jitter'. (Often, this helps to avoid consecutive conflicting situation) 126 |jitter| is specifies the duration of jitter delay, randomized up to 127 50% in either direction. 128 """ 129 130 def __init__(self, 131 max_retry, handler=None, exception=None, log_all_retries=False, 132 sleep=0, backoff_factor=1, jitter=0, 133 raise_first_exception_on_failure=True, exception_to_raise=None, 134 status_callback=None): 135 """Initialize. 136 137 Args: 138 max_retry: A positive integer representing how many times to retry the 139 command before giving up. Worst case, the command is invoked 140 (max_retry + 1) times before failing. 141 handler: Please see above for details. 142 exception: Please see above for details. 143 log_all_retries: when True, logs all retries. 144 sleep: Please see above for details. 145 backoff_factor: Please see above for details. 146 jitter: Please see above for details. 147 raise_first_exception_on_failure: determines which excecption is raised 148 upon failure after retries. If True, the first exception that was 149 encountered. Otherwise, the final one. 150 exception_to_raise: Optional exception type. If given, raises its 151 instance, instead of the one raised from the retry body. 152 status_callback: Optional callback invoked after each call of |functor|. 153 It takes two arguments: |attempt| which is the index of the last 154 attempt (0-based), and |success| representing whether the last attempt 155 was successfully done or not. If the callback raises an exception, no 156 further retry will be made, and the exception will be propagated to 157 the caller. 158 """ 159 if max_retry < 0: 160 raise ValueError('max_retry needs to be zero or more: %d' % max_retry) 161 self._max_retry = max_retry 162 163 if handler is not None and exception is not None: 164 raise ValueError('handler and exception cannot be specified at once') 165 self._handler = ( 166 handler or _CreateExceptionRetryHandler(exception or Exception)) 167 168 self._log_all_retries = log_all_retries 169 self._retry_delay = _RetryDelayStrategy(sleep, backoff_factor, jitter) 170 self._raise_first_exception_on_failure = raise_first_exception_on_failure 171 self._exception_to_raise = exception_to_raise 172 self._status_callback = status_callback or (lambda attempt, success: None) 173 174 def __call__(self, func): 175 @functools.wraps(func) 176 def _Wrapper(*args, **kwargs): 177 fname = getattr(func, '__qualname__', 178 getattr(func, '__name__', '<nameless>')) 179 exc_info = None 180 for attempt in range(self._max_retry + 1): 181 if attempt: 182 self._retry_delay.Sleep(attempt) 183 184 if attempt and self._log_all_retries: 185 logging.debug('Retrying %s (attempt %d)', fname, attempt + 1) 186 187 try: 188 ret = func(*args, **kwargs) 189 except Exception as e: 190 # Note we're not snagging BaseException, so 191 # MemoryError/KeyboardInterrupt and friends don't enter this except 192 # block. 193 194 # If raise_first_exception_on_failure, we intentionally ignore 195 # any failures in later attempts since we'll throw the original 196 # failure if all retries fail. 197 if exc_info is None or not self._raise_first_exception_on_failure: 198 exc_info = sys.exc_info() 199 200 try: 201 self._status_callback(attempt, False) 202 except Exception: 203 # In case callback raises an exception, quit the retry. 204 # For further investigation, log the original exception here. 205 logging.error('Ending retry due to Exception raised by a callback. ' 206 'Original exception raised during the attempt is ' 207 'as follows: ', 208 exc_info=exc_info) 209 # Reraise the exception raised from the status_callback. 210 raise 211 212 if not self._handler(e): 213 logging.debug('ending retries with error: %s(%s)', e.__class__, e) 214 break 215 logging.exception('func call has failed') 216 else: 217 # Run callback in outside of try's main block, in order to avoid 218 # accidental capture of an Exception which may be raised in callback. 219 self._status_callback(attempt, True) 220 return ret 221 222 # Did not return, meaning all attempts failed. Raise the exception. 223 if self._exception_to_raise: 224 raise self._exception_to_raise('%s: %s' % (exc_info[0], exc_info[1])) 225 six.reraise(exc_info[0], exc_info[1], exc_info[2]) 226 return _Wrapper 227 228 229def GenericRetry(handler, max_retry, functor, *args, **kwargs): 230 """Generic retry loop w/ optional break out depending on exceptions. 231 232 Runs functor(*args, **(kwargs excluding params for retry)) as a retry body. 233 234 Please see WithRetry for details about retrying parameters. 235 """ 236 # Note: the default value needs to be matched with the ones of WithRetry's 237 # ctor. 238 log_all_retries = kwargs.pop('log_all_retries', False) 239 delay_sec = kwargs.pop('delay_sec', 0) 240 sleep = kwargs.pop('sleep', 0) 241 backoff_factor = kwargs.pop('backoff_factor', 1) 242 status_callback = kwargs.pop('status_callback', None) 243 raise_first_exception_on_failure = kwargs.pop( 244 'raise_first_exception_on_failure', True) 245 exception_to_raise = kwargs.pop('exception_to_raise', None) 246 247 @WithRetry( 248 max_retry=max_retry, handler=handler, log_all_retries=log_all_retries, 249 sleep=sleep, backoff_factor=backoff_factor, jitter=delay_sec, 250 raise_first_exception_on_failure=raise_first_exception_on_failure, 251 exception_to_raise=exception_to_raise, 252 status_callback=status_callback) 253 def _run(): 254 return functor(*args, **kwargs) 255 return _run() 256 257 258def RetryException(exception, max_retry, functor, *args, **kwargs): 259 """Convenience wrapper for GenericRetry based on exceptions. 260 261 Runs functor(*args, **(kwargs excluding params for retry)) as a retry body. 262 263 Please see WithRetry for details about retrying parameters. 264 """ 265 log_all_retries = kwargs.pop('log_all_retries', False) 266 delay_sec = kwargs.pop('delay_sec', 0) 267 sleep = kwargs.pop('sleep', 0) 268 backoff_factor = kwargs.pop('backoff_factor', 1) 269 status_callback = kwargs.pop('status_callback', None) 270 raise_first_exception_on_failure = kwargs.pop( 271 'raise_first_exception_on_failure', True) 272 exception_to_raise = kwargs.pop('exception_to_raise', None) 273 274 @WithRetry( 275 max_retry=max_retry, exception=exception, 276 log_all_retries=log_all_retries, 277 sleep=sleep, backoff_factor=backoff_factor, jitter=delay_sec, 278 raise_first_exception_on_failure=raise_first_exception_on_failure, 279 exception_to_raise=exception_to_raise, 280 status_callback=status_callback) 281 def _run(): 282 return functor(*args, **kwargs) 283 return _run() 284 285 286def RetryCommand(functor, max_retry, *args, **kwargs): 287 """Wrapper for run that will retry a command. 288 289 Args: 290 functor: run function to run; retries will only occur on 291 RunCommandError exceptions being thrown. 292 max_retry: A positive integer representing how many times to retry 293 the command before giving up. Worst case, the command is invoked 294 (max_retry + 1) times before failing. 295 sleep: Optional keyword. Multiplier for how long to sleep between 296 retries; will delay (1*sleep) the first time, then (2*sleep), 297 continuing via attempt * sleep. 298 retry_on: If provided, we will retry on any exit codes in the given list. 299 Note: A process will exit with a negative exit code if it is killed by a 300 signal. By default, we retry on all non-negative exit codes. 301 error_check: Optional callback to check the error output. Return None to 302 fall back to |retry_on|, or True/False to set the retry directly. 303 log_retries: Whether to log a warning when retriable errors occur. 304 args: Positional args passed to run; see run for specifics. 305 kwargs: Optional args passed to run; see run for specifics. 306 307 Returns: 308 A CommandResult object. 309 310 Raises: 311 RunCommandError: Raised on error. 312 """ 313 values = kwargs.pop('retry_on', None) 314 error_check = kwargs.pop('error_check', lambda x: None) 315 log_retries = kwargs.pop('log_retries', True) 316 317 def ShouldRetry(exc): 318 """Return whether we should retry on a given exception.""" 319 if not ShouldRetryCommandCommon(exc): 320 return False 321 if values is None and exc.result.returncode < 0: 322 logging.info('Child process received signal %d; not retrying.', 323 -exc.result.returncode) 324 return False 325 326 ret = error_check(exc) 327 if ret is not None: 328 return ret 329 330 if values is None or exc.result.returncode in values: 331 if log_retries: 332 logging.warning('Command failed with retriable error.\n%s', exc) 333 return True 334 return False 335 336 return GenericRetry(ShouldRetry, max_retry, functor, *args, **kwargs) 337 338 339def ShouldRetryCommandCommon(exc): 340 """Returns whether any run should retry on a given exception.""" 341 if not isinstance(exc, cros_build_lib.RunCommandError): 342 return False 343 if exc.result.returncode is None: 344 logging.error('Child process failed to launch; not retrying:\n' 345 'command: %s', exc.result.cmdstr) 346 return False 347 return True 348 349 350def RunCommandWithRetries(max_retry, *args, **kwargs): 351 """Wrapper for run that will retry a command 352 353 Args: 354 max_retry: See RetryCommand and run. 355 *args: See RetryCommand and run. 356 **kwargs: See RetryCommand and run. 357 358 Returns: 359 A CommandResult object. 360 361 Raises: 362 RunCommandError: Raised on error. 363 """ 364 return RetryCommand(cros_build_lib.run, max_retry, *args, **kwargs) 365 366 367class DownloadError(Exception): 368 """Fetching file via curl failed""" 369 370 371def RunCurl(curl_args, *args, **kwargs): 372 """Runs curl and wraps around all necessary hacks. 373 374 Args: 375 curl_args: Command line to pass to curl. Must be list of str. 376 *args, **kwargs: See RunCommandWithRetries and run. 377 Note that retry_on, error_check, sleep, backoff_factor cannot be 378 overwritten. 379 380 Returns: 381 A CommandResult object. 382 383 Raises: 384 DownloadError: Whenever curl fails for any reason. 385 """ 386 cmd = ['curl'] + curl_args 387 388 # These values were discerned via scraping the curl manpage; they're all 389 # retry related (dns failed, timeout occurred, etc, see the manpage for 390 # exact specifics of each). 391 # Note we allow 22 to deal w/ 500's- they're thrown by google storage 392 # occasionally. This is also thrown when getting 4xx, but curl doesn't 393 # make it easy to differentiate between them. 394 # Note we allow 35 to deal w/ Unknown SSL Protocol error, thrown by 395 # google storage occasionally. 396 # Finally, we do not use curl's --retry option since it generally doesn't 397 # actually retry anything; code 18 for example, it will not retry on. 398 retriable_exits = frozenset([5, 6, 7, 15, 18, 22, 26, 28, 35, 52, 56]) 399 400 def _CheckExit(exc): 401 """Filter out specific error codes when getting exit 22 402 403 Curl will exit(22) for a wide range of HTTP codes -- both the 4xx and 5xx 404 set. For the 4xx, we don't want to retry. We have to look at the output. 405 """ 406 assert isinstance(exc, cros_build_lib.RunCommandError) 407 if exc.result.returncode == 22: 408 logging.debug('curl stderr %s', exc.result.error) 409 matched = CURL_STATUS_RE.search(exc.result.error) 410 if not matched: 411 # Unexpected stderr. It may not be error output from --fail. 412 return True 413 status_code = matched.group(1) 414 return not status_code.startswith(b'4') 415 416 # We'll let the common exit code filter do the right thing. 417 return None 418 419 try: 420 return RunCommandWithRetries( 421 10, cmd, retry_on=retriable_exits, error_check=_CheckExit, 422 sleep=3, backoff_factor=1.6, 423 stderr=True, extra_env={'LC_MESSAGES': 'C'}, *args, **kwargs) 424 except cros_build_lib.RunCommandError as e: 425 if e.result.returncode in (51, 58, 60): 426 # These are the return codes of failing certs as per 'man curl'. 427 raise DownloadError( 428 'Download failed with certificate error? Try "sudo c_rehash".') 429 raise DownloadError('Curl failed w/ exit code %i: %s' % 430 (e.result.returncode, e.result.error)) 431