# -*- coding: utf-8 -*-
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Basic infrastructure for implementing retries."""

from __future__ import print_function

import functools
import random
import re
import sys
import time

import six

from autotest_lib.utils.frozen_chromite.lib import cros_build_lib
from autotest_lib.utils.frozen_chromite.lib import cros_logging as logging


# Match stderr of curl's --fail option to see HTTP status code.
CURL_STATUS_RE = re.compile(br'The requested URL returned error: (\d+) ')

def _CreateExceptionRetryHandler(exception):
  """Returns a retry handler for the given exception(s).

  Please see the WithRetry class documentation for details.
  """
  if not (isinstance(exception, type) and issubclass(exception, Exception) or
          (isinstance(exception, tuple) and
           all(issubclass(e, Exception) for e in exception))):
    raise TypeError('exceptions should be an exception (or tuple), not %r' %
                    exception)
  return lambda exc: isinstance(exc, exception)


class _RetryDelayStrategy(object):
  """The strategy for the delay between retry attempts.

  Please see the WithRetry class documentation for details.
  """

  def __init__(self, sleep=0, backoff_factor=1, jitter=0):
    if sleep < 0:
      raise ValueError('sleep must be >= 0: %s' % sleep)

    if backoff_factor < 1:
      raise ValueError('backoff_factor must be 1 or greater: %s'
                       % backoff_factor)

    if jitter < 0:
      raise ValueError('jitter must be >= 0: %s' % jitter)

    self._sleep = sleep
    self._backoff_factor = backoff_factor
    self._jitter = jitter

  def Sleep(self, attempt):
    """Sleep to delay the current retry."""
    assert attempt >= 1, 'Expect attempt is always positive: %s' % attempt
    if self._backoff_factor > 1:
      sleep_duration = self._sleep * self._backoff_factor ** (attempt - 1)
    else:
      sleep_duration = self._sleep * attempt

    # If |jitter| is set, add a random jitter sleep.
    jitter = random.uniform(.5 * self._jitter, 1.5 * self._jitter)
    total = sleep_duration + jitter
    if total:
      logging.debug('Retrying in %f (%f + jitter %f) seconds ...',
                    total, sleep_duration, jitter)
      time.sleep(total)

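# For illustration, a worked example of the Sleep() schedule above: with
# sleep=2, backoff_factor=1 and jitter=0, the delays before attempts 2, 3
# and 4 are 2, 4 and 6 seconds (linear); with sleep=2 and backoff_factor=2
# they are 2, 4 and 8 seconds (exponential).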

class WithRetry(object):
  """Decorator to handle retry on exception.

  Examples:
    @WithRetry(max_retry=3)
    def _run():
      ... do something ...
    _run()

    If _run() raises an exception, it retries at most three times.

  Retrying strategy.

  If the decorated function throws an Exception instance, this class checks
  whether the retry should continue based on the given |handler| or
  |exception| as follows.
  - If |handler| is given, it should be a callback which takes an exception
    and returns bool. It is called with the thrown exception.
    If the |handler| returns True, the retry continues. Otherwise no
    further retry is made, and an exception is raised.
  - If |exception| is given, which is an exception class or a tuple of
    exception classes, the retry continues iff the thrown exception is an
    instance of the given exception class(es) (or a subclass). Otherwise no
    further retry is made, and an exception is raised.
  - If neither is given, the retry continues on any Exception instance.
  - Note: it is not allowed to specify both |handler| and |exception| at once.

  Delay strategy.

  Between attempts, a delay can be inserted, as follows.
  - If |sleep| is given, the delay between the first and second attempts is
    |sleep| secs.
  - The delay between the second and third attempts, and later, depends on
    |sleep| and |backoff_factor|.
    - If |backoff_factor| is not given, the delay increases linearly, as
      |sleep| * (number of attempts). E.g., if |sleep| is 1, the delays
      will be 1, 2, 3, 4, 5, ... and so on.
    - If |backoff_factor| is given, the delay increases exponentially, as
      |sleep| * |backoff_factor| ** (number of attempts - 1). E.g., if
      |sleep| is 1 and |backoff_factor| is 2, the delays will be
      1, 2, 4, 8, 16, ... and so on.
  - Note: Keep in mind that, if |backoff_factor| is not given, the total
    delay time will be the triangular number of |max_retry| multiplied by
    the |sleep| value. E.g., with |max_retry| of 5 and |sleep| of 10, it
    will be T5 (i.e. 5 + 4 + 3 + 2 + 1) times 10 = 150 seconds total. Rather
    than using a large sleep value, lean towards more retries with lower
    sleep intervals, or utilize |backoff_factor|.
  - In addition, a random duration ('jitter') can be added to each delay.
    (Often, this helps to avoid repeated collisions.) |jitter| specifies the
    base duration of the jitter delay, randomized up to 50% in either
    direction.
  """

  def __init__(self,
               max_retry, handler=None, exception=None, log_all_retries=False,
               sleep=0, backoff_factor=1, jitter=0,
               raise_first_exception_on_failure=True, exception_to_raise=None,
               status_callback=None):
    """Initialize.

    Args:
      max_retry: A positive integer representing how many times to retry the
          command before giving up.  Worst case, the command is invoked
          (max_retry + 1) times before failing.
      handler: Please see above for details.
      exception: Please see above for details.
      log_all_retries: when True, logs all retries.
      sleep: Please see above for details.
      backoff_factor: Please see above for details.
      jitter: Please see above for details.
      raise_first_exception_on_failure: determines which exception is raised
          upon failure after retries. If True, the first exception that was
          encountered. Otherwise, the final one.
      exception_to_raise: Optional exception type. If given, raises its
          instance, instead of the one raised from the retry body.
      status_callback: Optional callback invoked after each call of the
          decorated function. It takes two arguments: |attempt|, which is the
          index of the last attempt (0-based), and |success|, representing
          whether the last attempt was successful. If the callback raises an
          exception, no further retry will be made, and the exception will be
          propagated to the caller.
    """
    if max_retry < 0:
      raise ValueError('max_retry needs to be zero or more: %d' % max_retry)
    self._max_retry = max_retry

    if handler is not None and exception is not None:
      raise ValueError('handler and exception cannot be specified at once')
    self._handler = (
        handler or _CreateExceptionRetryHandler(exception or Exception))

    self._log_all_retries = log_all_retries
    self._retry_delay = _RetryDelayStrategy(sleep, backoff_factor, jitter)
    self._raise_first_exception_on_failure = raise_first_exception_on_failure
    self._exception_to_raise = exception_to_raise
    self._status_callback = status_callback or (lambda attempt, success: None)

  def __call__(self, func):
    @functools.wraps(func)
    def _Wrapper(*args, **kwargs):
      fname = getattr(func, '__qualname__',
                      getattr(func, '__name__', '<nameless>'))
      exc_info = None
      for attempt in range(self._max_retry + 1):
        if attempt:
          self._retry_delay.Sleep(attempt)

        if attempt and self._log_all_retries:
          logging.debug('Retrying %s (attempt %d)', fname, attempt + 1)

        try:
          ret = func(*args, **kwargs)
        except Exception as e:
          # Note we're not snagging BaseException, so
          # MemoryError/KeyboardInterrupt and friends don't enter this except
          # block.

          # If raise_first_exception_on_failure, we intentionally ignore
          # any failures in later attempts since we'll throw the original
          # failure if all retries fail.
          if exc_info is None or not self._raise_first_exception_on_failure:
            exc_info = sys.exc_info()

          try:
            self._status_callback(attempt, False)
          except Exception:
            # In case callback raises an exception, quit the retry.
            # For further investigation, log the original exception here.
            logging.error('Ending retry due to Exception raised by a callback. '
                          'Original exception raised during the attempt is '
                          'as follows: ',
                          exc_info=exc_info)
            # Reraise the exception raised from the status_callback.
            raise

          if not self._handler(e):
            logging.debug('ending retries with error: %s(%s)', e.__class__, e)
            break
          logging.exception('func call has failed')
        else:
          # Run the callback outside of the try's main block in order to
          # avoid accidentally capturing an Exception which may be raised
          # in the callback.
          self._status_callback(attempt, True)
          return ret

      # Did not return, meaning all attempts failed. Raise the exception.
      if self._exception_to_raise:
        raise self._exception_to_raise('%s: %s' % (exc_info[0], exc_info[1]))
      six.reraise(exc_info[0], exc_info[1], exc_info[2])
    return _Wrapper

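# Example usage of WithRetry as a decorator (a minimal sketch; _FetchManifest
# and its IOError failure mode are hypothetical):
#
#   @WithRetry(max_retry=3, exception=IOError, sleep=1, backoff_factor=2,
#              log_all_retries=True)
#   def _FetchManifest():
#     ...  # fetch and return the manifest
#
#   manifest = _FetchManifest()
#
# An IOError raised by _FetchManifest() is retried up to three times, with
# roughly 1, 2 and 4 seconds between attempts; any other exception type ends
# the retries immediately and is re-raised.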

def GenericRetry(handler, max_retry, functor, *args, **kwargs):
  """Generic retry loop w/ optional break out depending on exceptions.

  Runs functor(*args, **(kwargs excluding params for retry)) as a retry body.

  Please see WithRetry for details about retrying parameters.
  """
  # Note: the default values need to match the ones of WithRetry's ctor.
  log_all_retries = kwargs.pop('log_all_retries', False)
  delay_sec = kwargs.pop('delay_sec', 0)
  sleep = kwargs.pop('sleep', 0)
  backoff_factor = kwargs.pop('backoff_factor', 1)
  status_callback = kwargs.pop('status_callback', None)
  raise_first_exception_on_failure = kwargs.pop(
      'raise_first_exception_on_failure', True)
  exception_to_raise = kwargs.pop('exception_to_raise', None)

  @WithRetry(
      max_retry=max_retry, handler=handler, log_all_retries=log_all_retries,
      sleep=sleep, backoff_factor=backoff_factor, jitter=delay_sec,
      raise_first_exception_on_failure=raise_first_exception_on_failure,
      exception_to_raise=exception_to_raise,
      status_callback=status_callback)
  def _run():
    return functor(*args, **kwargs)
  return _run()

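# Example usage of GenericRetry with a handler callback (a minimal sketch;
# _QueryBuildStatus and its failure mode are hypothetical):
#
#   def _ShouldRetry(exc):
#     return isinstance(exc, (IOError, OSError))
#
#   status = GenericRetry(_ShouldRetry, 5, _QueryBuildStatus, build_id,
#                         sleep=2, backoff_factor=2)
#
# Retry-specific keyword arguments (sleep, backoff_factor, delay_sec, ...)
# are popped off above, so only the remaining kwargs reach the functor.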

def RetryException(exception, max_retry, functor, *args, **kwargs):
  """Convenience wrapper for GenericRetry based on exceptions.

  Runs functor(*args, **(kwargs excluding params for retry)) as a retry body.

  Please see WithRetry for details about retrying parameters.
  """
  log_all_retries = kwargs.pop('log_all_retries', False)
  delay_sec = kwargs.pop('delay_sec', 0)
  sleep = kwargs.pop('sleep', 0)
  backoff_factor = kwargs.pop('backoff_factor', 1)
  status_callback = kwargs.pop('status_callback', None)
  raise_first_exception_on_failure = kwargs.pop(
      'raise_first_exception_on_failure', True)
  exception_to_raise = kwargs.pop('exception_to_raise', None)

  @WithRetry(
      max_retry=max_retry, exception=exception,
      log_all_retries=log_all_retries,
      sleep=sleep, backoff_factor=backoff_factor, jitter=delay_sec,
      raise_first_exception_on_failure=raise_first_exception_on_failure,
      exception_to_raise=exception_to_raise,
      status_callback=status_callback)
  def _run():
    return functor(*args, **kwargs)
  return _run()

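# Example usage of RetryException (a minimal sketch; DownloadArtifact and its
# arguments are hypothetical):
#
#   RetryException((IOError, OSError), 3, DownloadArtifact, url,
#                  sleep=10, log_all_retries=True)
#
# Only IOError/OSError (and their subclasses) trigger a retry; any other
# exception propagates immediately.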

def RetryCommand(functor, max_retry, *args, **kwargs):
  """Wrapper for run that will retry a command.

  Args:
    functor: run-style function to invoke; retries will only occur on
      RunCommandError exceptions being thrown.
    max_retry: A positive integer representing how many times to retry
      the command before giving up.  Worst case, the command is invoked
      (max_retry + 1) times before failing.
    sleep: Optional keyword.  Multiplier for how long to sleep between
      retries; will delay (1*sleep) the first time, then (2*sleep),
      continuing via attempt * sleep.
    retry_on: If provided, we will retry on any exit codes in the given list.
      Note: A process will exit with a negative exit code if it is killed by a
      signal. By default, we retry on all non-negative exit codes.
    error_check: Optional callback to check the error output.  Return None to
      fall back to |retry_on|, or True/False to set the retry directly.
    log_retries: Whether to log a warning when retriable errors occur.
    args: Positional args passed to run; see run for specifics.
    kwargs: Optional args passed to run; see run for specifics.

  Returns:
    A CommandResult object.

  Raises:
    RunCommandError: Raised on error.
  """
  values = kwargs.pop('retry_on', None)
  error_check = kwargs.pop('error_check', lambda x: None)
  log_retries = kwargs.pop('log_retries', True)

  def ShouldRetry(exc):
    """Return whether we should retry on a given exception."""
    if not ShouldRetryCommandCommon(exc):
      return False
    if values is None and exc.result.returncode < 0:
      logging.info('Child process received signal %d; not retrying.',
                   -exc.result.returncode)
      return False

    ret = error_check(exc)
    if ret is not None:
      return ret

    if values is None or exc.result.returncode in values:
      if log_retries:
        logging.warning('Command failed with retriable error.\n%s', exc)
      return True
    return False

  return GenericRetry(ShouldRetry, max_retry, functor, *args, **kwargs)


def ShouldRetryCommandCommon(exc):
  """Returns whether any run should retry on a given exception."""
  if not isinstance(exc, cros_build_lib.RunCommandError):
    return False
  if exc.result.returncode is None:
    logging.error('Child process failed to launch; not retrying:\n'
                  'command: %s', exc.result.cmdstr)
    return False
  return True


def RunCommandWithRetries(max_retry, *args, **kwargs):
  """Wrapper for run that will retry a command.

  Args:
    max_retry: See RetryCommand and run.
    *args: See RetryCommand and run.
    **kwargs: See RetryCommand and run.

  Returns:
    A CommandResult object.

  Raises:
    RunCommandError: Raised on error.
  """
  return RetryCommand(cros_build_lib.run, max_retry, *args, **kwargs)

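# Example usage of RunCommandWithRetries (a minimal sketch; the command and
# the retriable exit code are illustrative):
#
#   result = RunCommandWithRetries(
#       3, ['gsutil', 'cp', src_url, dst_path], retry_on=[1], sleep=5)
#
# The command is run via cros_build_lib.run and retried only when it exits
# with a code listed in |retry_on|; the CommandResult of the first successful
# invocation is returned.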

class DownloadError(Exception):
  """Fetching a file via curl failed."""


def RunCurl(curl_args, *args, **kwargs):
  """Runs curl and wraps around all necessary hacks.

  Args:
    curl_args: Command line to pass to curl. Must be a list of str.
    *args, **kwargs: See RunCommandWithRetries and run.
      Note that retry_on, error_check, sleep, backoff_factor cannot be
      overwritten.

  Returns:
    A CommandResult object.

  Raises:
    DownloadError: Whenever curl fails for any reason.
  """
  cmd = ['curl'] + curl_args

  # These values were discerned via scraping the curl manpage; they're all
  # retry related (dns failed, timeout occurred, etc.; see the manpage for
  # exact specifics of each).
  # Note we allow 22 to deal w/ 500s -- they're thrown by Google Storage
  # occasionally.  This is also thrown when getting 4xx, but curl doesn't
  # make it easy to differentiate between them.
  # Note we allow 35 to deal w/ Unknown SSL Protocol error, thrown by
  # Google Storage occasionally.
  # Finally, we do not use curl's --retry option since it generally doesn't
  # actually retry anything; for example, it will not retry on code 18.
  retriable_exits = frozenset([5, 6, 7, 15, 18, 22, 26, 28, 35, 52, 56])

  def _CheckExit(exc):
    """Filter out specific error codes when getting exit 22.

    Curl will exit(22) for a wide range of HTTP codes -- both the 4xx and 5xx
    set.  For the 4xx, we don't want to retry.  We have to look at the output.
    """
    assert isinstance(exc, cros_build_lib.RunCommandError)
    if exc.result.returncode == 22:
      logging.debug('curl stderr %s', exc.result.error)
      matched = CURL_STATUS_RE.search(exc.result.error)
      if not matched:
        # Unexpected stderr.  It may not be error output from --fail.
        return True
      status_code = matched.group(1)
      return not status_code.startswith(b'4')

    # We'll let the common exit code filter do the right thing.
    return None

  try:
    return RunCommandWithRetries(
        10, cmd, retry_on=retriable_exits, error_check=_CheckExit,
        sleep=3, backoff_factor=1.6,
        stderr=True, extra_env={'LC_MESSAGES': 'C'}, *args, **kwargs)
  except cros_build_lib.RunCommandError as e:
    if e.result.returncode in (51, 58, 60):
      # These are the return codes of failing certs as per 'man curl'.
      raise DownloadError(
          'Download failed with certificate error? Try "sudo c_rehash".')
    raise DownloadError('Curl failed w/ exit code %i: %s' %
                        (e.result.returncode, e.result.error))
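# Example usage of RunCurl (a minimal sketch; the URL and output path are
# illustrative):
#
#   RunCurl(['--fail', '-o', '/tmp/artifact.bin',
#            'https://storage.googleapis.com/bucket/artifact.bin'])
#
# The download is retried up to 10 times with backoff on transient curl exit
# codes and HTTP 5xx responses; DownloadError is raised if curl ultimately
# fails.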
431