# common_lib/hosts/repair.py

# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Framework for host verification and repair in Autotest.

The framework provides implementation code in support of `Host.verify()`
and `Host.repair()` used in Verify and Repair special tasks.

The framework consists of these classes:
  * `Verifier`: A class representing a single verification check.
  * `RepairAction`: A class representing a repair operation that can fix
    a failed verification check.
  * `RepairStrategy`:  A class for organizing a collection of `Verifier`
    and `RepairAction` instances, and invoking them in order.

Individual operations during verification and repair are handled by
instances of `Verifier` and `RepairAction`.  `Verifier` objects are
meant to test for specific conditions that may cause tests to fail.
`RepairAction` objects provide operations designed to fix one or
more failures identified by a `Verifier` object.
"""

import collections
import logging
import re

import common
from autotest_lib.client.common_lib import error

try:
    from autotest_lib.utils.frozen_chromite.lib import metrics
except ImportError:
    from autotest_lib.client.bin.utils import metrics_mock as metrics

# Regular expression pattern used to filter out unwanted hostnames.
# NOTE(review): presumably consumed by `_filter_metrics_hostname()`, which
# is not visible in this section -- confirm against its definition.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
# NOTE(review): presumably the placeholder reported for a hostname that
# does not match `_HOSTNAME_PATTERN` -- confirm against its users.
_DISALLOWED_HOSTNAME = 'disallowed_hostname'

# States of verifiers.  These mirror the values returned by
# `Verifier._is_good()`:
#   True  - the verifier ran and passed
#   False - the verifier ran and failed
#   None  - the verifier did not run, or a dependency failed
VERIFY_SUCCESS = True
VERIFY_FAILED = False
VERIFY_NOT_RUN = None


class AutoservVerifyError(error.AutoservError):
    """
    Catch-all exception for a failed `Verifier` check.

    A `verify()` implementation may raise an instance of this class
    when the check fails and no more specific exception type describes
    the problem.
    """


class AutoservNonCriticalVerifyError(error.AutoservError):
    """
    Exception for failures from `Verifier` objects that are not
    critical enough to conclude that the target host is in a bad state.
    """


# Record of one failed dependency, as collected by
# `_DependencyNode._verify_list()`:
#   * `dependency`: description of the verifier that failed.
#   * `error`: string value of the exception raised by that verifier.
#   * `tag`: tag of the verifier that failed.
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ('dependency', 'error', 'tag'))


# Same fields as `_DependencyFailure`, but created when the verifier
# raised `AutoservNonCriticalVerifyError`.  The distinct type is what
# `AutoservVerifyDependencyError.is_critical()` tests for.
_NonCriticalDependencyFailure = collections.namedtuple(
    '_NonCriticalDependencyFailure', ('dependency', 'error', 'tag'))


class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a set
    of instances of `_DependencyFailure`, each corresponding to one
    failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.
      * The `tag` attribute of each failure is the tag of the failed
        dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Set of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Build the exception from a collection of failed dependencies.

        The exception message is the `error` text of every failure,
        joined with newlines.

        @param node       The `_DependencyNode` that is reporting the
                          failed dependencies.
        @param failures   Collection of failure tuples, one for each
                          failed dependency.
        """
        message = '\n'.join(failure.error for failure in failures)
        super(AutoservVerifyDependencyError, self).__init__(message)
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Write a short summary of this exception to the Python logs.

        Two kinds of output are produced:
          * At INFO level, the `action` string followed by the
            description of the node that reported the failure
            (`self._node.description`).
          * At DEBUG level, the `deps` string as a heading, followed by
            the description of each failed dependency, one per line.

        @param action   A string describing the action being logged,
                        relative to `self._node`.
        @param deps     A string introducing the list of dependencies
                        that failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failed in self.failures:
            logging.debug('    %s', failed.dependency)

    def is_critical(self, silent=False):
        """
        Check whether this error is critical to the repair process.

        The error is critical as soon as one recorded failure is not a
        `_NonCriticalDependencyFailure`.  Non-critical failures seen
        before that point are logged as forgiven, unless `silent`.

        @param silent   If true, don't log the forgiven failures.
        @return True if at least one failure is critical, else False.
        """
        for failure in self.failures:
            if not isinstance(failure, _NonCriticalDependencyFailure):
                return True
            if not silent:
                logging.warning(
                        '%s is still failing but forgiven because it'
                        ' raised a non-critical error.', failure.tag)
        return False


class AutoservRepairError(error.AutoservError):
    """
    Catch-all exception for a failed `RepairAction`.

    A `repair()` implementation may raise an instance of this class
    when the repair fails and no more specific exception type is
    available.

    @property tag   Short identifier of the failure, used for metrics.
    """

    def __init__(self, description, tag):
        """
        @param description  Message describing the exception.
        @param tag          A short identifier used for metric purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag


class _DependencyNode(object):
    """
    An object that can depend on verifiers.

    Both repair and verify operations have the notion of dependencies
    that must pass before the operation proceeds.  This class captures
    the shared behaviors required by both classes.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, record_type, dependencies):
        self._dependency_list = dependencies
        self._tag = tag
        self._record_tag = record_type + '.' + tag

    def _is_applicable(self, host):
        """
        Check if the action is applicable to target host. Subclasses
        can override this method per their need.

        @param host     Target host to check.
        @return         A bool value.
        """
        return True

    def _record(self, host, silent, status_code, *record_args):
        """
        Log a status record for `host`.

        Call `host.record()` using the given status_code, and
        operation tag `self._record_tag`, plus any extra arguments in
        `record_args`.  Do nothing if `silent` is a true value.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if not silent:
            host.record(status_code, None, self._record_tag,
                        *record_args)

    def _record_good(self, host, silent):
        """Log a 'GOOD' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Log a 'FAIL' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        self._record(host, silent, 'FAIL', str(exc))

    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        This invokes `_verify_host()` on every verifier in the given
        list.  If any verifier in the transitive closure of dependencies
        in the list fails, an `AutoservVerifyDependencyError` is raised
        containing the description of each failed verifier.  Only
        original failures are reported; verifiers that don't run due
        to a failed dependency are omitted.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failures = set()
        for v in verifiers:
            try:
                v._verify_host(host, silent)
            except AutoservNonCriticalVerifyError as e:
                failures.add(_NonCriticalDependencyFailure(v.description,
                                                           str(e), v.tag))
            except AutoservVerifyDependencyError as e:
                failures.update(e.failures)
            except Exception as e:
                failures.add(_DependencyFailure(v.description, str(e), v.tag))
        if failures:
            raise AutoservVerifyDependencyError(self, failures)

    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as e:
            e.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        return ('Class %s fails to implement description().' %
                type(self).__name__)

    def _get_node_by_tag(self, tag):
        """Find verifier by tag, recursive.

        @param tag  Node identifier.

        @returns:   _DependencyNode instance associated with tag
        """
        if self._tag == tag:
            return self
        for child in self._dependency_list:
            node = child._get_node_by_tag(tag)
            if node is not None:
                return node
        return None


class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls return the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
                    None - did not run
                    True - successful pass
                    Exception - fail during execution
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        Create a verifier that has no cached result yet.

        @param tag            Short identifier for this verifier.
        @param dependencies   Dependency pre-requisites of this
                              verifier.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        # None means "not run yet"; `_verify_host()` fills this in.
        self._result = None

    def _reverify(self):
        """
        Forget cached verification results.

        Clears the cached result of this node, then recursively does
        the same for every dependency, so that the whole transitive
        closure of dependencies is reset.
        """
        self._result = None
        for dependency in self._dependency_list:
            dependency._reverify()

    def _verify_host(self, host, silent):
        """
        Run this verifier against `host`, honoring cached results.

        The steps, in order:
          1. Verify all dependencies.  A failing dependency raises out
             of this method before anything else happens.
          2. Check applicability.  If the verifier is not applicable,
             or the applicability check itself raises, log that and
             return without running or caching anything.
          3. If a result is cached, reuse it:  re-raise a cached
             failure, or return on a cached success.
          4. Otherwise call `verify()`, write the outcome to
             `status.log` (unless `silent`), and cache it.

        When `verify()` fails and `host` has a truthy `health_profile`
        attribute, the failure is also counted in that profile.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one dependency has failed.
        @raises Exception   Whatever `verify()` raised, either just
                        now or when the cached failure was recorded.
        """
        self._verify_dependencies(host, silent)
        try:
            if not self._is_applicable(host):
                logging.info(
                        'Verify "%s:%s" is not applicable to %s, skipping...',
                        self.tag, self.description, host.hostname)
                return
        except Exception as applicability_error:
            logging.error(
                    'Skipping %s verifier due to unexpect error during check'
                    ' applicability; %s', self.tag, applicability_error)
            return

        cached = self._result
        if isinstance(cached, Exception):
            raise cached        # cached failure
        if cached:
            return              # cached success

        task_name = type(self).__name__
        logging.info('Verifying %s:%s', self.tag, self.description)
        try:
            logging.debug('Start verify task: %s.', task_name)
            self.verify(host)
            self._record_good(host, silent)
        except Exception as failure:
            if isinstance(failure, AutoservNonCriticalVerifyError):
                log_format = '(Non-critical)Failed: %s'
            else:
                log_format = 'Failed: %s'
            logging.exception(log_format, self.description)
            # Cache the failure so later calls re-raise it directly.
            self._result = failure
            self._record_fail(host, silent, failure)
            # Increase verifier fail count if device health profile is
            # available to the host class.
            if hasattr(host, 'health_profile') and host.health_profile:
                host.health_profile.insert_failed_verifier(self.tag)
            raise
        finally:
            logging.debug('Finished verify task: %s.', task_name)

        self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        Subclasses implement this method to test for one specific
        problem on a host.  Guidelines for implementations:
          * The problem checked for should be one that will cause
            testing to fail.
          * On a working system the check should be quick and
            optimized for success; a passing check should finish
            within seconds.
          * Checks are not expected to have side effects, although a
            trivial fix may be applied if it still finishes within the
            time constraints above.

        One check should normally lead to one set of repair actions.
        If two different failures can require two different repairs,
        they should ideally be two different subclasses of `Verifier`.

        Failure is reported by raising an exception.  Its text should
        be a short, 1-line summary of the error; it must be concise
        and diagnostic, because it appears in `status.log` files.

        Success is reported by returning without raising.

        Implementations should avoid most logging, but may log DEBUG
        level messages that provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        class_name = type(self).__name__
        raise NotImplementedError(
                'Class %s does not implement verify()' % class_name)

    def _is_good(self):
        """Provide the cached result of the verifier.

        The cached `_result` is translated into the same values as the
        module-level VERIFY_SUCCESS / VERIFY_FAILED / VERIFY_NOT_RUN
        constants.

        @returns: a boolean or None value:
            True - verifier passed
            False - verifier did not pass (an exception is cached)
            None - verifier did not run because it is not applicable
                   or blocked due to dependency failure
        """
        # `bool` cannot be subclassed, so isinstance() matches exactly
        # the values the former `type(x) == type(True)` test matched.
        if isinstance(self._result, bool):
            return self._result
        if isinstance(self._result, Exception):
            return False
        return None


class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        Create a repair action.

        @param tag            Short identifier for this repair action.
        @param dependencies   Verifiers that must pass before this
                              action checks its triggers.
        @param triggers       Verifiers whose failure triggers this
                              repair.
        @param host_class     A string identifier that will be used as
                              a field to send repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self.host_class = host_class
        self._trigger_list = triggers
        # Counters used to report repair failures as metrics.
        self._failure_modes_counter = metrics.Counter(
                'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
                'chromeos/autotest/repair/failure_detail')

    def _record_start(self, host, silent):
        """Write a 'START' status record for this repair action.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        status_code = 'START'
        self._record(host, silent, status_code)

    def _record_end_good(self, host, silent):
        """Write an 'END GOOD' status record and mark the repair done.

        After the record is written, `self.status` is set to
        'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        status_code = 'END GOOD'
        self._record(host, silent, status_code)
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Write an 'END FAIL' status record and store the outcome.

        After the record is written, `self.status` is set to `status`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to store in `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        status_code = 'END FAIL'
        self._record(host, silent, status_code, *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        One data point is sent for each failed verifier when `error`
        is an `AutoservVerifyDependencyError`; otherwise a single data
        point is sent with the verifier tag 'unknown'.  Failures in the
        'repair' stage additionally report their failure detail.

        @param host         Host which this RepairAction targeted to.
        @param error        An exception that was caught in
                            `_repair_host`.
        @param stage        The stage in which the exception was
                            caught.  One of:
                                'dep'    during verify dependencies
                                'pre'    during pre-repair trigger verification
                                'repair' during repair() process itself
                                'post'   during post-repair trigger verification
        """

        def fields_for(vf_tag):
            # Metric fields for a single verifier tag.
            return {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }

        if isinstance(error, AutoservVerifyDependencyError):
            # A dependency error carries the tag of every failure.
            vf_tags = (failure.tag for failure in error.failures)
        else:
            # A failure during repair, or an unknown failure, has no
            # Verifier attached, so the tag is reported as 'unknown'.
            vf_tags = ('unknown',)
        for vf_tag in vf_tags:
            self._failure_modes_counter.increment(fields=fields_for(vf_tag))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send the reason for a failure inside repair() to monarch.

        The reason is the `tag` of an `AutoservRepairError`; any other
        exception is reported as 'unknown'.

        @param error    The exception caught inside repair().
        """
        if isinstance(error, AutoservRepairError):
            failure_tag = error.tag
        else:
            failure_tag = 'unknown'
        self._failure_detail_counter.increment(fields={
                'repair_action_tag': self.tag,
                'repair_failure_tag': failure_tag,
        })

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        On exit, `self.status` holds one of:
          * 'blocked':  a dependency failed; nothing was attempted.
          * 'skipped':  the action is not applicable, applicability
            could not be determined, or no trigger failed.
          * 'repaired':  `repair()` ran and all triggers now pass.
          * 'repair_failure':  `repair()` raised an exception.
          * 'verify_failure':  `repair()` returned, but at least one
            trigger still fails.
          * 'unknown':  an unexpected error escaped while the triggers
            were being checked.

        Failures in each stage are also reported through
        `_send_failure_metrics()`.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservRepairError   Raised when triggers still fail
                        after `repair()` completed.
        @raises Exception   Any exception from dependency verification
                        or from `repair()` is re-raised unchanged.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as dep_error:
            self._send_failure_metrics(host, dep_error, 'dep')
            raise

        self.status = 'skipped'
        try:
            if not self._is_applicable(host):
                logging.info('RepairAction is not applicable, skipping repair: %s',
                             self.description)
                return
        except Exception as applicability_error:
            logging.error('Skipping %s repair action due to unexpect error'
                          ' during check applicability; %s',
                          self.tag, applicability_error)
            return
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_error:
            trigger_error.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, trigger_error, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
                # Increase action success count if device health profile is
                # available to the host class.
                if hasattr(host, 'health_profile') and host.health_profile:
                    host.health_profile.insert_succeed_repair_action(self.tag)
            except Exception as repair_error:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, repair_error)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, repair_error, 'repair')
                # Increase action fail count if device health profile is
                # available to the host class.
                if hasattr(host, 'health_profile') and host.health_profile:
                    host.health_profile.insert_failed_repair_action(self.tag)
                raise
            try:
                # Drop cached results so the triggers really run again.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as post_error:
                post_error.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, post_error, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as internal_error:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                # Report the exception caught here, not the pre-repair
                # trigger failure that started the repair.
                self._send_failure_metrics(host, internal_error, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair: %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        Subclasses implement this method to make the changes that fix
        failures in one or more verification checks.  The repair
        counts as successful if the DUT passes those specific checks
        once this method has completed.

        Failure is reported by raising an exception.  Its text should
        be a short, 1-line summary of the error; it must be concise
        and diagnostic, because it appears in `status.log` files.

        Success is reported by returning without raising.

        Implementations should avoid most logging, but may log DEBUG
        level messages that provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        class_name = type(self).__name__
        raise NotImplementedError(
                'Class %s does not implement repair()' % class_name)


class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class performs no check of its own, so it always
    passes (provided it can run).  Its only purpose is to serve as the
    root of a DAG of dependencies in an instance of `RepairStrategy`.
    """

    @property
    def description(self):
        return 'All host verification checks pass'

    def verify(self, host):
        """Do nothing; the root has no check of its own."""


class RepairStrategy(object):
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairAction` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.

        @raises AssertionError  If `tag` is already present in
                                `verifiers`.
        @raises KeyError        If any entry of `dep_tags` is not yet
                                present in `verifiers`.
        """
        if tag in verifiers:
            # Raised explicitly rather than with an `assert` statement
            # so that the uniqueness check still runs under
            # `python -O`; the exception type is unchanged.
            raise AssertionError(
                    'Verifier tag %r is already in use' % (tag,))
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)

    def __init__(self, verifier_data, repair_data, host_class):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        @param host_class     A string identifying the class of host
                              this repair strategy targets; it is used
                              as a field when sending repair metrics.
        """
        # Metrics - we report on 'actions' for every repair action
        # we execute; we report on 'strategy' for every complete
        # repair operation.
        self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
        self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
        self.host_class = host_class
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers],
                            self.host_class)
            self._repair_actions.append(r)

    def _send_strategy_metrics(self, host, result):
        """Send repair strategy metrics to monarch

        @param host     The target to be repaired.
        @param result   A String that describe a final result for the
                        RepairStrategy.
        """
        info = host.host_info_store.get()
        # Empty or missing board/model values are reported as 'unknown'.
        fields = {
            'board': info.board or 'unknown',
            'host_class': self.host_class,
            'hostname': _filter_metrics_hostname(host),
            'model': info.model or 'unknown',
            'result': result,
        }
        self._strategy_counter.increment(fields=fields)

    def _send_action_metrics(self, host, ra):
        """Send repair action metrics to monarch

        @param host     The target to be repaired.
        @param ra       an RepairAction instance.
        """
        fields = {
            'tag': ra.tag,
            'status': ra.status,
            'hostname': _filter_metrics_hostname(host),
            'host_class': self.host_class
        }
        self._actions_counter.increment(fields=fields)

    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)

    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Every repair action is invoked in list order, and an exception
        from one action does not stop the remaining actions.  Afterwards
        the root verifier is run once more; any exception it raises is
        propagated to the caller.  Metrics are sent for each action and
        for the overall result.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        attempted = False
        for ra in self._repair_actions:
            try:
                logging.debug('Start repair task: %s.', type(ra).__name__)
                ra._repair_host(host, silent)
            except Exception:
                # All logging and exception handling was done at
                # lower levels; carry on with the next repair action.
                pass
            finally:
                self._send_action_metrics(host, ra)
                logging.debug('Finished repair task: %s.', type(ra).__name__)
                # Anything other than 'skipped' or 'blocked' counts as
                # an attempt to repair.
                if ra.status not in ('skipped', 'blocked'):
                    attempted = True

        result = 'failure'
        try:
            self._verify_root._verify_host(host, silent)
            result = 'success' if attempted else 'not_attempted'
        except BaseException:
            # Deliberately catches everything: we only adjust the
            # reported result here, and always re-raise.
            if not attempted:
                result = 'attempt_blocked'
            raise
        finally:
            self._send_strategy_metrics(host, result)

    def verifier_is_good(self, tag):
        """Find and return result of a verifier.

        @param tag: key to be associated with verifier

        @returns: a boolean or None value:
            True - verifier passed
            False - verifier did not pass
            None - verifier did not run because it is not applicable
                   or blocked due to dependency failure; also returned
                   when no node with the given tag is found
        """
        verifier = self.node_by_tag(tag)
        if verifier is None:
            logging.debug('Verifier with associated tag: %s not found', tag)
            return None
        result = verifier._is_good()
        logging.debug('Verifier with associated tag: %s found', tag)
        if result is None:
            logging.debug('%s did not run; it is not applicable to run '
                          'or blocked due to dependency failure', tag)
        elif result:
            logging.debug('Cached result of %s verifier is pass', tag)
        else:
            logging.debug('Cached result of %s verifier is fail', tag)
        return result

    def node_by_tag(self, tag):
        """Find and return node by searched tag.

        The verifier DAG is searched first, starting from the root;
        if nothing is found there, each repair action is searched in
        list order.

        @param tag: key to be associated with node

        @returns: _DependencyNode instance associated with tag, or
                  None if no node matches
        """
        node = self._verify_root._get_node_by_tag(tag)
        if node is None:
            for n in self._repair_actions:
                node = n._get_node_by_tag(tag)
                if node is not None:
                    break
        return node


def _filter_metrics_hostname(host):
    """
    Restrict the format of hostnames we'll send to monarch.

    @param host    A host instance (e.g. ServoHost, CrosHost).

    @returns The host's hostname when it matches `_HOSTNAME_PATTERN`
             from its start; otherwise `_DISALLOWED_HOSTNAME`.
    """
    hostname = host.hostname
    if re.match(_HOSTNAME_PATTERN, hostname) is None:
        return _DISALLOWED_HOSTNAME
    return hostname