xref: /aosp_15_r20/external/autotest/client/common_lib/hosts/repair.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""
6Framework for host verification and repair in Autotest.
7
8The framework provides implementation code in support of `Host.verify()`
9and `Host.repair()` used in Verify and Repair special tasks.
10
11The framework consists of these classes:
12  * `Verifier`: A class representing a single verification check.
13  * `RepairAction`: A class representing a repair operation that can fix
14    a failed verification check.
15  * `RepairStrategy`:  A class for organizing a collection of `Verifier`
16    and `RepairAction` instances, and invoking them in order.
17
18Individual operations during verification and repair are handled by
19instances of `Verifier` and `RepairAction`.  `Verifier` objects are
20meant to test for specific conditions that may cause tests to fail.
21`RepairAction` objects provide operations designed to fix one or
22more failures identified by a `Verifier` object.
23"""
24
25import collections
26import logging
27import re
28
29import common
30from autotest_lib.client.common_lib import error
31
32try:
33    from autotest_lib.utils.frozen_chromite.lib import metrics
34except ImportError:
35    from autotest_lib.client.bin.utils import metrics_mock as metrics
36
# Regular expression pattern used to filter out unwanted hostnames.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
# NOTE(review): presumably the placeholder reported in place of a hostname
# that is filtered out by the pattern above -- confirm against the code
# that consumes these two constants.
_DISALLOWED_HOSTNAME = 'disallowed_hostname'

# States of verifiers
# True - verifier ran and passed
# False - verifier ran and failed
# None - verifier did not run or a dependency failed
VERIFY_SUCCESS = True
VERIFY_FAILED = False
VERIFY_NOT_RUN = None
48
49
class AutoservVerifyError(error.AutoservError):
    """
    Catch-all exception for failures reported by `Verifier` objects.

    A `verify()` method can raise an instance of this class when the
    check fails and no more specific exception type is available.
    """
58
59
class AutoservNonCriticalVerifyError(error.AutoservError):
    """
    Exception for `Verifier` failures that are not critical enough to
    conclude that the target host is in a bad state.
    """
66
67
# Record of one failed dependency: the dependency's description, the
# string value of the exception it raised, and the dependency's tag.
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', 'dependency error tag')


# Same fields as `_DependencyFailure`, but a distinct type so that
# non-critical failures can be told apart with `isinstance()`.
_NonCriticalDependencyFailure = collections.namedtuple(
        '_NonCriticalDependencyFailure', 'dependency error tag')
74
75
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception reporting that one or more dependencies failed.

    Raising this exception separates an original failure from a failure
    that is merely being passed back from a verification dependency.
    For example, if 'B' depends on 'A', and 'A' fails, then 'B' raises
    this exception to signal that the original failure is further down
    the dependency chain.

    Each element of `failures` is a `_DependencyFailure` or
    `_NonCriticalDependencyFailure` tuple describing one failed
    dependency:
      * `dependency` is the description of the failed dependency.
      * `error` is the string value of the exception raised by the
        failed dependency.
      * `tag` is the tag of the failed dependency.

    The message of this exception is the `error` text of every failure,
    joined with newlines.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Collection of failures passed to the
                        constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection of failure tuples as described in
                          the class documentation.
        """
        messages = [failure.error for failure in failures]
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join(messages))
        self._node = node
        self.failures = failures

    def log_dependencies(self, action, deps):
        """
        Write a short summary of this exception to the Python logs.

        The `action` string, followed by `self._node.description`, is
        logged at INFO level; `action` should introduce or describe an
        action relative to `self._node`.

        The `deps` string, followed by the description of each failed
        dependency held in `self.failures`, is logged at DEBUG level;
        `deps` should introduce the list of failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failed in self.failures:
            logging.debug('    %s', failed.dependency)

    def is_critical(self, silent=False):
        """
        Check if the error is considered critical to the repair process.

        The error is critical as soon as one failure is found that is
        not a `_NonCriticalDependencyFailure`.  Each non-critical
        failure visited before that point is reported with a warning,
        unless `silent` is a true value.

        @param silent   If true, don't log warnings for non-critical
                        failures.
        @return True if at least one failure is critical, else False.
        """
        for failure in self.failures:
            if not isinstance(failure, _NonCriticalDependencyFailure):
                return True
            if not silent:
                logging.warning("%s is still failing but forgiven because"
                                " it raised a non-critical error.",
                                failure.tag)
        return False
151
152
class AutoservRepairError(error.AutoservError):
    """
    Catch-all exception for failures reported by `RepairAction` objects.

    A `repair()` method can raise an instance of this class when the
    repair fails and no more specific exception type is available.

    @property tag   Short identifier of the failure, used for metrics.
    """

    def __init__(self, description, tag):
        """
        Constructor for `AutoservRepairError`.

        @param description  Message describing the exception.
        @param tag          A short identifier used for metric purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag
167
168
class _DependencyNode(object):
    """
    Base class for objects that depend on verifiers.

    Repair and verify operations share the notion of dependencies that
    must pass before the operation itself proceeds.  This class holds
    the behavior common to both kinds of operation.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, record_type, dependencies):
        """
        Constructor for `_DependencyNode`.

        @param tag           Short identifier for this node.
        @param record_type   Prefix joined to `tag` with a '.' to form
                             the operation name used in status records.
        @param dependencies  Verifiers that must pass before this node's
                             operation proceeds.
        """
        self._tag = tag
        self._record_tag = record_type + '.' + tag
        self._dependency_list = dependencies

    def _is_applicable(self, host):
        """
        Tell whether this node's action applies to the target host.

        The base implementation accepts every host; subclasses can
        override this method per their need.

        @param host     Target host to check.
        @return         A bool value.
        """
        return True

    def _record(self, host, silent, status_code, *record_args):
        """
        Log a status record for `host`.

        Unless `silent` is a true value, call `host.record()` with the
        given status_code, the operation tag `self._record_tag`, and any
        extra arguments in `record_args`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if silent:
            return
        host.record(status_code, None, self._record_tag, *record_args)

    def _record_good(self, host, silent):
        """Log a 'GOOD' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Log a 'FAIL' status line carrying the text of `exc`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        reason = str(exc)
        self._record(host, silent, 'FAIL', reason)

    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        `_verify_host()` is invoked on every verifier in the list, even
        after an earlier one has failed.  If any verifier fails, an
        `AutoservVerifyDependencyError` is raised once the whole list
        has been processed.  Failures are collected as follows:
          * `AutoservNonCriticalVerifyError` adds a
            `_NonCriticalDependencyFailure` for the verifier.
          * `AutoservVerifyDependencyError` contributes the failures it
            already carries, so only original failures are reported and
            verifiers that didn't run due to a failed dependency are
            omitted.
          * Any other exception adds a `_DependencyFailure` for the
            verifier.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failed = set()
        for verifier in verifiers:
            try:
                verifier._verify_host(host, silent)
            except AutoservNonCriticalVerifyError as exc:
                failed.add(_NonCriticalDependencyFailure(
                        verifier.description, str(exc), verifier.tag))
            except AutoservVerifyDependencyError as exc:
                failed.update(exc.failures)
            except Exception as exc:
                failed.add(_DependencyFailure(
                        verifier.description, str(exc), verifier.tag))
        if not failed:
            return
        raise AutoservVerifyDependencyError(self, failed)

    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        When a dependency fails, a summary is written to the logs and
        the `AutoservVerifyDependencyError` is re-raised.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as dep_error:
            dep_error.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        class_name = type(self).__name__
        return 'Class %s fails to implement description().' % class_name

    def _get_node_by_tag(self, tag):
        """Find a node by tag, searching this node and then, depth
        first, its dependencies.

        @param tag  Node identifier.

        @returns:   _DependencyNode instance associated with tag, or
                    None if no node carries that tag.
        """
        if self._tag == tag:
            return self
        for dependency in self._dependency_list:
            found = dependency._get_node_by_tag(tag)
            if found is not None:
                return found
        return None
336
337
class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls reuse the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
                    None - did not run
                    True - successful pass
                    Exception - fail during execution
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        Constructor for `Verifier`.

        @param tag            Short identifier for this verifier.
        @param dependencies   Verifiers that must pass before `verify()`
                              is invoked.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        # No result is cached until `_verify_host()` calls `verify()`.
        self._result = None

    def _reverify(self):
        """
        Discard cached verification results.

        Reset the cached verification result for this node, and for the
        transitive closure of all dependencies.
        """
        self._result = None
        for dependency in self._dependency_list:
            dependency._reverify()

    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        The steps are performed in this order:
          1. Check the dependencies.  If any of them fails, the
             resulting `AutoservVerifyDependencyError` propagates and
             nothing else happens.
          2. Check applicability.  If this verifier is not applicable
             to `host`, or the applicability check itself raises, log
             that and return without running or caching anything.
          3. If a result is cached, re-raise the cached exception or
             return on cached success, without calling `verify()` and
             without writing to `status.log`.
          4. Otherwise call `verify()`, cache the outcome, and log the
             result in `status.log`.  On failure the exception from
             `verify()` is re-raised.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        self._verify_dependencies(host, silent)
        try:
            if not self._is_applicable(host):
                logging.info(
                        'Verify "%s:%s" is not applicable to %s, skipping...',
                        self.tag, self.description, host.hostname)
                return
        except Exception as e:
            # A broken applicability check is treated as "skip", not as
            # a verification failure.
            logging.error('Skipping %s verifier due to unexpect error during'
                          ' check applicability; %s', self.tag, e)
            return

        if isinstance(self._result, Exception):
            raise self._result  # cached failure
        if self._result:
            return              # cached success

        logging.info('Verifying %s:%s', self.tag, self.description)
        try:
            logging.debug('Start verify task: %s.', type(self).__name__)
            self.verify(host)
            self._record_good(host, silent)
        except Exception as e:
            message = 'Failed: %s'
            if isinstance(e, AutoservNonCriticalVerifyError):
                message = '(Non-critical)Failed: %s'
            logging.exception(message, self.description)
            # Cache the failure so later calls re-raise it without
            # re-running the check.
            self._result = e
            self._record_fail(host, silent, e)
            # Increase verifier fail count if device health profile is
            # available to the host class.
            if hasattr(host, 'health_profile') and host.health_profile:
                host.health_profile.insert_failed_verifier(self.tag)
            raise
        finally:
            logging.debug('Finished verify task: %s.', type(self).__name__)

        self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'verify()' % type(self).__name__)

    def _is_good(self):
        """Provide result of the verifier

        @returns: a boolean or None value:
            True - verifier passed
            False - verifier did not pass
            None - verifier did not run because it is not applicable
                   or blocked due to dependency failure
        """
        if isinstance(self._result, bool):
            return self._result
        if isinstance(self._result, Exception):
            return False
        return None
506
507
class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    @property status          Outcome of the most recent call to
                              `_repair_host()`; not set until that
                              method has been called.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        Constructor for `RepairAction`.

        @param tag            Short identifier for this repair action.
        @param dependencies   Verifiers that must pass before the
                              triggers are checked.
        @param triggers       Verifiers whose failure triggers
                              `repair()`.
        @param host_class     A string identifier that will be used as
                              a field to send repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self._trigger_list = triggers
        self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
        self.host_class = host_class

    def _record_start(self, host, silent):
        """Log a 'START' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'START')

    def _record_end_good(self, host, silent):
        """Log an 'END GOOD' status line and set status to 'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'END GOOD')
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Log an 'END FAIL' status line and store `status`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to assign to `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        self._record(host, silent, 'END FAIL', *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        For an `AutoservVerifyDependencyError`, one increment is sent
        per failure it carries, tagged with that failure's verifier
        tag.  For any other exception a single increment tagged
        'unknown' is sent.  Failures in stage 'repair' additionally
        report the failure detail.

        @param host         Host which this RepairAction targeted to.
        @param error        An exception that caught in _repair_host.
        @param stage        In which stage we caught above exception.
                            Can be one of below value:
                                'dep'    during verify dependencies
                                'pre'    during pre-repair trigger verification
                                'repair' during repair() process itself
                                'post'   during post-repair trigger verification
        """

        def get_fields(vf_tag):
            """Build the metric fields for one verifier tag."""
            return {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }

        if isinstance(error, AutoservVerifyDependencyError):
            # We'll catch all failure tags here for a dependencies error
            for f in error.failures:
                self._failure_modes_counter.increment(fields=get_fields(f.tag))
        else:
            # When there is failure during repair or unknown failure. there
            # will be no Verifier, so vf_tag set to 'unknown'.
            self._failure_modes_counter.increment(fields=get_fields('unknown'))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send reason of failure inside repair() to monarch.

        The failure tag is `error.tag` for an `AutoservRepairError`,
        and 'unknown' for any other exception.

        @param error    The exception caught inside repair().
        """
        tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
        fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
        self._failure_detail_counter.increment(fields=fields)

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        On exit `self.status` is one of:
            'blocked'         a dependency failed
            'skipped'         not applicable, or no trigger failed
            'repaired'        repair ran and all triggers now pass
            'repair_failure'  `repair()` raised an exception
            'verify_failure'  triggers still fail after `repair()`
            'unknown'         unexpected internal error

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservRepairError   Raised when triggers still fail
                        after `repair()` reported success.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as dep_error:
            self._send_failure_metrics(host, dep_error, 'dep')
            raise

        self.status = 'skipped'
        try:
            if not self._is_applicable(host):
                logging.info('RepairAction is not applicable, skipping repair: %s',
                             self.description)
                return
        except Exception as applicable_error:
            logging.error('Skipping %s repair action due to unexpect error'
                          ' during check applicability; %s', self.tag,
                          applicable_error)
            return
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_error:
            trigger_error.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, trigger_error, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
                # Increase action success count if device health profile is
                # available to the host class.
                if hasattr(host, 'health_profile') and host.health_profile:
                    host.health_profile.insert_succeed_repair_action(self.tag)
            except Exception as repair_error:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, repair_error)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, repair_error, 'repair')
                # Increase action fail count if device health profile is
                # available to the host class.
                if hasattr(host, 'health_profile') and host.health_profile:
                    host.health_profile.insert_failed_repair_action(self.tag)
                raise
            try:
                # Drop cached trigger results so the checks really run
                # again after the repair.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as post_error:
                post_error.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, post_error, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as internal_error:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                # Report the exception actually caught here, not the
                # pre-repair trigger failure from the enclosing handler.
                self._send_failure_metrics(host, internal_error, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair: %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)
748
749
class _RootVerifier(Verifier):
    """
    Placeholder verifier used by `RepairStrategy` as the DAG root.

    On its own a node of this class checks nothing: `verify()` makes no
    test of the host, so the node always passes (if it can run).  Its
    only purpose is to serve as the single root of the dependency DAG
    held by an instance of `RepairStrategy`.
    """

    def verify(self, host):
        """
        Succeed unconditionally without examining the host.

        @param host   The host being verified (unused).
        """
        return None

    @property
    def description(self):
        """Text describing this node for log messages."""
        return 'All host verification checks pass'
765
766
class RepairStrategy(object):
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairStrategy` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """
    # N.B. The docstring above is a raw string because the DAG diagram
    # contains literal backslashes.

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.

        @raises AssertionError  if `tag` is already in `verifiers`.
        @raises KeyError        if a tag in `dep_tags` is not (yet) in
                                `verifiers`.
        """
        # Raised explicitly instead of with an `assert` statement so
        # that the uniqueness check isn't stripped by `python -O`; the
        # exception type is the same as before.
        if tag in verifiers:
            raise AssertionError(
                    'Verifier tag %r is already in use' % (tag,))
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)

    def __init__(self, verifier_data, repair_data, host_class):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        @param host_class     A string identifying what class of host
                              this repair strategy targets; it is sent
                              as a field with the repair metrics.
        """
        # Metrics - we report on 'actions' for every repair action
        # we execute; we report on 'strategy' for every complete
        # repair operation.
        self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
        self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
        self.host_class = host_class
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            # `deps` is walked twice below; materialize it so that a
            # one-shot iterator isn't already exhausted the second time.
            deps = tuple(deps)
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers],
                            self.host_class)
            self._repair_actions.append(r)

    def _send_strategy_metrics(self, host, result):
        """Send repair strategy metrics to monarch

        The board and model fields are read from the host's
        `host_info_store`; either one is reported as 'unknown' when it
        has no value.

        @param host     The target to be repaired.
        @param result   A String that describe a final result for the
                        RepairStrategy.
        """
        info = host.host_info_store.get()
        fields = {
            'board': info.board or 'unknown',
            'host_class': self.host_class,
            'hostname': _filter_metrics_hostname(host),
            'model': info.model or 'unknown',
            'result': result,
        }
        self._strategy_counter.increment(fields=fields)

    def _send_action_metrics(self, host, ra):
        """Send repair action metrics to monarch

        @param host     The target to be repaired.
        @param ra       A RepairAction instance; its `tag` and `status`
                        are reported.
        """
        fields = {
            'tag': ra.tag,
            'status': ra.status,
            'hostname': _filter_metrics_hostname(host),
            'host_class': self.host_class
        }
        self._actions_counter.increment(fields=fields)

    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)

    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Every repair action is invoked in list order, and action
        metrics are sent for each one.  Afterwards the root verifier is
        run once more; any exception it raises is propagated to the
        caller after the strategy metrics have been sent.

        The result sent with the strategy metrics is one of:
          * 'success':  final verify passed after at least one repair
            action was attempted.
          * 'not_attempted':  final verify passed and no repair action
            was attempted.
          * 'failure':  final verify failed after at least one repair
            action was attempted.
          * 'attempt_blocked':  final verify failed and no repair
            action was attempted.

        A repair action counts as attempted unless its status is
        'skipped' or 'blocked'.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        attempted = False
        for ra in self._repair_actions:
            try:
                logging.debug('Start repair task: %s.', type(ra).__name__)
                ra._repair_host(host, silent)
            except Exception:
                # Deliberately ignored:  all logging and exception
                # handling was done at lower levels.
                pass
            finally:
                self._send_action_metrics(host, ra)
                logging.debug('Finished repair task: %s.', type(ra).__name__)
                if ra.status not in ('skipped', 'blocked'):
                    attempted = True

        result = 'failure'
        try:
            self._verify_root._verify_host(host, silent)
            result = 'success' if attempted else 'not_attempted'
        except BaseException:
            # Only classifies the failure for metrics; the exception is
            # always re-raised to the caller.
            if not attempted:
                result = 'attempt_blocked'
            raise
        finally:
            self._send_strategy_metrics(host, result)

    def verifier_is_good(self, tag):
        """Find and return result of a verifier.

        @param tag: key to be associated with verifier

        @returns: a boolean or None value:
            True - verifier passed
            False - verifier did not pass
            None - verifier did not run because it is not applicable
                   or blocked due to dependency failure; also returned
                   when no node with the given tag is found
        """
        verifier = self.node_by_tag(tag)
        if verifier is None:
            logging.debug('Verifier with associated tag: %s not found', tag)
            return None
        result = verifier._is_good()
        logging.debug('Verifier with associated tag: %s found', tag)
        if result is None:
            logging.debug('%s did not run; it is not applicable to run '
                          'or blocked due to dependency failure', tag)
        elif result:
            logging.debug('Cached result of %s verifier is pass', tag)
        else:
            logging.debug('Cached result of %s verifier is fail', tag)
        return result

    def node_by_tag(self, tag):
        """Find and return node by searched tag.

        The verifier DAG is searched first, starting from the root;
        if nothing is found there, each repair action is searched in
        list order.

        @param tag: key to be associated with node

        @returns: the node associated with tag, or None if no node
                  was found
        """
        node = self._verify_root._get_node_by_tag(tag)
        if node is None:
            for n in self._repair_actions:
                node = n._get_node_by_tag(tag)
                if node is not None:
                    break
        return node
1044
1045
def _filter_metrics_hostname(host):
    """
    Choose the hostname value to report to monarch for a host.

    The host's own name is reported only if it begins with a match for
    `_HOSTNAME_PATTERN`; otherwise the `_DISALLOWED_HOSTNAME`
    placeholder is reported in its place.

    @param host    A host instance (e.g. ServoHost, CrosHost).

    @returns The host's hostname, or `_DISALLOWED_HOSTNAME`.
    """
    hostname = host.hostname
    if re.match(_HOSTNAME_PATTERN, hostname) is None:
        return _DISALLOWED_HOSTNAME
    return hostname
1056