1# Copyright 2016 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5""" 6Framework for host verification and repair in Autotest. 7 8The framework provides implementation code in support of `Host.verify()` 9and `Host.repair()` used in Verify and Repair special tasks. 10 11The framework consists of these classes: 12 * `Verifier`: A class representing a single verification check. 13 * `RepairAction`: A class representing a repair operation that can fix 14 a failed verification check. 15 * `RepairStrategy`: A class for organizing a collection of `Verifier` 16 and `RepairAction` instances, and invoking them in order. 17 18Individual operations during verification and repair are handled by 19instances of `Verifier` and `RepairAction`. `Verifier` objects are 20meant to test for specific conditions that may cause tests to fail. 21`RepairAction` objects provide operations designed to fix one or 22more failures identified by a `Verifier` object. 23""" 24 25import collections 26import logging 27import re 28 29import common 30from autotest_lib.client.common_lib import error 31 32try: 33 from autotest_lib.utils.frozen_chromite.lib import metrics 34except ImportError: 35 from autotest_lib.client.bin.utils import metrics_mock as metrics 36 37#Regular experssion pattern to filter out unwanted hostname. 38_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+' 39_DISALLOWED_HOSTNAME = 'disallowed_hostname' 40 41# States of verifiers 42# True - verifier run and passed 43# False - verifier run and failed 44# None - verifier did not run or dependency failed 45VERIFY_SUCCESS = True 46VERIFY_FAILED = False 47VERIFY_NOT_RUN = None 48 49 50class AutoservVerifyError(error.AutoservError): 51 """ 52 Generic Exception for failures from `Verifier` objects. 
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a
    collection (a set, when built by `_DependencyNode._verify_list()`)
    of `_DependencyFailure` or `_NonCriticalDependencyFailure` tuples,
    each corresponding to one failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.
      * The `tag` attribute of each failure is the tag of the failed
        dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Collection of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        The exception message is the `error` text of every failure,
        one per line, in the iteration order of `failures`.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection of failure tuples as described
                          above.
        """
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join([f.error for f in failures]))
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        This writes a short summary of the dependency failures captured
        in this exception, using standard Python logging.

        The passed in `action` string plus `self._node.description`
        are logged at INFO level.  The `action` argument should
        introduce or describe an action relative to `self._node`.

        The passed in `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument is used to introduce the various failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failure in self.failures:
            logging.debug('    %s', failure.dependency)

    def is_critical(self, silent=False):
        """
        Check if the error is considered to be critical to repair process.

        The error is critical when at least one recorded failure is not
        a `_NonCriticalDependencyFailure`.  Every non-critical failure
        is reported with a warning (unless `silent`), independent of
        the iteration order of `self.failures`.

        @param silent   If true, don't log warnings about forgiven
                        non-critical failures.

        @return True if any failure is critical, otherwise False (this
                includes the case of no failures at all).
        """
        critical = False
        # The loop variable is deliberately not called `error`, which
        # would shadow the `error` module this class derives from.
        for failure in self.failures:
            if isinstance(failure, _NonCriticalDependencyFailure):
                if not silent:
                    logging.warning("%s is still failing but forgiven because"
                                    " it raised a non-critical error.",
                                    failure.tag)
            else:
                # Keep scanning so that every forgiven failure is
                # logged, rather than only those seen before this one.
                critical = True
        return critical
164 """ 165 super(AutoservRepairError, self).__init__(description) 166 self.tag = tag 167 168 169class _DependencyNode(object): 170 """ 171 An object that can depend on verifiers. 172 173 Both repair and verify operations have the notion of dependencies 174 that must pass before the operation proceeds. This class captures 175 the shared behaviors required by both classes. 176 177 @property tag Short identifier to be used in logging. 178 @property description Text summary of this node's action, to be 179 used in debug logs. 180 @property _dependency_list Dependency pre-requisites. 181 """ 182 183 def __init__(self, tag, record_type, dependencies): 184 self._dependency_list = dependencies 185 self._tag = tag 186 self._record_tag = record_type + '.' + tag 187 188 def _is_applicable(self, host): 189 """ 190 Check if the action is applicable to target host. Subclasses 191 can override this method per their need. 192 193 @param host Target host to check. 194 @return A bool value. 195 """ 196 return True 197 198 def _record(self, host, silent, status_code, *record_args): 199 """ 200 Log a status record for `host`. 201 202 Call `host.record()` using the given status_code, and 203 operation tag `self._record_tag`, plus any extra arguments in 204 `record_args`. Do nothing if `silent` is a true value. 205 206 @param host Host which will record the status record. 207 @param silent Don't record the event if this is a true 208 value. 209 @param status_code Value for the `status_code` parameter to 210 `host.record()`. 211 @param record_args Additional arguments to pass to 212 `host.record()`. 213 """ 214 if not silent: 215 host.record(status_code, None, self._record_tag, 216 *record_args) 217 218 def _record_good(self, host, silent): 219 """Log a 'GOOD' status line. 220 221 @param host Host which will record the status record. 222 @param silent Don't record the event if this is a true 223 value. 
224 """ 225 self._record(host, silent, 'GOOD') 226 227 def _record_fail(self, host, silent, exc): 228 """Log a 'FAIL' status line. 229 230 @param host Host which will record the status record. 231 @param silent Don't record the event if this is a true 232 value. 233 @param exc Exception describing the cause of failure. 234 """ 235 self._record(host, silent, 'FAIL', str(exc)) 236 237 def _verify_list(self, host, verifiers, silent): 238 """ 239 Test a list of verifiers against a given host. 240 241 This invokes `_verify_host()` on every verifier in the given 242 list. If any verifier in the transitive closure of dependencies 243 in the list fails, an `AutoservVerifyDependencyError` is raised 244 containing the description of each failed verifier. Only 245 original failures are reported; verifiers that don't run due 246 to a failed dependency are omitted. 247 248 By design, original failures are logged once in `_verify_host()` 249 when `verify()` originally fails. The additional data gathered 250 here is for the debug logs to indicate why a subsequent 251 operation never ran. 252 253 @param host The host to be tested against the verifiers. 254 @param verifiers List of verifiers to be checked. 255 @param silent If true, don't log host status records. 256 257 @raises AutoservVerifyDependencyError Raised when at least 258 one verifier in the list has failed. 259 """ 260 failures = set() 261 for v in verifiers: 262 try: 263 v._verify_host(host, silent) 264 except AutoservNonCriticalVerifyError as e: 265 failures.add(_NonCriticalDependencyFailure(v.description, 266 str(e), v.tag)) 267 except AutoservVerifyDependencyError as e: 268 failures.update(e.failures) 269 except Exception as e: 270 failures.add(_DependencyFailure(v.description, str(e), v.tag)) 271 if failures: 272 raise AutoservVerifyDependencyError(self, failures) 273 274 def _verify_dependencies(self, host, silent): 275 """ 276 Verify that all of this node's dependencies pass for a host. 
277 278 @param host The host to be verified. 279 @param silent If true, don't log host status records. 280 """ 281 try: 282 self._verify_list(host, self._dependency_list, silent) 283 except AutoservVerifyDependencyError as e: 284 e.log_dependencies( 285 'Skipping this operation', 286 'The following dependencies failed') 287 raise 288 289 @property 290 def tag(self): 291 """ 292 Tag for use in logging status records. 293 294 This is a property with a short string used to identify the node 295 in the 'status.log' file and during node construction. The tag 296 should contain only letters, digits, and '_' characters. This 297 tag is not used alone, but is combined with other identifiers, 298 based on the operation being logged. 299 300 @return A short identifier-like string. 301 """ 302 return self._tag 303 304 @property 305 def description(self): 306 """ 307 Text description of this node for log messages. 308 309 This string will be logged with failures, and should describe 310 the condition required for success. 311 312 N.B. Subclasses are required to override this method, but we 313 _don't_ raise NotImplementedError here. Various methods fail in 314 inscrutable ways if this method raises any exception, so for 315 debugging purposes, it's better to return a default value. 316 317 @return A descriptive string. 318 """ 319 return ('Class %s fails to implement description().' % 320 type(self).__name__) 321 322 def _get_node_by_tag(self, tag): 323 """Find verifier by tag, recursive. 324 325 @param tag Node identifier. 326 327 @returns: _DependencyNode instance associated with tag 328 """ 329 if self._tag == tag: 330 return self 331 for child in self._dependency_list: 332 node = child._get_node_by_tag(tag) 333 if node is not None: 334 return node 335 return None 336 337 338class Verifier(_DependencyNode): 339 """ 340 Abstract class embodying one verification check. 
341 342 A concrete subclass of `Verifier` provides a simple check that can 343 determine a host's fitness for testing. Failure indicates that the 344 check found a problem that can cause at least one test to fail. 345 346 `Verifier` objects are organized in a DAG identifying dependencies 347 among operations. The DAG controls ordering and prevents wasted 348 effort: If verification operation V2 requires that verification 349 operation V1 pass, then a) V1 will run before V2, and b) if V1 350 fails, V2 won't run at all. The `_verify_host()` method ensures 351 that all dependencies run and pass before invoking the `verify()` 352 method. 353 354 A `Verifier` object caches its result the first time it calls 355 `verify()`. Subsequent calls return the cached result, without 356 re-running the check code. The `_reverify()` method clears the 357 cached result in the current node, and in all dependencies. 358 359 Subclasses must supply these properties and methods: 360 * `verify()`: This is the method to perform the actual 361 verification check. 362 * `description`: A one-line summary of the verification check for 363 debug log messages. 364 365 Subclasses must override all of the above attributes; subclasses 366 should not override or extend any other attributes of this class. 367 368 The description string should be a simple sentence explaining what 369 must be true for the verifier to pass. Do not include a terminating 370 period. For example: 371 372 Host is available via ssh 373 374 The base class manages the following private data: 375 * `_result`: The cached result of verification. 376 None - did not run 377 True - successful pass 378 Exception - fail during execution 379 * `_dependency_list`: The list of dependencies. 380 Subclasses should not use these attributes. 381 382 @property _result Cached result of verification. 
def _verify_host(self, host, silent):
    """
    Determine the result of verification, and log results.

    The steps, in order:
      1. Dependencies are checked on every call, including calls
         that end up served from the cache.  A dependency failure
         propagates out of this method.
      2. If `_is_applicable()` returns false, or raises, the
         verifier is skipped: a message is logged and the method
         returns without touching the cached result.
      3. If a result is cached, a cached exception is re-raised and
         a cached success returns, in both cases without writing to
         `status.log`.
      4. Otherwise `verify()` runs.  Success is recorded as 'GOOD'
         and cached as True.  Failure is cached as the exception,
         recorded as 'FAIL', counted in the host's health profile
         when one is available, and re-raised.

    @param host     The host to be tested for a problem.
    @param silent   If true, don't log host status records.
    """
    # Dependencies come first; this raises if any of them fail.
    self._verify_dependencies(host, silent)
    try:
        if not self._is_applicable(host):
            logging.info(
                    'Verify "%s:%s" is not applicable to %s, skipping...',
                    self.tag, self.description, host.hostname)
            return
    except Exception as e:
        # A broken applicability check is treated as "not applicable"
        # rather than as a verification failure.
        logging.error('Skipping %s verifier due to unexpect error during'
                      ' check applicability; %s', self.tag, e)
        return

    if self._result is not None:
        if isinstance(self._result, Exception):
            raise self._result  # cached failure
        elif self._result:
            return              # cached success

    logging.info('Verifying %s:%s', self.tag, self.description)
    try:
        logging.debug('Start verify task: %s.', type(self).__name__)
        self.verify(host)
        # Still inside the `try`: an exception while recording 'GOOD'
        # is handled like a verification failure.
        self._record_good(host, silent)
    except Exception as e:
        message = 'Failed: %s'
        if isinstance(e, AutoservNonCriticalVerifyError):
            message = '(Non-critical)Failed: %s'
        logging.exception(message, self.description)
        # Cache the exception so later calls re-raise it without
        # running `verify()` again.
        self._result = e
        self._record_fail(host, silent, e)
        # Increase verifier fail count if device health profile is
        # available to the host class.
        if hasattr(host, 'health_profile') and host.health_profile:
            host.health_profile.insert_failed_verifier(self.tag)
        raise
    finally:
        logging.debug('Finished verify task: %s.', type(self).__name__)

    # Only reached when `verify()` and the 'GOOD' record both succeeded.
    self._result = True
488 """ 489 raise NotImplementedError('Class %s does not implement ' 490 'verify()' % type(self).__name__) 491 492 def _is_good(self): 493 """Provide result of the verifier 494 495 @returns: a boolean or None value: 496 True - verifier passed 497 False - verifier did not pass 498 None - verifier did not run because it is not applicable 499 or blocked due to dependency failure 500 """ 501 if type(self._result) == type(True): 502 return self._result 503 elif isinstance(self._result, Exception): 504 return False 505 return None 506 507 508class RepairAction(_DependencyNode): 509 """ 510 Abstract class embodying one repair procedure. 511 512 A `RepairAction` is responsible for fixing one or more failed 513 `Verifier` checks, in order to make those checks pass. 514 515 Each repair action includes one or more verifier triggers that 516 determine when the repair action should run. A repair action 517 will call its `repair()` method if one or more of its triggers 518 fails. A repair action is successful if all of its triggers pass 519 after calling `repair()`. 520 521 A `RepairAction` is a subclass of `_DependencyNode`; if any of a 522 repair action's dependencies fail, the action does not check its 523 triggers, and doesn't call `repair()`. 524 525 Subclasses must supply these attributes: 526 * `repair()`: This is the method to perform the necessary 527 repair. The method should avoid most logging actions, but 528 can log DEBUG level messages if they provide significant 529 information for diagnosing failures. 530 * `description`: A one-line summary of the repair action for 531 debug log messages. 532 533 Subclasses must override both of the above attributes and should 534 not override any other attributes of this class. 535 536 The description string should be a simple sentence explaining the 537 operation that will be performed. Do not include a terminating 538 period. 
def _record_end_fail(self, host, silent, status, *args):
    """Log an 'END FAIL' status line.

    After the record is written, `status` is stored in `self.status`;
    this happens even when `silent` suppresses the record itself.

    @param host     Host which will record the status record.
    @param silent   Don't record the event if this is a true
                    value.
    @param status   Value to assign to `self.status`.
    @param args     Extra arguments to `self._record()`
    """
    self._record(host, silent, 'END FAIL', *args)
    self.status = status
593 Can be one of below value: 594 'dep' during verify dependencies 595 'pre' during pre-repair trigger verification 596 'repair' during repair() process itself 597 'post' during post-repair trigger verification 598 """ 599 600 def get_fields(vf_tag): 601 fields = { 602 'ra_tag': self.tag, 603 'vf_tag': vf_tag, 604 'hostname': _filter_metrics_hostname(host), 605 'stage': stage, 606 'host_class': self.host_class 607 } 608 return fields 609 610 if isinstance(error, AutoservVerifyDependencyError): 611 # We'll catch all failure tags here for a dependencies error 612 for f in error.failures: 613 self._failure_modes_counter.increment(fields=get_fields(f.tag)) 614 else: 615 # When there is failure during repair or unknown failure. there 616 # will be no Verifier, so vf_tag set to 'unknown'. 617 self._failure_modes_counter.increment(fields=get_fields('unknown')) 618 619 if stage == 'repair': 620 self._send_failure_detail(error) 621 622 def _send_failure_detail(self, error): 623 """Send reason of failure inside repair() to monarch. 624 625 @param error The exception caught inside repair(). 626 """ 627 tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown' 628 fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag} 629 self._failure_detail_counter.increment(fields=fields) 630 631 def _repair_host(self, host, silent): 632 """ 633 Apply this repair action if any triggers fail. 634 635 Repair is triggered when all dependencies are successful, and at 636 least one trigger fails. 637 638 If the `repair()` method triggers, the success or failure of 639 this operation is logged in `status.log` bracketed by 'START' 640 and 'END' records. Details of whether or why `repair()` 641 triggered are written to the debug logs. If repair doesn't 642 trigger, nothing is logged to `status.log`. 643 644 @param host The host to be repaired. 645 @param silent If true, don't log host status records. 
646 """ 647 # Note: Every exit path from the method must set `self.status`. 648 # There's a lot of exit paths, so be careful. 649 # 650 # If we're blocked by a failed dependency, we exit with an 651 # exception. So set status to 'blocked' first. 652 self.status = 'blocked' 653 try: 654 self._verify_dependencies(host, silent) 655 except Exception as e: 656 self._send_failure_metrics(host, e, 'dep') 657 raise 658 659 self.status = 'skipped' 660 try: 661 if not self._is_applicable(host): 662 logging.info('RepairAction is not applicable, skipping repair: %s', 663 self.description) 664 return 665 except Exception as e: 666 logging.error('Skipping %s repair action due to unexpect error' 667 ' during check applicability; %s', self.tag, e) 668 return 669 # This is a defensive action. Every path below should overwrite 670 # this setting, but if it doesn't, we want our status to reflect 671 # a coding error. 672 self.status = 'unknown' 673 try: 674 self._verify_list(host, self._trigger_list, silent) 675 except AutoservVerifyDependencyError as e: 676 e.log_dependencies( 677 'Attempting this repair action', 678 'Repairing because these triggers failed') 679 self._send_failure_metrics(host, e, 'pre') 680 self._record_start(host, silent) 681 try: 682 self.repair(host) 683 # Increase action success count if device health profile is 684 # available to the host class. 685 if hasattr(host, 'health_profile') and host.health_profile: 686 host.health_profile.insert_succeed_repair_action(self.tag) 687 except Exception as e: 688 logging.exception('Repair failed: %s', self.description) 689 self._record_fail(host, silent, e) 690 self._record_end_fail(host, silent, 'repair_failure') 691 self._send_failure_metrics(host, e, 'repair') 692 # Increase action fail count if device health profile is 693 # available to the host class. 
694 if hasattr(host, 'health_profile') and host.health_profile: 695 host.health_profile.insert_failed_repair_action(self.tag) 696 raise 697 try: 698 for v in self._trigger_list: 699 v._reverify() 700 self._verify_list(host, self._trigger_list, silent) 701 self._record_end_good(host, silent) 702 except AutoservVerifyDependencyError as e: 703 e.log_dependencies( 704 'This repair action reported success', 705 'However, these triggers still fail') 706 self._record_end_fail(host, silent, 'verify_failure') 707 self._send_failure_metrics(host, e, 'post') 708 raise AutoservRepairError( 709 'Some verification checks still fail', 'post_verify') 710 except Exception: 711 # The specification for `self._verify_list()` says 712 # that this can't happen; this is a defensive 713 # precaution. 714 self._record_end_fail(host, silent, 'unknown', 715 'Internal error in repair') 716 self._send_failure_metrics(host, e, 'post') 717 raise 718 else: 719 self.status = 'skipped' 720 logging.info('No failed triggers, skipping repair: %s', 721 self.description) 722 723 def repair(self, host): 724 """ 725 Apply this repair action to the given host. 726 727 This method is responsible for applying changes to fix failures 728 in one or more verification checks. The repair is considered 729 successful if the DUT passes the specific checks after this 730 method completes. 731 732 Implementations indicate failure by raising an exception. The 733 exception text should be a short, 1-line summary of the error. 734 The text should be concise and diagnostic, as it will appear in 735 `status.log` files. 736 737 If this method completes successfully, it returns without 738 raising any exception. 739 740 Implementations should avoid most logging actions, but can log 741 DEBUG level messages if they provide significant information for 742 diagnosing failures. 743 744 @param host The host to be repaired. 
class _RootVerifier(Verifier):
    """
    Helper verifier used by `RepairStrategy`.

    This node performs no check of its own, so it passes whenever it
    gets to run.  Its only purpose is to serve as the single root of
    the dependency DAG held by a `RepairStrategy` instance.
    """

    @property
    def description(self):
        """One-line summary used in log messages."""
        return 'All host verification checks pass'

    def verify(self, host):
        """Succeed unconditionally; `host` is not examined."""
        return None
The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.
845 846 `RepairStrategy` deps and triggers can only refer to verifiers, 847 not to other repair actions. 848 """ 849 850 # This name is reserved; clients may not use it. 851 ROOT_TAG = 'PASS' 852 853 @staticmethod 854 def _add_verifier(verifiers, constructor, tag, dep_tags): 855 """ 856 Construct and remember a verifier. 857 858 Create a `Verifier` using `constructor` and `tag`. Dependencies 859 for construction are found by looking up `dep_tags` in the 860 `verifiers` dictionary. 861 862 After construction, the new verifier is added to `verifiers`. 863 864 @param verifiers Dictionary of verifiers, indexed by tag. 865 @param constructor Verifier construction function. 866 @param tag Tag parameter for the construction function. 867 @param dep_tags Tags of dependencies for the constructor, to 868 be found in `verifiers`. 869 """ 870 assert tag not in verifiers 871 deps = [verifiers[d] for d in dep_tags] 872 verifiers[tag] = constructor(tag, deps) 873 874 def __init__(self, verifier_data, repair_data, host_class): 875 """ 876 Construct a `RepairStrategy` from simplified DAG data. 877 878 The input `verifier_data` object describes how to construct 879 verify nodes and the dependencies that relate them, as detailed 880 above. 881 882 The input `repair_data` object describes how to construct repair 883 actions and their dependencies and triggers, as detailed above. 884 885 @param verifier_data Iterable value with constructors for the 886 elements of the verification DAG and their 887 dependencies. 888 @param repair_data Iterable value with constructors for the 889 elements of the repair action list, and 890 their dependencies and triggers. 891 @property host_class A string identifier that identify what 892 class of host this repair strategy target 893 on, will be used as a field to send repair 894 metrics. 895 """ 896 # Metrics - we report on 'actions' for every repair action 897 # we execute; we report on 'strategy' for every complete 898 # repair operation. 
899 self._strategy_counter = metrics.Counter( 900 'chromeos/autotest/repair/repair_strategy_v2') 901 self._actions_counter = metrics.Counter( 902 'chromeos/autotest/repair/repair_actions') 903 self.host_class = host_class 904 # We use the `all_verifiers` list to guarantee that our root 905 # verifier will execute its dependencies in the order provided 906 # to us by our caller. 907 verifier_map = {} 908 all_tags = [] 909 dependencies = set() 910 for constructor, tag, deps in verifier_data: 911 self._add_verifier(verifier_map, constructor, tag, deps) 912 dependencies.update(deps) 913 all_tags.append(tag) 914 # Capture all the verifiers that have nothing depending on them. 915 root_tags = [t for t in all_tags if t not in dependencies] 916 self._add_verifier(verifier_map, _RootVerifier, 917 self.ROOT_TAG, root_tags) 918 self._verify_root = verifier_map[self.ROOT_TAG] 919 self._repair_actions = [] 920 for constructor, tag, deps, triggers in repair_data: 921 r = constructor(tag, 922 [verifier_map[d] for d in deps], 923 [verifier_map[t] for t in triggers], 924 self.host_class) 925 self._repair_actions.append(r) 926 927 def _send_strategy_metrics(self, host, result): 928 """Send repair strategy metrics to monarch 929 930 @param host The target to be repaired. 931 @param result A String that describe a final result for the 932 RepairStrategy. 933 """ 934 info = host.host_info_store.get() 935 board = info.board if info.board else 'unknown' 936 model = info.model if info.model else 'unknown' 937 fields = { 938 'board': board, 939 'host_class': self.host_class, 940 'hostname': _filter_metrics_hostname(host), 941 'model': model, 942 'result': result, 943 } 944 self._strategy_counter.increment(fields=fields) 945 946 def _send_action_metrics(self, host, ra): 947 """Send repair action metrics to monarch 948 949 @param host The target to be repaired. 950 @param ra an RepairAction instance. 
951 """ 952 fields = { 953 'tag': ra.tag, 954 'status': ra.status, 955 'hostname': _filter_metrics_hostname(host), 956 'host_class': self.host_class 957 } 958 self._actions_counter.increment(fields=fields) 959 960 def verify(self, host, silent=False): 961 """ 962 Run the verifier DAG on the given host. 963 964 @param host The target to be verified. 965 @param silent If true, don't log host status records. 966 """ 967 self._verify_root._reverify() 968 self._verify_root._verify_host(host, silent) 969 970 def repair(self, host, silent=False): 971 """ 972 Run the repair list on the given host. 973 974 @param host The target to be repaired. 975 @param silent If true, don't log host status records. 976 """ 977 self._verify_root._reverify() 978 attempted = False 979 for ra in self._repair_actions: 980 try: 981 logging.debug('Start repair task: %s.', type(ra).__name__) 982 ra._repair_host(host, silent) 983 except Exception as e: 984 # all logging and exception handling was done at 985 # lower levels 986 pass 987 finally: 988 self._send_action_metrics(host, ra) 989 logging.debug('Finished repair task: %s.', type(ra).__name__) 990 if ra.status not in ('skipped', 'blocked'): 991 attempted = True 992 993 result = 'failure' 994 try: 995 self._verify_root._verify_host(host, silent) 996 result = 'success' if attempted else 'not_attempted' 997 except: 998 if not attempted: 999 result = 'attempt_blocked' 1000 raise 1001 finally: 1002 self._send_strategy_metrics(host, result) 1003 1004 def verifier_is_good(self, tag): 1005 """Find and return result of a verifier. 
1006 1007 @param tag: key to be associated with verifier 1008 1009 @returns: a boolean or None value: 1010 True - verifier passed 1011 False - verifier did not pass 1012 None - verifier did not run because it is not applicable 1013 or blocked due to dependency failure 1014 """ 1015 verifier = self.node_by_tag(tag) 1016 if verifier is not None: 1017 result = verifier._is_good() 1018 logging.debug('Verifier with associated tag: %s found', tag) 1019 if result is None: 1020 logging.debug('%s did not run; it is not applicable to run ' 1021 'or blocked due to dependency failure', tag) 1022 elif result == True: 1023 logging.debug('Cached result of %s verifier is pass', tag) 1024 else: 1025 logging.debug('Cached result of %s verifier is fail', tag) 1026 return result 1027 logging.debug('Verifier with associated tag: %s not found', tag) 1028 return None 1029 1030 def node_by_tag(self, tag): 1031 """Find and return node by searched tag. 1032 1033 @param tag: key to be associated with node 1034 1035 @returns: _DependencyNode instance associated with tag 1036 """ 1037 node = self._verify_root._get_node_by_tag(tag) 1038 if node is None: 1039 for n in self._repair_actions: 1040 node = n._get_node_by_tag(tag) 1041 if node is not None: 1042 break 1043 return node 1044 1045 1046def _filter_metrics_hostname(host): 1047 """ 1048 Restrict format of hostnames we'll send to monarch 1049 1050 @param host An host instance(i.e. ServoHost, CrosHost) 1051 """ 1052 if re.match(_HOSTNAME_PATTERN, host.hostname): 1053 return host.hostname 1054 else: 1055 return _DISALLOWED_HOSTNAME 1056