#!/usr/bin/env python
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
import copy
import json
import time
import logging
import shutil

import common

from autotest_lib.server.cros.device_health_profile.profile_constants import *


class DeviceHealthProfileError(Exception):
    """
    Generic Exception for failures from DeviceHealthProfile object.
    """


class InvalidDeviceHealthProfileKeyError(DeviceHealthProfileError):
    """
    Exception to throw when trying to get an invalid health profile key.
    """


class DeviceHealthProfile(object):
    """This class provides interfaces to access the device health profile
    that is cached on the profile host (usually a labstation).

    The profile is a JSON-serializable dict. It is read into memory by
    init_profile(), modified through the set/refresh/insert/update methods,
    and written back to the profile host (or to a local volume for
    containerized servod) by close().
    """

    def __init__(self, hostname, host_info=None, result_dir=None):
        """Initialize the class.

        No I/O is performed here; the profile data is loaded later by
        init_profile().

        @param hostname: The device hostname or identification.
        @param host_info: A HostInfo object of the device of the profile.
        @param result_dir: A result directory where we can keep local copy of
                           device profile. Defaults to '/tmp'.
        """
        self._hostname = hostname
        # Cache host-info data.
        self._device_board = host_info.board if host_info else ''
        self._device_model = host_info.model if host_info else ''
        # The profile is located on the servo-host as a temporary location.
        # The servo-host will be provided later.
        self._profile_host = None
        # The flag will be set when we set _profile_host.
        # For servod container setups we keep the device profile on the
        # drone instead of the servo-host.
        self._is_containerized_servod = False
        self._health_profile = None

        # Construct remote and local file path.
        self._filename = self._hostname + '.profile'
        self._remote_path = os.path.join(PROFILE_FILE_DIR, self._filename)
        result_dir = result_dir or '/tmp'
        self._local_path = os.path.join(result_dir, self._filename)

    @staticmethod
    def _current_timestamp():
        """Return the current local time formatted with TIME_PATTERN."""
        return time.strftime(TIME_PATTERN, time.localtime())

    @staticmethod
    def _timestamp_to_epoch(timestamp):
        """Convert a TIME_PATTERN formatted local timestamp to unix time.

        @param timestamp: A string formatted with TIME_PATTERN.

        @returns the unix time as int.
        """
        return int(time.mktime(time.strptime(timestamp, TIME_PATTERN)))

    def init_profile(self, profile_host):
        """Initialize device health profile data.

        If the cached file exists on profile host the method will download
        file to a local path and read data, otherwise create a profile data
        from template.

        @param profile_host: A ServoHost object, where is the location
                             we store device health for device.

        @raises DeviceHealthProfileError if profile_host is not provided,
                or (for non-containerized servod) if the profile host is
                not reachable via ping or not available for ssh.
        """
        if not profile_host:
            raise DeviceHealthProfileError('The profile host is not provided.')
        self._profile_host = profile_host
        # When we work with containerized servod we do not have access to
        # the remote host and keep profiles in local volume on the drone.
        if self._profile_host.is_containerized_servod():
            self._is_containerized_servod = True
            # Set path to volume on the drone where we keep all profiles.
            self._remote_path = os.path.join(PROFILE_DIR_CONTAINER,
                                             self._filename)
        else:
            # Do a lightweight check to make sure the machine is up
            # (by ping), as we don't waste time on unreachable DUT.
            if not self._profile_host.check_cached_up_status():
                raise DeviceHealthProfileError(
                        'The profile host %s is not reachable via ping.' %
                        self._profile_host.hostname)

            # We also want try to check if the DUT is available for ssh.
            if not self._profile_host.is_up():
                raise DeviceHealthProfileError(
                        'The profile host %s is pingable but not sshable.' %
                        self._profile_host.hostname)

        if not self._sync_existing_profile():
            self._create_profile_from_template()

    def is_loaded(self):
        """Check if device profile was loaded or not."""
        return self._health_profile is not None

    def _sync_existing_profile(self):
        """Sync health profile from remote profile host(servohost) and
        validate profile data is not corrupted or outdated.

        @returns True if sync and validate succeed otherwise False.
        """
        if self._is_containerized_servod:
            self._copy_from_local()
        else:
            if not self._profile_host.is_file_exists(self._remote_path):
                logging.debug('%s not exists on %s.', self._remote_path,
                              self._profile_host.hostname)
                return False
            self._download_profile()

        self._read_profile()
        return self._validate_profile_data(self._health_profile)

    def _download_profile(self):
        """Copy profile file from remote profile host to local path.
        """
        logging.debug('Downloading profile file from %s:%s to local path: %s',
                      self._profile_host.hostname,
                      self._remote_path,
                      self._local_path)
        self._profile_host.get_file(source=self._remote_path,
                                    dest=self._local_path)

    def _upload_profile(self):
        """Copy profile file from local path to remote profile host.
        """
        # Make sure the device health profile directory exists on profile host.
        self._profile_host.run('mkdir -p %s' % PROFILE_FILE_DIR,
                               ignore_status=True)

        logging.debug('Uploading profile from local path: %s to remote %s:%s',
                      self._local_path,
                      self._profile_host.hostname,
                      self._remote_path)
        self._profile_host.send_file(source=self._local_path,
                                     dest=self._remote_path)

    def _copy_from_local(self):
        """Copy profile from local volume to result directory.

        For Satlab all device profiles saved in special volume on the drone.
        Nothing is copied when the file does not exist in the volume.
        """
        if os.path.exists(self._remote_path):
            logging.info('Copying profile file from %s to local path: %s',
                         self._remote_path, self._local_path)
            shutil.copyfile(self._remote_path, self._local_path)
        else:
            logging.info(
                    'Skipping copy from remote path %s as file is not exist.',
                    self._remote_path)

    def _copy_to_local(self):
        """Copy profile file from result directory to local volume.

        For Satlab all device profiles saved in special volume on the drone.
        """
        logging.info('Copying profile file from %s to remote path: %s',
                     self._local_path, self._remote_path)
        shutil.copyfile(self._local_path, self._remote_path)

    def _read_profile(self):
        """Read profile data from local path and parse it as json.

        The in-memory profile is left untouched when the local file does
        not exist or cannot be decoded.
        """
        if not os.path.exists(self._local_path):
            logging.info('Skipping reading as local file: %s is not exist.',
                         self._local_path)
            return
        logging.debug('Reading device health profile from: %s',
                      self._local_path)
        with open(self._local_path, 'r') as f:
            try:
                self._health_profile = json.load(f)
            except Exception as e:
                # Best effort: a corrupted file is reported and ignored so
                # the caller can fall back to a fresh profile.
                logging.warning('Could not decode %s to json format, the file'
                                ' may be corrupted; %s',
                                self._local_path, str(e))

    def _dump_profile(self):
        """Dump profile data into local file.
        """
        logging.debug('Dumping device health profile to: %s', self._local_path)
        with open(self._local_path, 'w') as f:
            json.dump(self._health_profile, f)

    def _create_profile_from_template(self):
        """Create a new health profile dict from template.

        Board and model from host_info are stored in the new profile when
        at least one of them is known.
        """
        logging.info('Creating new health profile from template for %s.',
                     self._hostname)
        self._health_profile = copy.deepcopy(DEVICE_HEALTH_PROFILE_TEMPLATE)
        if self._device_board or self._device_model:
            self._set_board(self._device_board)
            self._set_model(self._device_model)
        self.refresh_update_time()

    def _validate_profile_data(self, data):
        """Validate the given profile data is in good state.

        @param data: The profile data to validate.

        @returns True if data is a dict with the expected profile version
                 and a board/model that does not conflict with host_info,
                 otherwise False.
        """
        logging.debug('Validating health profile data.')
        if not isinstance(data, dict):
            logging.debug('Non-dict type detected, the profile data'
                          ' may be corrupted.')
            return False

        # Validate that cached health profile version is not outdated.
        input_version = data.get(PROFILE_VERSION_KEY)
        if input_version != PROFILE_VERSION:
            logging.info('The input profile version: %s is outdated,'
                         ' expected version: %s', input_version,
                         PROFILE_VERSION)
            return False

        # Validate that the cached board/model matches the device, in case
        # there was a decom/redeploy. An empty value on either side is
        # treated as "unknown" and does not fail validation.
        cached_board = data.get(BOARD_KEY)
        cached_model = data.get(MODEL_KEY)
        if (self._device_board and cached_board
                    and (self._device_board != cached_board)):
            logging.info(
                    'The board: %s from host_info does not match board: %s'
                    ' from cached profile, the device hardware probably has'
                    ' been changed.', self._device_board, cached_board)
            return False
        if (self._device_model and cached_model
                    and (self._device_model != cached_model)):
            logging.info(
                    'The model: %s from host_info does not match model: %s'
                    ' from cached profile, the device hardware probably has'
                    ' been changed.', self._device_model, cached_model)
            return False
        return True

    def _is_validate_profile_key(self, key):
        """Check whether key is one of the keys defined by the template."""
        return key in DEVICE_HEALTH_PROFILE_TEMPLATE

    def _update_profile(self, key, value):
        """The basic interface to set a value in health profile dictionary.

        Invalid keys are logged and ignored.

        @param key: The profile key to update.
        @param value: The new value for the key.
        """
        if not self._is_validate_profile_key(key):
            logging.info('%s is an invalid health profile key.', key)
            return
        logging.debug('Updating health profile key %s to %s', key, value)
        self._health_profile[key] = value

    def _get_value(self, key):
        """The basic interface to get a value from health profile dictionary.

        @param key: The profile key to read.

        @returns the value stored for the key, or None if it is not set.

        @raises InvalidDeviceHealthProfileKeyError if the input key is
                not a valid device health profile key.
        """
        if not self._is_validate_profile_key(key):
            raise InvalidDeviceHealthProfileKeyError(
                    '%s is not a valid device health profile key' % key)
        return self._health_profile.get(key)

    def _increase_tag_count(self, key, counters, tag):
        """Increase the count of tag by 1 and store counters under key.

        @param key: The profile key the counters dict belongs to.
        @param counters: The dict of tag -> count to update in place.
        @param tag: The short identifier whose count is increased.
        """
        counters[tag] = counters.get(tag, 0) + 1
        self._update_profile(key, counters)

    def _set_board(self, board):
        """Set device board in the health profile."""
        self._update_profile(BOARD_KEY, board)

    def _set_model(self, model):
        """Set device model in the health profile."""
        self._update_profile(MODEL_KEY, model)

    @property
    def health_profile(self):
        """The in-memory profile data, or None if it was not loaded."""
        return self._health_profile

    def get_board(self):
        """Get device board from cached device health profile.
        """
        return self._get_value(BOARD_KEY)

    def get_model(self):
        """Get device model from cached device health profile.
        """
        return self._get_value(MODEL_KEY)

    def get_profile_version(self):
        """Get the version of cached device health profile.
        """
        return self._get_value(PROFILE_VERSION_KEY)

    def get_dut_state(self):
        """Get most recent dut state from device health profile.
        """
        return self._get_value(DUT_STATE_KEY)

    def get_servo_state(self):
        """Get most recent servo state from device health profile.
        """
        return self._get_value(SERVO_STATE_KEY)

    def get_cros_stable_version(self):
        """Get the most recent used cros image during repair.
        """
        return self._get_value(CROS_STABLE_VERSION_KEY)

    def get_firmware_stable_version(self):
        """Get the most recent used firmware image during repair, we only
        expect to see this on non-faft pool device.
        """
        return self._get_value(FIRMWARE_STABLE_VERSION_KEY)

    def get_last_update_time(self):
        """Get the timestamp of when device health profile file received
        the most recent updates. Example "2020-01-01 15:05:05"
        """
        return self._get_value(LAST_UPDATE_TIME_KEY)

    def get_last_update_time_epoch(self):
        """Get the unix time in int of when device health profile file
        received the most recent updates.
        """
        return self._timestamp_to_epoch(self.get_last_update_time())

    def get_enter_current_state_time(self):
        """Get the timestamp of when DUT enter current state.
        Example "2020-01-01 15:05:05"
        """
        return self._get_value(TIME_ENTER_CURRENT_STATE_KEY)

    def get_enter_current_state_time_epoch(self):
        """Get the unix time in int of when DUT enter current state.
        """
        return self._timestamp_to_epoch(self.get_enter_current_state_time())

    def get_repair_fail_count(self):
        """Get repair fail count since enter current state.
        """
        return self._get_value(REPAIR_FAIL_COUNT_KEY)

    def get_provision_fail_count(self):
        """Get provision fail count since enter current state.
        """
        return self._get_value(PROVISION_FAIL_COUNT_KEY)

    def get_failed_verifiers(self):
        """Get all failed verifiers.

        @returns a dict represents all failed verifiers and
                 their fail count.
        """
        return self._get_value(FAILED_VERIFIERS_KEY)

    def get_failed_verifier(self, tag):
        """Get fail count of a specific verifier.

        @param tag: the short identifier of the verifier.

        @returns the fail count of the specified verifier.
        """
        return self.get_failed_verifiers().get(tag, 0)

    def get_succeed_repair_actions(self):
        """Get all repair actions that has been applied and succeed.

        @returns a dict represents all succeed repair actions
                 and their success count.
        """
        return self._get_value(SUCCEED_REPAIR_ACTIONS_KEY)

    def get_succeed_repair_action(self, tag):
        """Get success count of a specific repair action.

        @param tag: the short identifier of the repair action.

        @returns the success count of the specified repair action.
        """
        return self.get_succeed_repair_actions().get(tag, 0)

    def get_failed_repair_actions(self):
        """Get all repair actions that has been applied and failed.

        @returns a dict represents all failed repair actions
                 and their fail count.
        """
        return self._get_value(FAILED_REPAIR_ACTIONS_KEY)

    def get_failed_repair_action(self, tag):
        """Get fail count of a specific repair action.

        @param tag: the short identifier of the repair action.

        @returns the failed count of the specified repair action.
        """
        return self.get_failed_repair_actions().get(tag, 0)

    def get_badblocks_ro_run_time(self):
        """Get the timestamp of when run last read-only badblocks check
        on the device. Example "2020-01-01 15:05:05"

        @returns the stored timestamp, or DEFAULT_TIMESTAMP if not set.
        """
        last_time = self._get_value(LAST_BADBLOCKS_RO_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_badblocks_ro_run_time_epoch(self):
        """Get the unix time of when run last read-only badblocks check
        on the device.
        """
        return self._timestamp_to_epoch(self.get_badblocks_ro_run_time())

    def get_badblocks_rw_run_time(self):
        """Get the timestamp of when run last read-write badblocks check
        on the device. Example "2020-01-01 15:05:05"

        @returns the stored timestamp, or DEFAULT_TIMESTAMP if not set.
        """
        last_time = self._get_value(LAST_BADBLOCKS_RW_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_badblocks_rw_run_time_epoch(self):
        """Get the unix time of when run last read-write badblocks check
        on the device.
        """
        return self._timestamp_to_epoch(self.get_badblocks_rw_run_time())

    def get_servo_micro_fw_update_time(self):
        """Get the timestamp of when run last fw update for servo_micro.
        Example "2020-01-01 15:05:05"

        @returns the stored timestamp, or DEFAULT_TIMESTAMP if not set.
        """
        last_time = self._get_value(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_servo_micro_fw_update_time_epoch(self):
        """Get the unix time of when run last fw update for servo_micro.
        """
        return self._timestamp_to_epoch(self.get_servo_micro_fw_update_time())

    def set_cros_stable_version(self, build):
        """Set the most recent used cros image during repair.
        """
        self._update_profile(CROS_STABLE_VERSION_KEY, build)

    def set_firmware_stable_version(self, build):
        """Set the most recent used firmware image during repair, we only
        expect to see this on non-faft pool device.
        """
        self._update_profile(FIRMWARE_STABLE_VERSION_KEY, build)

    def refresh_badblocks_ro_run_time(self):
        """Set the timestamp of the last read-only badblocks check on the
        device to the current local time.
        """
        return self._update_profile(LAST_BADBLOCKS_RO_RUN_TIME_KEY,
                                    self._current_timestamp())

    def refresh_badblocks_rw_run_time(self):
        """Set the timestamp of the last read-write badblocks check on the
        device to the current local time.
        """
        return self._update_profile(LAST_BADBLOCKS_RW_RUN_TIME_KEY,
                                    self._current_timestamp())

    def refresh_servo_micro_fw_update_run_time(self):
        """Set the timestamp of the last fw update for servo_micro to the
        current local time.
        """
        return self._update_profile(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY,
                                    self._current_timestamp())

    def refresh_servo_miro_fw_update_run_time(self):
        """Misspelled name kept for existing callers; same behavior as
        refresh_servo_micro_fw_update_run_time().
        """
        return self.refresh_servo_micro_fw_update_run_time()

    def refresh_update_time(self):
        """Update last_update_time to the current timestamp in local time.
        """
        self._update_profile(LAST_UPDATE_TIME_KEY, self._current_timestamp())

    def increase_repair_fail_count(self):
        """Increase repair fail count by 1."""
        self._update_profile(REPAIR_FAIL_COUNT_KEY,
                             self.get_repair_fail_count() + 1)

    def increase_provision_fail_count(self):
        """Increase provision fail count by 1."""
        self._update_profile(PROVISION_FAIL_COUNT_KEY,
                             self.get_provision_fail_count() + 1)

    def insert_failed_verifier(self, tag):
        """Increase fail count for a specific verifier by 1.

        @param tag: the short identifier of the verifier.
        """
        self._increase_tag_count(FAILED_VERIFIERS_KEY,
                                 self.get_failed_verifiers(), tag)

    def insert_succeed_repair_action(self, tag):
        """Increase succeed count for a specific repair action by 1.

        @param tag: the short identifier of the repair action.
        """
        self._increase_tag_count(SUCCEED_REPAIR_ACTIONS_KEY,
                                 self.get_succeed_repair_actions(), tag)

    def insert_failed_repair_action(self, tag):
        """Increase fail count for a specific repair action by 1.

        @param tag: the short identifier of the repair action.
        """
        self._increase_tag_count(FAILED_REPAIR_ACTIONS_KEY,
                                 self.get_failed_repair_actions(), tag)

    def update_dut_state(self, state, reset_counters=False):
        """Update state of the device.

        If the device is already in the given state nothing is updated,
        except that the repair fail count is increased when the state is
        DUT_STATE_REPAIR_FAILED. Otherwise the time of entering the state
        is set to the current local time and the new state is stored.

        @param state: the new dut state to update.
        @param reset_counters: a boolean to indicate whether we want to
                               reset all counters when the state changes.
        """
        if state == self.get_dut_state():
            logging.debug('The host is already in %s state.', state)
            if state == DUT_STATE_REPAIR_FAILED:
                self.increase_repair_fail_count()
            return
        # Reset some records when dut state changes.
        if reset_counters:
            self._update_profile(REPAIR_FAIL_COUNT_KEY, 0)
            self._update_profile(PROVISION_FAIL_COUNT_KEY, 0)
            self._update_profile(FAILED_VERIFIERS_KEY, {})
            self._update_profile(SUCCEED_REPAIR_ACTIONS_KEY, {})
            self._update_profile(FAILED_REPAIR_ACTIONS_KEY, {})
        self._update_profile(TIME_ENTER_CURRENT_STATE_KEY,
                             self._current_timestamp())
        self._update_profile(DUT_STATE_KEY, state)

    def update_servo_state(self, state):
        """Update state of the servo if it differs from the stored one.

        @param state: the new servo state to update.
        """
        if state == self.get_servo_state():
            logging.debug('The servo is already in %s state.', state)
            return
        self._update_profile(SERVO_STATE_KEY, state)

    def close(self):
        """Persist the profile.

        Refreshes the last update time, dumps the profile to the local
        file and then copies it to the local volume (containerized servod)
        or uploads it to the profile host.
        """
        self.refresh_update_time()
        self._dump_profile()
        if self._is_containerized_servod:
            self._copy_to_local()
        else:
            self._upload_profile()