xref: /aosp_15_r20/external/autotest/server/cros/device_health_profile/device_health_profile.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1#!/usr/bin/env python
2# Copyright 2020 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import os
7import copy
8import json
9import time
10import logging
11import shutil
12
13import common
14
15from autotest_lib.server.cros.device_health_profile.profile_constants import *
16
17
class DeviceHealthProfileError(Exception):
    """Base error raised for failures of a DeviceHealthProfile object."""
22
23
class InvalidDeviceHealthProfileKeyError(DeviceHealthProfileError):
    """Raised when a value is requested for an invalid health profile key."""
28
29
class DeviceHealthProfile(object):
    """Interface to the device health profile cached on a profile host.

    The profile is a dict that is stored as a json file named
    '<hostname>.profile'. The file is kept on the profile host (usually
    a labstation) or, for containerized servod setups, in a directory on
    the drone. A working copy of the file is kept in the result directory.

    Expected usage: create the object, call init_profile() to load (or
    create) the profile data, use the get_*/set_*/update_* methods and
    call close() to save the data back.
    """

    def __init__(self, hostname, host_info=None, result_dir=None):
        """Initialize the class.

        @param hostname:    The device hostname or identification.
        @param host_info:   A HostInfo object of the device of the profile.
        @param result_dir:  A result directory where we can keep local copy
                            of device profile; '/tmp' is used when omitted.
        """
        self._hostname = hostname
        # Cache host-info data.
        self._device_board = host_info.board if host_info else ''
        self._device_model = host_info.model if host_info else ''
        # The profile is located on the servo-host as a temporary location.
        # The servo-host is provided later by init_profile().
        self._profile_host = None
        # The flag is set when we set _profile_host.
        # For servod container setups we keep the device profile on the
        # drone instead of the servo-host.
        self._is_containerized_servod = False
        # Stays None until init_profile() loads or creates the profile.
        self._health_profile = None

        # Construct remote and local file path.
        self._filename = self._hostname + '.profile'
        self._remote_path = os.path.join(PROFILE_FILE_DIR, self._filename)
        result_dir = result_dir or '/tmp'
        self._local_path = os.path.join(result_dir, self._filename)

    def init_profile(self, profile_host):
        """Initialize device health profile data.

        If the cached file exists on profile host the method will download
        file to a local path and read data, otherwise create a profile data
        from template.

        @param profile_host: A ServoHost object, where is the location
                             we store device health for device.

        @raises DeviceHealthProfileError if the profile host is not
                provided or is not reachable by ping or ssh.
        """
        if not profile_host:
            raise DeviceHealthProfileError('The profile host is not provided.')
        self._profile_host = profile_host
        # When we work with containerized servod we do not have access to
        # the remote host and keep profiles in local volume on the drone.
        if self._profile_host.is_containerized_servod():
            self._is_containerized_servod = True
            # Set path to volume on the drone where we keep all profiles.
            self._remote_path = os.path.join(PROFILE_DIR_CONTAINER,
                                             self._filename)
        else:
            # Do a lightweight check to make sure the machine is up
            # (by ping), as we don't waste time on unreachable DUT.
            if not self._profile_host.check_cached_up_status():
                raise DeviceHealthProfileError(
                        'The profile host %s is not reachable via ping.' %
                        self._profile_host.hostname)

            # We also want try to check if the DUT is available for ssh.
            if not self._profile_host.is_up():
                raise DeviceHealthProfileError(
                        'The profile host %s is pingable but not sshable.' %
                        self._profile_host.hostname)

        if not self._sync_existing_profile():
            self._create_profile_from_template()

    def is_loaded(self):
        """Check if device profile was loaded or not."""
        return self._health_profile is not None

    def _require_loaded(self):
        """Make sure profile data is available before it is accessed.

        @raises DeviceHealthProfileError if the profile data was not loaded.
        """
        if self._health_profile is None:
            raise DeviceHealthProfileError(
                    'The device health profile of %s is not loaded,'
                    ' init_profile() has to be called first.' %
                    self._hostname)

    def _sync_existing_profile(self):
        """Sync health profile from remote profile host(servohost) and
        validate profile data is not corrupted or outdated.

        @returns True if sync and validate succeed otherwise False.
        """
        if self._is_containerized_servod:
            self._copy_from_local()
        else:
            if not self._profile_host.is_file_exists(self._remote_path):
                logging.debug('%s not exists on %s.', self._remote_path,
                              self._profile_host.hostname)
                return False
            self._download_profile()

        self._read_profile()
        return self._validate_profile_data(self._health_profile)

    def _download_profile(self):
        """Copy profile file from remote profile host to local path.
        """
        logging.debug('Downloading profile file from %s:%s to local path: %s',
                      self._profile_host.hostname,
                      self._remote_path,
                      self._local_path)
        self._profile_host.get_file(source=self._remote_path,
                                    dest=self._local_path)

    def _upload_profile(self):
        """Copy profile file from local path to remote profile host.
        """
        # Make sure the device health profile directory exists on profile host.
        self._profile_host.run('mkdir -p %s' % PROFILE_FILE_DIR,
                               ignore_status=True)

        logging.debug('Uploading profile from local path: %s to remote %s:%s',
                      self._local_path,
                      self._profile_host.hostname,
                      self._remote_path)
        self._profile_host.send_file(source=self._local_path,
                                     dest=self._remote_path)

    def _copy_from_local(self):
        """Copy profile from local volume to result directory.

        For Satlab all device profiles saved in special volume on the drone.
        The copy is skipped when the file does not exist in the volume.
        """
        if os.path.exists(self._remote_path):
            logging.info('Copying profile file from %s to local path: %s',
                         self._remote_path, self._local_path)
            shutil.copyfile(self._remote_path, self._local_path)
        else:
            logging.info(
                    'Skipping copy from remote path %s as file is not exist.',
                    self._remote_path)

    def _copy_to_local(self):
        """Copy profile file from result directory to local volume.

        For Satlab all device profiles saved in special volume on the drone.
        """
        logging.info('Copying profile file from %s to remote path: %s',
                     self._local_path, self._remote_path)
        shutil.copyfile(self._local_path, self._remote_path)

    def _read_profile(self):
        """Read profile data from the json file at local path.

        The profile data is left untouched when the local file does not
        exist or cannot be decoded; the caller is expected to validate
        the data afterwards.
        """
        if not os.path.exists(self._local_path):
            logging.info('Skipping reading as local file: %s is not exist.',
                         self._local_path)
            return
        logging.debug('Reading device health profile from: %s',
                      self._local_path)
        with open(self._local_path, 'r') as f:
            try:
                self._health_profile = json.load(f)
            except Exception as e:
                # Reading is best effort: a corrupted file must not fail
                # the caller, a new profile can be created from template.
                logging.warning('Could not decode %s to json format, the file'
                                ' may be corrupted; %s',
                                self._local_path, str(e))

    def _dump_profile(self):
        """Dump profile data into local file in json format.
        """
        logging.debug('Dumping device health profile to: %s', self._local_path)
        with open(self._local_path, 'w') as f:
            json.dump(self._health_profile, f)

    def _create_profile_from_template(self):
        """Create a new health profile dict from template.
        """
        logging.info('Creating new health profile from template for %s.',
                     self._hostname)
        # Deep copy, so nested values of the template are never shared.
        self._health_profile = copy.deepcopy(DEVICE_HEALTH_PROFILE_TEMPLATE)
        if self._device_board or self._device_model:
            self._set_board(self._device_board)
            self._set_model(self._device_model)
        self.refresh_update_time()

    def _validate_profile_data(self, data):
        """Validate the given profile data is in good state.

        @param data: the profile data to validate.

        @returns True if data is a dict of the current profile version and
                 its board/model do not conflict with the host_info data,
                 otherwise False.
        """
        logging.debug('Validating health profile data.')
        if not isinstance(data, dict):
            logging.debug('Non-dict type detected, the profile data'
                          ' may be corrupted.')
            return False

        # Validate that cached health profile version is not outdated.
        input_version = data.get(PROFILE_VERSION_KEY)
        if input_version != PROFILE_VERSION:
            logging.info('The input profile version: %s is outdated,'
                         ' expected version: %s', input_version,
                         PROFILE_VERSION)
            return False

        # Validate that cached board/model matches the device, in case
        # there was a decom/redeploy. An empty value on either side is
        # not treated as a mismatch.
        cached_board = data.get(BOARD_KEY)
        cached_model = data.get(MODEL_KEY)
        if (self._device_board and cached_board
                    and (self._device_board != cached_board)):
            logging.info(
                    'The board: %s from host_info does not match board: %s'
                    ' from cached profile, the device hardware probably has'
                    ' been changed.', self._device_board, cached_board)
            return False
        if (self._device_model and cached_model
                    and (self._device_model != cached_model)):
            logging.info(
                    'The model: %s from host_info does not match model: %s'
                    ' from cached profile, the device hardware probably has'
                    ' been changed.', self._device_model, cached_model)
            return False
        return True

    def _is_validate_profile_key(self, key):
        """Check if the key is one of the keys of the profile template."""
        return key in DEVICE_HEALTH_PROFILE_TEMPLATE

    def _update_profile(self, key, value):
        """The basic interface to set a value in health profile dictionary.

        An invalid key is only logged and the profile is not changed.

        @param key:   the health profile key to update.
        @param value: the new value of the key.

        @raises DeviceHealthProfileError if the profile data was not loaded.
        """
        if not self._is_validate_profile_key(key):
            logging.info('%s is an invalid health profile key.', key)
            return
        self._require_loaded()
        logging.debug('Updating health profile key %s to %s', key, value)
        self._health_profile[key] = value

    def _get_value(self, key):
        """The basic interface to get a value from health profile dictionary.

        @param key: the health profile key to read.

        @returns the value of the key or None if the profile data does
                 not have the key.

        @raises InvalidDeviceHealthProfileKeyError if the input key is
                not a valid device health profile key.
        @raises DeviceHealthProfileError if the profile data was not loaded.
        """
        if not self._is_validate_profile_key(key):
            raise InvalidDeviceHealthProfileKeyError(
                '%s is not a valid device health profile key' % key)
        self._require_loaded()
        return self._health_profile.get(key)

    @staticmethod
    def _current_timestamp():
        """Get the current local time formatted by TIME_PATTERN."""
        return time.strftime(TIME_PATTERN, time.localtime())

    @staticmethod
    def _timestamp_to_epoch(timestamp):
        """Convert a local time timestamp in TIME_PATTERN format to
        the unix time in int.
        """
        return int(time.mktime(time.strptime(timestamp, TIME_PATTERN)))

    def _increase_tag_count(self, key, tag):
        """Increase by 1 the count of the tag in the dict stored by key.

        @param key: the health profile key of a dict of tag counts.
        @param tag: the short identifier to increase the count for.
        """
        # A missing dict is treated as empty, a missing tag as zero.
        counts = self._get_value(key) or {}
        counts[tag] = counts.get(tag, 0) + 1
        self._update_profile(key, counts)

    def _set_board(self, board):
        """Set device board in the device health profile."""
        self._update_profile(BOARD_KEY, board)

    def _set_model(self, model):
        """Set device model in the device health profile."""
        self._update_profile(MODEL_KEY, model)

    @property
    def health_profile(self):
        """The profile data dict or None if the profile is not loaded."""
        return self._health_profile

    def get_board(self):
        """Get device board from cached device health profile.
        """
        return self._get_value(BOARD_KEY)

    def get_model(self):
        """Get device model from cached device health profile.
        """
        return self._get_value(MODEL_KEY)

    def get_profile_version(self):
        """Get the version of cached device health profile.
        """
        return self._get_value(PROFILE_VERSION_KEY)

    def get_dut_state(self):
        """Get most recent dut state from device health profile.
        """
        return self._get_value(DUT_STATE_KEY)

    def get_servo_state(self):
        """Get most recent servo state from device health profile.
        """
        return self._get_value(SERVO_STATE_KEY)

    def get_cros_stable_version(self):
        """Get the most recent used cros image during repair.
        """
        return self._get_value(CROS_STABLE_VERSION_KEY)

    def get_firmware_stable_version(self):
        """Get the most recent used firmware image during repair, we only
        expect to see this on non-faft pool device.
        """
        return self._get_value(FIRMWARE_STABLE_VERSION_KEY)

    def get_last_update_time(self):
        """Get the timestamp of when device health profile file received
        the most recent updates. Example "2020-01-01 15:05:05"
        """
        return self._get_value(LAST_UPDATE_TIME_KEY)

    def get_last_update_time_epoch(self):
        """Get the unix time in int of when device health profile file
        received the most recent updates.
        """
        return self._timestamp_to_epoch(self.get_last_update_time())

    def get_enter_current_state_time(self):
        """Get the timestamp of when DUT enter current state.
        Example "2020-01-01 15:05:05"
        """
        return self._get_value(TIME_ENTER_CURRENT_STATE_KEY)

    def get_enter_current_state_time_epoch(self):
        """Get the unix time in int of when DUT enter current state.
        """
        return self._timestamp_to_epoch(self.get_enter_current_state_time())

    def get_repair_fail_count(self):
        """Get repair fail count since enter current state.
        """
        return self._get_value(REPAIR_FAIL_COUNT_KEY)

    def get_provision_fail_count(self):
        """Get provision fail count since enter current state.
        """
        return self._get_value(PROVISION_FAIL_COUNT_KEY)

    def get_failed_verifiers(self):
        """Get all failed verifiers.

        @returns a dict represents all failed verifiers and
                 their fail count.
        """
        return self._get_value(FAILED_VERIFIERS_KEY)

    def get_failed_verifier(self, tag):
        """Get fail count of a specific verifier.

        @param tag: the short identifier of the verifier.

        @returns the fail count of the specified verifier, 0 if there
                 is no record for it.
        """
        return (self.get_failed_verifiers() or {}).get(tag, 0)

    def get_succeed_repair_actions(self):
        """Get all repair actions that has been applied and succeed.

        @returns a dict represents all succeed repair actions
                 and their success count.
        """
        return self._get_value(SUCCEED_REPAIR_ACTIONS_KEY)

    def get_succeed_repair_action(self, tag):
        """Get success count of a specific repair action.

        @param tag: the short identifier of the repair action.

        @returns the success count of the specified repair action, 0 if
                 there is no record for it.
        """
        return (self.get_succeed_repair_actions() or {}).get(tag, 0)

    def get_failed_repair_actions(self):
        """Get all repair actions that has been applied and failed.

        @returns a dict represents all failed repair actions
                 and their fail count.
        """
        return self._get_value(FAILED_REPAIR_ACTIONS_KEY)

    def get_failed_repair_action(self, tag):
        """Get fail count of a specific repair action.

        @param tag: the short identifier of the repair action.

        @returns the failed count of the specified repair action, 0 if
                 there is no record for it.
        """
        return (self.get_failed_repair_actions() or {}).get(tag, 0)

    def get_badblocks_ro_run_time(self):
        """Get the timestamp of when run last read-only badblocks check
        on the device. Example "2020-01-01 15:05:05"

        @returns the recorded timestamp or DEFAULT_TIMESTAMP if there
                 is no record.
        """
        last_time = self._get_value(LAST_BADBLOCKS_RO_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_badblocks_ro_run_time_epoch(self):
        """Get the unix time of when run last read-only badblocks check
        on the device.
        """
        return self._timestamp_to_epoch(self.get_badblocks_ro_run_time())

    def get_badblocks_rw_run_time(self):
        """Get the timestamp of when run last read-write badblocks check
        on the device. Example "2020-01-01 15:05:05"

        @returns the recorded timestamp or DEFAULT_TIMESTAMP if there
                 is no record.
        """
        last_time = self._get_value(LAST_BADBLOCKS_RW_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_badblocks_rw_run_time_epoch(self):
        """Get the unix time of when run last read-write badblocks check
        on the device.
        """
        return self._timestamp_to_epoch(self.get_badblocks_rw_run_time())

    def get_servo_micro_fw_update_time(self):
        """Get the timestamp of when run last fw update for servo_micro.
        Example "2020-01-01 15:05:05"

        @returns the recorded timestamp or DEFAULT_TIMESTAMP if there
                 is no record.
        """
        last_time = self._get_value(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_servo_micro_fw_update_time_epoch(self):
        """Get the unix time of when run last fw update for servo_micro.
        """
        return self._timestamp_to_epoch(
                self.get_servo_micro_fw_update_time())

    def set_cros_stable_version(self, build):
        """Set the most recent used cros image during repair.
        """
        self._update_profile(CROS_STABLE_VERSION_KEY, build)

    def set_firmware_stable_version(self, build):
        """Set the most recent used firmware image during repair, we only
        expect to see this on non-faft pool device.
        """
        self._update_profile(FIRMWARE_STABLE_VERSION_KEY, build)

    def refresh_badblocks_ro_run_time(self):
        """Set the timestamp of the last read-only badblocks check on
        the device to the current local time.
        """
        self._update_profile(LAST_BADBLOCKS_RO_RUN_TIME_KEY,
                             self._current_timestamp())

    def refresh_badblocks_rw_run_time(self):
        """Set the timestamp of the last read-write badblocks check on
        the device to the current local time.
        """
        self._update_profile(LAST_BADBLOCKS_RW_RUN_TIME_KEY,
                             self._current_timestamp())

    def refresh_servo_micro_fw_update_run_time(self):
        """Set the timestamp of the last fw update for servo_micro to
        the current local time.
        """
        self._update_profile(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY,
                             self._current_timestamp())

    # The original name of the method has a typo ('miro'), it is kept as
    # an alias to not break existing callers.
    refresh_servo_miro_fw_update_run_time = (
            refresh_servo_micro_fw_update_run_time)

    def refresh_update_time(self):
        """Update last_update_time to the current timestamp in local time.
        """
        self._update_profile(LAST_UPDATE_TIME_KEY, self._current_timestamp())

    def increase_repair_fail_count(self):
        """Increase repair fail count by 1."""
        self._update_profile(REPAIR_FAIL_COUNT_KEY,
                             (self.get_repair_fail_count() or 0) + 1)

    def increase_provision_fail_count(self):
        """Increase provision fail count by 1."""
        self._update_profile(PROVISION_FAIL_COUNT_KEY,
                             (self.get_provision_fail_count() or 0) + 1)

    def insert_failed_verifier(self, tag):
        """Increase fail count for a specific verifier by 1.

        @param tag: the short identifier of the verifier.
        """
        self._increase_tag_count(FAILED_VERIFIERS_KEY, tag)

    def insert_succeed_repair_action(self, tag):
        """Increase succeed count for a specific repair action by 1.

        @param tag: the short identifier of the repair action.
        """
        self._increase_tag_count(SUCCEED_REPAIR_ACTIONS_KEY, tag)

    def insert_failed_repair_action(self, tag):
        """Increase fail count for a specific repair action by 1.

        @param tag: the short identifier of the repair action.
        """
        self._increase_tag_count(FAILED_REPAIR_ACTIONS_KEY, tag)

    def update_dut_state(self, state, reset_counters=False):
        """Update state of the device.

        If the device is already in the given state nothing is updated,
        except that the repair fail count is increased by 1 when the
        state is DUT_STATE_REPAIR_FAILED. Otherwise the new state and
        the time of entering it are recorded.

        @param state: the new dut state to update.
        @param reset_counters: a boolean to indicate whether we want to
                               reset all counters when the state changes.
        """
        if state == self.get_dut_state():
            logging.debug('The host is already in %s state.', state)
            if state == DUT_STATE_REPAIR_FAILED:
                self.increase_repair_fail_count()
            return
        # Reset some records when dut state changes.
        if reset_counters:
            self._update_profile(REPAIR_FAIL_COUNT_KEY, 0)
            self._update_profile(PROVISION_FAIL_COUNT_KEY, 0)
            self._update_profile(FAILED_VERIFIERS_KEY, {})
            self._update_profile(SUCCEED_REPAIR_ACTIONS_KEY, {})
            self._update_profile(FAILED_REPAIR_ACTIONS_KEY, {})
        self._update_profile(TIME_ENTER_CURRENT_STATE_KEY,
                             self._current_timestamp())
        self._update_profile(DUT_STATE_KEY, state)

    def update_servo_state(self, state):
        """Update state of the servo if it differs from the recorded one.

        @param state: the new servo state to update.
        """
        if state == self.get_servo_state():
            logging.debug('The servo is already in %s state.', state)
            return
        self._update_profile(SERVO_STATE_KEY, state)

    def close(self):
        """Save the profile data.

        Refresh the last update time, dump the data to the local file and
        copy the file to the local volume (containerized servod) or upload
        it to the profile host.

        @raises DeviceHealthProfileError if the profile data was not loaded.
        """
        self.refresh_update_time()
        self._dump_profile()
        if self._is_containerized_servod:
            self._copy_to_local()
        else:
            self._upload_profile()
552