# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import re
import time

from autotest_lib.client.common_lib import error

# Storage types supported
STORAGE_TYPE_SSD = 'ssd'
STORAGE_TYPE_NVME = 'nvme'
STORAGE_TYPE_MMC = 'mmc'

# Storage states supported
STORAGE_STATE_NORMAL = 'normal'
STORAGE_STATE_WARNING = 'warning'
STORAGE_STATE_CRITICAL = 'critical'

BADBLOCK_CHECK_RO = 'RO'
BADBLOCK_CHECK_RW = 'RW'


class StorageError(error.TestFail):
    """Custom error class raised when storage info is unsupported or
    unavailable for detection.
    """
    pass


class ConsoleError(error.TestFail):
    """Common error class for servod console-based control failures."""
    pass


class StorageStateValidator(object):
    """Class to detect the type and state of the DUT storage.

    The class supports SSD, NVMe and MMC storage types.
    The detected state is one of:
      - normal   - drive is in good shape
      - warning  - drive is close to worn out by some metric
      - critical - drive is worn out and has errors
    """

    def __init__(self, host):
        """Initialize the storage validator.

        @param host: cros_host object providing console access
                     for reading the target info.

        @raises ConsoleError: if the info cannot be read
        @raises StorageError: if the info is not present
        """
        self._host = host
        self._storage_type = None
        self._storage_state = None
        self._info = []

        if not self._host:
            raise StorageError('Host is not provided')

        self._read_storage_info()

    def _read_storage_info(self):
        """Read the storage info from SMART.

        The info is stored as a collection of lines.

        @raises StorageError: if no info is provided or data is unavailable
        """
        logging.info('Extracting storage info')
        command = '. /usr/share/misc/storage-info-common.sh; get_storage_info'
        cmd_result = self._host.run(command, ignore_status=True)
        if cmd_result.exit_status != 0:
            raise StorageError('receive error: %s' % cmd_result.stderr)

        if cmd_result.stdout:
            self._info = cmd_result.stdout.splitlines()
        if len(self._info) == 0:
            raise StorageError('Storage info is empty')

    def get_type(self):
        """Determine the type of the storage on the host.

        @returns storage type (SSD, NVMe, MMC)

        @raises StorageError: if the type is not supported or cannot be
                determined
        """
        if not self._storage_type:
            self._storage_type = self._get_storage_type()
        return self._storage_type

    def get_state(self, run_badblocks=None):
        """Determine the state of the storage on the host.

        @param run_badblocks: string key controlling the badblocks check.
                None  - check whether we can run it
                "NOT" - do not run the check
                "RW"  - run the read-write check if booted from USB
                "RO"  - run the read-only check
        @returns storage state (normal|warning|critical)

        @raises StorageError: if the type is not supported or the state
                cannot be determined
        """
        if not self._storage_state:
            storage_type = self.get_type()
            if storage_type == STORAGE_TYPE_SSD:
                self._storage_state = self._get_state_for_ssd()
            elif storage_type == STORAGE_TYPE_MMC:
                self._storage_state = self._get_state_for_mmc()
            elif storage_type == STORAGE_TYPE_NVME:
                self._storage_state = self._get_state_for_nvme()
        if (run_badblocks != 'NOT'
                    and self._storage_state != STORAGE_STATE_CRITICAL
                    and self._support_health_profile()):
            # Run badblocks only if the storage is not in a critical state.
            # If a bad block is found then mark the storage as bad.
            logging.info('Trying to run badblocks on the device')
            dhp = self._host.health_profile
            usb_boot = self._host.is_boot_from_external_device()
            if run_badblocks is None:
                if _is_time_to_run_badblocks_ro(dhp):
                    run_badblocks = BADBLOCK_CHECK_RO
                if usb_boot and _is_time_to_run_badblocks_rw(dhp):
                    run_badblocks = BADBLOCK_CHECK_RW
            logging.debug('run_badblocks=%s', run_badblocks)
            if usb_boot and run_badblocks == BADBLOCK_CHECK_RW:
                self._run_read_write_badblocks_check()
                dhp.refresh_badblocks_rw_run_time()
                # RO is a subset of RW, so refresh its timestamp too.
                dhp.refresh_badblocks_ro_run_time()
            if run_badblocks == BADBLOCK_CHECK_RO:
                # SMART stats sometimes do not report an issue when blocks
                # are bad for reading, so we also run the RO check.
                self._run_readonly_badblocks_check()
                dhp.refresh_badblocks_ro_run_time()
        return self._storage_state

    def _get_storage_type(self):
        """Read the info to detect the type of the storage by patterns."""
        logging.info('Extracting storage type')
        # Example "SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s)"
        sata_detect = r"SATA Version is:.*"

        # Example "   Extended CSD rev 1.7 (MMC 5.0)"
        mmc_detect = r"\s*Extended CSD rev.*MMC (?P<version>\d+.\d+)"

        # Example "SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)"
        nvme_detect = r".*NVMe Log .*"

        for line in self._info:
            if re.match(sata_detect, line):
                logging.info('Found SATA device')
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_SSD

            m = re.match(mmc_detect, line)
            if m:
                version = m.group('version')
                logging.info('Found eMMC device, version: %s', version)
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_MMC

            if re.match(nvme_detect, line):
                logging.info('Found NVMe device')
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_NVME
        raise StorageError('Storage type cannot be detected')

    def _get_state_for_ssd(self):
        """Read the info to detect the state for SSD storage."""
        logging.info('Extracting metrics for SSD storage')
        # Field meaning and example line that has a failing attribute:
        # https://en.wikipedia.org/wiki/S.M.A.R.T.
        # ID# ATTRIBUTE_NAME     FLAGS   VALUE WORST THRESH FAIL RAW_VALUE
        # 184 End-to-End_Error   PO--CK  001   001   097    NOW  135
        ssd_fail = r"""\s*(?P<param>\S+\s\S+)    # ID and attribute name
                    \s+[P-][O-][S-][R-][C-][K-]  # flags
                    (\s+\d{3}){3}                # three 3-digit numbers
                    \s+NOW                       # fail indicator"""

        ssd_relocate_sectors = r"""\s*\d\sReallocated_Sector_Ct
                    \s*[P-][O-][S-][R-][C-][K-]  # flags
                    \s*(?P<value>\d{3})          # VALUE
                    \s*(?P<worst>\d{3})          # WORST
                    \s*(?P<thresh>\d{3})         # THRESH
                    """
        # future optimization: read the GP log and determine the percentage
        # The patterns above contain inline comments and layout whitespace,
        # so they must be matched in verbose mode.
        for line in self._info:
            if re.match(ssd_fail, line, re.VERBOSE):
                logging.debug('Found fail line => %s', line)
                return STORAGE_STATE_CRITICAL

            m = re.match(ssd_relocate_sectors, line, re.VERBOSE)
            if m:
                logging.info('Found Reallocated_Sector_Ct line => %s', line)
                value = int(m.group('value'))
                # The manufacturer-set default value is 100; if the number
                # starts to grow then it is time to mark the drive.
                if value > 100:
                    return STORAGE_STATE_WARNING
        return STORAGE_STATE_NORMAL

    def _get_state_for_mmc(self):
        """Read the info to detect the state for MMC storage."""
        logging.debug('Extracting metrics for MMC storage')
        # Ex:
        # Device life time type A [DEVICE_LIFE_TIME_EST_TYP_A: 0x01]
        #  0x00~9 means 0-90% band
        #  0x0a   means 90-100% band
        #  0x0b   means over 100% band
        mmc_fail_lev = r""".*(?P<param>DEVICE_LIFE_TIME_EST_TYP_.)]?:\s*
                        0x0(?P<val>\S)"""  # life time percentage

        # Ex "Pre EOL information [PRE_EOL_INFO: 0x01]"
        #  0x00 - not defined
        #  0x01 - Normal
        #  0x02 - Warning, consumed 80% of the reserved blocks
        #  0x03 - Urgent, consumed 90% of the reserved blocks
        mmc_fail_eol = r".*(?P<param>PRE_EOL_INFO.)]?: 0x0(?P<val>\d)"

        eol_value = 0
        lev_value = -1
        for line in self._info:
            m = re.match(mmc_fail_lev, line, re.VERBOSE)
            if m:
                param = m.group('val')
                logging.debug('Found line for lifetime estimate => %s', line)
                if 'a' == param:
                    val = 100
                elif 'b' == param:
                    val = 101
                else:
                    val = int(param) * 10
                if val > lev_value:
                    lev_value = val
                continue

            m = re.match(mmc_fail_eol, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for end-of-life => %s', line)
                eol_value = int(param)
                break

        # set the state based on end-of-life
        if eol_value == 3:
            return STORAGE_STATE_CRITICAL
        elif eol_value == 2:
            return STORAGE_STATE_WARNING
        elif eol_value == 1:
            return STORAGE_STATE_NORMAL

        # set the state based on lifetime estimates
        elif lev_value < 90:
            return STORAGE_STATE_NORMAL
        elif lev_value < 100:
            return STORAGE_STATE_WARNING
        return STORAGE_STATE_CRITICAL

    def _get_state_for_nvme(self):
        """Read the info to detect the state for NVMe storage."""
        logging.debug('Extracting metrics for NVMe storage')
        # Ex "Percentage Used: 100%"
        nvme_fail = r"Percentage Used:\s+(?P<param>(\d{1,3}))%"
        used_value = -1
        for line in self._info:
            m = re.match(nvme_fail, line)
            if m:
                param = m.group('param')
                logging.debug('Found line for usage => %s', line)
                try:
                    used_value = int(param)
                except ValueError:
                    logging.info('Could not cast %s to int', param)
                break

        if used_value < 91:
            return STORAGE_STATE_NORMAL
        # Do not mark the device as critical even when it reaches 100% usage.
        # TODO(otabek) crbug.com/1140507 re-evaluate the max usage
        return STORAGE_STATE_WARNING

    def _get_device_storage_path(self):
        """Find and return the path to the device storage.

        The method supports detection even when the device is booted
        from USB.

        @returns path to the main device, like '/dev/XXXX'
        """
        # find the name of the device storage
        cmd = ('. /usr/sbin/write_gpt.sh;'
               ' . /usr/share/misc/chromeos-common.sh;'
               ' load_base_vars; get_fixed_dst_drive')
        cmd_result = self._host.run(cmd,
                                    ignore_status=True,
                                    timeout=60)
        if cmd_result.exit_status != 0:
            logging.debug('Failed to detect the path to the device storage')
            return None
        return cmd_result.stdout.strip()

    def _run_readonly_badblocks_check(self):
        """Run a read-only badblocks verification on the device storage.

        The block size is set to 512.
        """
        path = self._get_device_storage_path()
        if not path:
            # cannot continue if the storage was not detected
            return
        logging.info("Running read-only badblocks check; path=%s", path)
        cmd = 'badblocks -e 1 -s -b 512 %s' % path
        try:
            # Set the limit to 1 hour but expect it to finish within
            # 30 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=3600)
            if cmd_result.exit_status != 0:
                logging.debug('Failed to run badblocks check; %s', cmd)
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found;
                # an empty result means the device is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout during running action')
            logging.debug(str(e))

    def _run_read_write_badblocks_check(self):
        """Run a non-destructive read-write check on the device storage.

        The block size is set to 4096.
        This check can be run only when the DUT is booted from USB.
        """
        path = self._get_device_storage_path()
        if not path:
            # cannot continue if the storage was not detected
            return
        logging.info("Running read-write badblocks check; path=%s", path)
        cmd = 'badblocks -e 1 -nsv -b 4096 %s' % path
        try:
            # Set the limit to 90 minutes but expect it to finish within
            # 50 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=5400)
            if cmd_result.exit_status != 0:
                logging.debug('Failed to run badblocks check; %s', cmd)
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found;
                # an empty result means the device is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout during running action')
            logging.info('(Not critical) %s', e)

    def _support_health_profile(self):
        """Check whether the host provides a device health profile."""
        return (hasattr(self._host, 'health_profile')
                and self._host.health_profile)


def _is_time_to_run_badblocks_ro(dhp):
    """Verify that the device can run the read-only badblocks check.

    The RO check can be executed no more often than once per 6 days.

    @returns True if the check can proceed, False otherwise
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_ro_run_time_epoch()
    can_run = today_time > (last_check + (6 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'RO badblocks is not allowed because it was run recently;'
                ' last run: %s. The RO check may run only once per 6 days',
                dhp.get_badblocks_ro_run_time())
    return can_run


def _is_time_to_run_badblocks_rw(dhp):
    """Verify that the device can run the read-write badblocks check.

    The RW check can be executed no more often than once per 60 days.

    @returns True if the check can proceed, False otherwise
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_rw_run_time_epoch()
    can_run = today_time > (last_check + (60 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'RW badblocks is not allowed because it was run recently;'
                ' last run: %s. The RW check may run only once per 60 days',
                dhp.get_badblocks_rw_run_time())
    return can_run
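

# Example usage (an illustrative sketch only, not part of the validator).
# The name `host` below is an assumption: any cros_host-like object that
# provides run() and, for the badblocks checks, health_profile and
# is_boot_from_external_device() should work the same way.
#
#     validator = StorageStateValidator(host)
#     storage_type = validator.get_type()    # 'ssd' | 'nvme' | 'mmc'
#     # 'NOT' skips badblocks; pass None to let the validator decide.
#     storage_state = validator.get_state(run_badblocks='NOT')
#     if storage_state == STORAGE_STATE_CRITICAL:
#         ...  # e.g. flag the DUT storage for replacement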