# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import re
import time

from autotest_lib.client.common_lib import error

# Storage types supported
STORAGE_TYPE_SSD = 'ssd'
STORAGE_TYPE_NVME = 'nvme'
STORAGE_TYPE_MMC = 'mmc'

# Storage states supported
STORAGE_STATE_NORMAL = 'normal'
STORAGE_STATE_WARNING = 'warning'
STORAGE_STATE_CRITICAL = 'critical'

BADBLOCK_CHECK_RO = 'RO'
BADBLOCK_CHECK_RW = 'RW'


class StorageError(error.TestFail):
    """Custom error class to indicate that storage info is unsupported
    or could not be detected.
    """
    pass


class ConsoleError(error.TestFail):
    """Common error class for servod console-back control failures."""
    pass


class StorageStateValidator(object):
    """Class to detect the type and state of the DUT storage.

    The class supports SSD, NVME and MMC storage types.
    The state is detected and reported as:
    - normal - drive is in a good shape
    - warning - drive is close to worn out by some metric
    - critical - drive is worn out and has errors
    """

    def __init__(self, host):
        """Initialize the storage validator.

        @param host: cros_host object providing console access
                     for reading the target info.

        @raises ConsoleError: if the info cannot be read
        @raises StorageError: if the info is not present
        """
        self._host = host
        self._storage_type = None
        self._storage_state = None
        self._info = []

        if not self._host:
            raise StorageError('Host is not provided')

        self._read_storage_info()

    def _read_storage_info(self):
        """Read the storage info from SMART.

        The info is collected as a list of output lines.
        @raises StorageError: if no info is provided or data is unavailable
        """
        logging.info('Extracting storage info')
        command = '. /usr/share/misc/storage-info-common.sh; get_storage_info'
        cmd_result = self._host.run(command, ignore_status=True)
        if cmd_result.exit_status != 0:
            raise StorageError('receive error: %s' % cmd_result.stderr)

        if cmd_result.stdout:
            self._info = cmd_result.stdout.splitlines()
        if len(self._info) == 0:
            raise StorageError('Storage info is empty')

    def get_type(self):
        """Determine the type of the storage on the host.

        @returns storage type (SSD, NVME, MMC)

        @raises StorageError: if the type is not supported or cannot be
                              determined
        """
        if not self._storage_type:
            self._storage_type = self._get_storage_type()
        return self._storage_type

    def get_state(self, run_badblocks=None):
        """Determine the state of the storage on the host.

        @param run_badblocks: string key to run badblock check.
                                None - check if we can run it
                                "NOT" - do not run check
                                "RW" - run read-write if booted from USB
                                "RO" - run read-only check
        @returns storage state (normal|warning|critical)

        @raises StorageError: if the type is not supported or the state
                            cannot be determined
        """
        if not self._storage_state:
            storage_type = self.get_type()
            if storage_type == STORAGE_TYPE_SSD:
                self._storage_state = self._get_state_for_ssd()
            elif storage_type == STORAGE_TYPE_MMC:
                self._storage_state = self._get_state_for_mms()
            elif storage_type == STORAGE_TYPE_NVME:
                self._storage_state = self._get_state_for_nvme()
        if (run_badblocks != 'NOT'
                    and self._storage_state != STORAGE_STATE_CRITICAL
                    and self._support_health_profile()):
            # Run badblocks only if the storage is not in critical state.
            # If bad blocks are found then the storage is marked as bad.
            logging.info('Trying to run badblocks on the device')
            dhp = self._host.health_profile
            usb_boot = self._host.is_boot_from_external_device()
            if run_badblocks is None:
                if _is_time_to_run_badblocks_ro(dhp):
                    run_badblocks = BADBLOCK_CHECK_RO
                if usb_boot and _is_time_to_run_badblocks_rw(dhp):
                    run_badblocks = BADBLOCK_CHECK_RW
            logging.debug('run_badblocks=%s', run_badblocks)
            if usb_boot and run_badblocks == BADBLOCK_CHECK_RW:
                self._run_read_write_badblocks_check()
                dhp.refresh_badblocks_rw_run_time()
                # RO is a subset of RW, so update its timestamp too.
                dhp.refresh_badblocks_ro_run_time()
            if run_badblocks == BADBLOCK_CHECK_RO:
                # SMART stats sometimes do not report an issue when blocks
                # are bad for reading, so we run the RO check explicitly.
                self._run_readonly_badblocks_check()
                dhp.refresh_badblocks_ro_run_time()
        return self._storage_state

    def _get_storage_type(self):
        """Read the info to detect the type of the storage by patterns."""
        logging.info('Extracting storage type')
        # Example "SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s)"
        sata_detect = r"SATA Version is:.*"

        # Example "   Extended CSD rev 1.7 (MMC 5.0)"
        mmc_detect = r"\s*Extended CSD rev.*MMC (?P<version>\d+.\d+)"

        # Example "SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)"
        nvme_detect = r".*NVMe Log .*"

        for line in self._info:
            if re.match(sata_detect, line):
                logging.info('Found SATA device')
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_SSD

            m = re.match(mmc_detect, line)
            if m:
                version = m.group('version')
                logging.info('Found eMMC device, version: %s', version)
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_MMC

            if re.match(nvme_detect, line):
                logging.info('Found NVMe device')
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_NVME
        raise StorageError('Storage type cannot be detected')

    def _get_state_for_ssd(self):
        """Read the info to detect the state for SSD storage."""
        logging.info('Extracting metrics for SSD storage')
        # Field meaning and example line that have failing attribute
        # https://en.wikipedia.org/wiki/S.M.A.R.T.
        # ID# ATTRIBUTE_NAME     FLAGS    VALUE WORST THRESH FAIL RAW_VALUE
        # 184 End-to-End_Error   PO--CK   001   001   097    NOW  135
        ssd_fail = r"""\s*(?P<param>\S+\s\S+)      # ID and attribute name
                    \s+[P-][O-][S-][R-][C-][K-] # flags
                    (\s+\d{3}){3}               # three 3-digits numbers
                    \s+NOW                      # fail indicator"""

        ssd_relocate_sectors = r"""\s*\d\sReallocated_Sector_Ct
                    \s*[P-][O-][S-][R-][C-][K-] # flags
                    \s*(?P<value>\d{3}) # VALUE
                    \s*(?P<worst>\d{3}) # WORST
                    \s*(?P<thresh>\d{3})# THRESH
                    """
        # future optimizations: read GPL and determine percentage
        for line in self._info:
            # The patterns use verbose syntax, so match with re.VERBOSE.
            if re.match(ssd_fail, line, re.VERBOSE):
                logging.debug('Found fail line => %s', line)
                return STORAGE_STATE_CRITICAL

            m = re.match(ssd_relocate_sectors, line, re.VERBOSE)
            if m:
                logging.info('Found reallocated sectors line => %s', line)
                value = int(m.group('value'))
                # The manufacturer sets the default value to 100;
                # if the number starts to grow then it is time to mark it.
                if value > 100:
                    return STORAGE_STATE_WARNING
        return STORAGE_STATE_NORMAL

    def _get_state_for_mms(self):
        """Read the info to detect the state for MMC storage."""
        logging.debug('Extracting metrics for MMC storage')
        # Ex:
        # Device life time type A [DEVICE_LIFE_TIME_EST_TYP_A: 0x01]
        # 0x00~9 means 0-90% band
        # 0x0a means 90-100% band
        # 0x0b means over 100% band
        mmc_fail_lev = r""".*(?P<param>DEVICE_LIFE_TIME_EST_TYP_.)]?:
                        0x0(?P<val>\S)""" # lifetime percentage
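        # A reported value of 0x03, for example, means the device has
        # consumed an estimated 20-30% of its rated lifetime; the loop
        # below converts it to val = 30 (the band's upper bound).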

        # Ex "Pre EOL information [PRE_EOL_INFO: 0x01]"
        # 0x00 - not defined
        # 0x01 - Normal
        # 0x02 - Warning, consumed 80% of the reserved blocks
        # 0x03 - Urgent, consumed 90% of the reserved blocks
        mmc_fail_eol = r".*(?P<param>PRE_EOL_INFO.)]?: 0x0(?P<val>\d)"

        eol_value = 0
        lev_value = -1
        for line in self._info:
            m = re.match(mmc_fail_lev, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for lifetime estimate => %s', line)
                if 'a' == param:
                    val = 100
                elif 'b' == param:
                    val = 101
                else:
                    val = int(param)*10
                if val > lev_value:
                    lev_value = val
                continue

            m = re.match(mmc_fail_eol, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for end-of-life => %s', line)
                eol_value = int(param)
                break

        # set state based on end-of-life
        if eol_value == 3:
            return STORAGE_STATE_CRITICAL
        elif eol_value == 2:
            return STORAGE_STATE_WARNING
        elif eol_value == 1:
            return STORAGE_STATE_NORMAL

        # set state based on lifetime estimates
        elif lev_value < 90:
            return STORAGE_STATE_NORMAL
        elif lev_value < 100:
            return STORAGE_STATE_WARNING
        return STORAGE_STATE_CRITICAL

    def _get_state_for_nvme(self):
        """Read the info to detect the state for NVMe storage."""
        logging.debug('Extracting metrics for NVMe storage')
        # Ex "Percentage Used:         100%"
        nvme_fail = r"Percentage Used:\s+(?P<param>(\d{1,3}))%"
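        # Note: per the NVMe spec, "Percentage Used" is a vendor estimate of
        # consumed endurance; it can exceed 100% and is capped at 255, so
        # the pattern above allows up to three digits.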
        used_value = -1
        for line in self._info:
            m = re.match(nvme_fail, line)
            if m:
                param = m.group('param')
                logging.debug('Found line for usage => %s', line)
                try:
                    val = int(param)
                    used_value = val
                except ValueError:
                    logging.info('Could not cast %s to int', param)
                break

        if used_value < 91:
            return STORAGE_STATE_NORMAL
        # Devices are no longer marked as bad when they reach 100% usage.
        # TODO(otabek) crbug.com/1140507 re-evaluate the max usage
        return STORAGE_STATE_WARNING

    def _get_device_storage_path(self):
        """Find and return the path to the device storage.

        The method supports detection even when the device is booted from USB.

        @returns path to the main device like '/dev/XXXX'
        """
        # find the name of the device storage
        cmd = ('. /usr/sbin/write_gpt.sh;'
               ' . /usr/share/misc/chromeos-common.sh;'
               ' load_base_vars; get_fixed_dst_drive')
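        # get_fixed_dst_drive prints the path of the internal (fixed) boot
        # drive; typically something like '/dev/nvme0n1', '/dev/mmcblk0' or
        # '/dev/sda' (illustrative examples, actual names depend on the DUT).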
        cmd_result = self._host.run(cmd,
                                    ignore_status=True,
                                    timeout=60)
        if cmd_result.exit_status != 0:
            logging.debug('Failed to detect path to the device storage')
            return None
        return cmd_result.stdout.strip()

    def _run_readonly_badblocks_check(self):
        """Run a badblocks read-only verification on the device storage.

        The block size is set to 512 bytes.
        """
        path = self._get_device_storage_path()
        if not path:
            # cannot continue if storage was not detected
            return
        logging.info("Running readonly badblocks check; path=%s", path)
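        # badblocks flags: -e 1 aborts after the first bad block is found,
        # -s shows progress, -b 512 sets the block size in bytes.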
        cmd = 'badblocks -e 1 -s -b 512 %s' % path
        try:
            # Set the limit to 1 hour, though it is expected to finish
            # within 30 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=3600)
            if cmd_result.exit_status != 0:
                logging.debug('Failed to run badblocks on the device storage')
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found (bad);
                # an empty result means the check passed (good).
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout while running the badblocks check')
            logging.debug(str(e))

    def _run_read_write_badblocks_check(self):
        """Run a non-destructive read-write check on the device storage.

        The block size is set to 4096 bytes.
        We can run this test only when the DUT is booted from USB.
        """
        path = self._get_device_storage_path()
        if not path:
            # cannot continue if storage was not detected
            return
        logging.info("Running read-write badblocks check; path=%s", path)
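        # badblocks flags: -e 1 aborts after the first bad block, -n runs
        # the non-destructive read-write test, -s shows progress, -v is
        # verbose, -b 4096 sets the block size in bytes.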
        cmd = 'badblocks -e 1 -nsv -b 4096 %s' % path
        try:
            # Set the limit to 90 minutes, though it is expected to finish
            # within 50 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=5400)
            if cmd_result.exit_status != 0:
                logging.debug('Failed to run badblocks on the device storage')
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found (bad);
                # an empty result means the check passed (good).
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout while running the badblocks check')
            logging.info('(Not critical) %s', e)

    def _support_health_profile(self):
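        """Check if the host has an associated device health profile."""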
        return (hasattr(self._host, 'health_profile')
                and self._host.health_profile)


def _is_time_to_run_badblocks_ro(dhp):
    """Verify that the device can run the read-only badblocks check.
    The RO check can be executed no more often than once per 6 days.

    @param dhp: device health profile of the host.
    @returns True if the check can proceed, False otherwise
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_ro_run_time_epoch()
    can_run = today_time > (last_check + (6 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'Run RO badblocks not allowed because we have run it recently,'
                ' last run %s. RO check allowed to run only once per 6 days',
                dhp.get_badblocks_ro_run_time())
    return can_run


def _is_time_to_run_badblocks_rw(dhp):
    """Verify that the device can run the read-write badblocks check.
    The RW check can be executed no more often than once per 60 days.

    @param dhp: device health profile of the host.
    @returns True if the check can proceed, False otherwise
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_rw_run_time_epoch()
    can_run = today_time > (last_check + (60 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'Run RW badblocks not allowed because we have run it recently,'
                ' last run %s. RW check allowed to run only once per 60 days',
                dhp.get_badblocks_rw_run_time())
    return can_run
