# xref: /aosp_15_r20/external/autotest/server/hosts/labstation_host.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
# Copyright (c) 2019 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""This file provides core logic for labstation verify/repair process."""
6
7import logging
8
9from autotest_lib.client.common_lib import error
10from autotest_lib.server import afe_utils
11from autotest_lib.server.hosts import base_label
12from autotest_lib.server.hosts import cros_label
13from autotest_lib.server.hosts import labstation_repair
14from autotest_lib.server.cros import provision
15from autotest_lib.server.hosts import base_servohost
16from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
17from autotest_lib.server.cros.dynamic_suite import tools
18from autotest_lib.client.common_lib.cros import dev_server
19from autotest_lib.server import utils as server_utils
20from autotest_lib.site_utils.rpm_control_system import rpm_client
21
class LabstationHost(base_servohost.BaseServoHost):
    """Labstation specific host class.

    On top of BaseServoHost this class arbitrates reboot requests (coming
    from the labstation itself or from its DUTs), keeps the cros-version
    label up to date, stages autotest artifacts on a devserver and drives
    the labstation power state through RPM.
    """

    # Age, in minutes, after which an in-use lock file is considered stale
    # and is ignored when deciding whether a servo is in use.
    IN_USE_FILE_EXPIRE_MINS = 90

    # Minimum uptime, in hours, before a reboot requested by a DUT is
    # honored. This prevents a broken DUT from keeping its labstation in a
    # reboot loop.
    UP_TIME_THRESH_HOLD_HOURS = 6

    # Prefix of the version label maintained by update_cros_version_label().
    VERSION_PREFIX = provision.CROS_VERSION_PREFIX

    @staticmethod
    def check_host(host, timeout=10):
        """Check if the given host is a labstation host.

        The check greps /etc/lsb-release on the host for "labstation".

        @param host: An ssh host representing a device.
        @param timeout: The timeout for the run command.

        @return: True if the host device is a labstation. False if it is
                 not, or if the command failed to run or the ssh
                 connection timed out (both errors are swallowed here).
        """
        try:
            result = host.run(
                'grep -q labstation /etc/lsb-release',
                ignore_status=True, timeout=timeout)
        except (error.AutoservRunError, error.AutoservSSHTimeout):
            return False
        return result.exit_status == 0


    def _initialize(self, hostname, *args, **dargs):
        """Initialize the host, its repair strategy and its label retriever.

        @param hostname: Hostname of the labstation.
        @param args: Extra positional arguments forwarded to the parent
                     class.
        @param dargs: Extra keyword arguments forwarded to the parent class.
        """
        # hostname is passed positionally on purpose: in a call, *args is
        # bound before any keyword argument, so `hostname=hostname` combined
        # with a non-empty *args would raise "got multiple values for
        # argument 'hostname'".
        super(LabstationHost, self)._initialize(hostname, *args, **dargs)
        self._repair_strategy = (
            labstation_repair.create_labstation_repair_strategy())
        self.labels = base_label.LabelRetriever(cros_label.LABSTATION_LABELS)


    def is_reboot_requested(self):
        """Check if a reboot is requested for this labstation.

        The reboot can be requested either by the labstation itself (an
        update is pending a reboot) or by DUTs (reboot request files are
        present). Requests from DUTs are only honored when the uptime is
        longer than a threshold, to prevent a broken servo from keeping its
        labstation in a reboot cycle.

        @returns True if a reboot is required, otherwise False
        """
        if self._check_update_status() == self.UPDATE_STATE.PENDING_REBOOT:
            logging.info('Labstation reboot requested from labstation for'
                         ' update image')
            return True

        if not self._validate_uptime():
            logging.info('Ignoring DUTs reboot request because %s was'
                         ' rebooted in last %d hours.',
                         self.hostname, self.UP_TIME_THRESH_HOLD_HOURS)
            return False

        # One reboot request file is expected per requesting DUT; the exit
        # status is ignored because `find` fails when nothing matches.
        cmd = 'find %s*%s' % (self.TEMP_FILE_DIR, self.REBOOT_FILE_POSTFIX)
        output = self.run(cmd, ignore_status=True).stdout
        if not output:
            return False

        reboot_request_files = output.strip().split('\n')
        logging.info('%s DUT(s) are currently requesting to'
                     ' reboot labstation.', len(reboot_request_files))
        return True


    def try_reboot(self):
        """Try to reboot the labstation if it is safe to do so.

        The reboot is skipped when a servo is in use or when an update is
        still running. After a reboot the cros-version label is refreshed
        and the reboot control files are removed.
        """
        if self._is_servo_in_use():
            logging.info('Aborting reboot action because some DUT(s) are'
                         ' currently using servo(s).')
            return

        update_state = self._check_update_status()
        if update_state == self.UPDATE_STATE.RUNNING:
            logging.info('Aborting reboot action because an update process'
                         ' is running.')
            return
        if update_state == self.UPDATE_STATE.PENDING_REBOOT:
            self._post_update_reboot()
        else:
            self._servo_host_reboot()
        self.update_cros_version_label()
        logging.info('Cleaning up reboot control files.')
        self._cleanup_post_reboot()


    def get_labels(self):
        """Return the detected labels on the host."""
        return self.labels.get_labels(self)


    def get_os_type(self):
        """Return the OS type of this host, which is always 'labstation'."""
        return 'labstation'


    def verify_job_repo_url(self, tag=''):
        """Stage the autotest packages referenced by the host job_repo_url.

        The job_repo_url attribute is read from the host info store, e.g.
        "http://lmn.cd.ab.xyx:8080/static/lumpy-release/R29-4279.0.0/
        autotest/packages". The devserver and the image name are parsed out
        of that url and the 'autotest_packages' artifact of the image is
        staged on that devserver. When the host has no job_repo_url, a
        warning is logged and nothing is staged.

        @param tag: The tag from the server job, in the format
                    <job_id>-<user>/<hostname>, or <hostless> for a server
                    job. Currently unused by this implementation.

        Errors raised while parsing the url or while staging on the
        devserver are not handled here and propagate to the caller.
        """
        info = self.host_info_store.get()
        job_repo_url = info.attributes.get(ds_constants.JOB_REPO_URL, '')
        if not job_repo_url:
            logging.warning('No job repo url set on host %s', self.hostname)
            return

        logging.info('Verifying job repo url %s', job_repo_url)
        devserver_url, image_name = tools.get_devserver_build_from_package_url(
            job_repo_url)

        ds = dev_server.ImageServer(devserver_url)

        logging.info('Staging autotest artifacts for %s on devserver %s',
                     image_name, ds.url())

        ds.stage_artifacts(image_name, ['autotest_packages'])


    def host_version_prefix(self, image):
        """Return version label prefix.

        In case the CrOS provisioning version is something other than the
        standard CrOS version e.g. CrOS TH version, this function will
        find the prefix from provision.py.

        @param image: The image name to find its version prefix.
        @returns: A prefix string for the image type.
        """
        return provision.get_version_label_prefix(image)


    def stage_server_side_package(self, image=None):
        """Stage autotest server-side package on devserver.

        The build is taken, in order of preference, from the image
        argument, from the job_repo_url attribute of the host, or from the
        build recorded in the host info.

        @param image: Full path of an OS image to install or a build name.

        @return: A url to the autotest server-side package.

        @raise: error.AutoservError if fail to locate the build to test with, or
                fail to stage server-side package.
        """
        # If enable_drone_in_restricted_subnet is False, do not set hostname
        # in devserver.resolve call, so a devserver in non-restricted subnet
        # is picked to stage autotest server package for drone to download.
        hostname = self.hostname
        if not server_utils.ENABLE_DRONE_IN_RESTRICTED_SUBNET:
            hostname = None
        if image:
            image_name = tools.get_build_from_image(image)
            if not image_name:
                raise error.AutoservError(
                    'Failed to parse build name from %s' % image)
            ds = dev_server.ImageServer.resolve(image_name, hostname)
        else:
            info = self.host_info_store.get()
            job_repo_url = info.attributes.get(ds_constants.JOB_REPO_URL, '')
            if job_repo_url:
                devserver_url, image_name = (
                    tools.get_devserver_build_from_package_url(job_repo_url))
                # If enable_drone_in_restricted_subnet is True, use the
                # existing devserver. Otherwise, resolve a new one in
                # non-restricted subnet.
                if server_utils.ENABLE_DRONE_IN_RESTRICTED_SUBNET:
                    ds = dev_server.ImageServer(devserver_url)
                else:
                    ds = dev_server.ImageServer.resolve(image_name)
            elif info.build is not None:
                ds = dev_server.ImageServer.resolve(info.build, hostname)
                image_name = info.build
            else:
                raise error.AutoservError(
                    'Failed to stage server-side package. The host has '
                    'no job_repo_url attribute or cros-version label.')

        ds.stage_artifacts(image_name, ['autotest_server_package'])
        return '%s/static/%s/%s' % (ds.url(), image_name,
                                    'autotest_server_package.tar.bz2')


    def repair(self):
        """Attempt to repair a labstation.

        Records an INFO status line with the hostname, board and model, then
        runs the labstation repair strategy against this host.
        """
        message = 'Beginning repair for host %s board %s model %s'
        info = self.host_info_store.get()
        message %= (self.hostname, info.board, info.model)
        self.record('INFO', None, None, message)
        self._repair_strategy.repair(self)


    def update_cros_version_label(self):
        """Update cros-version label on labstation.

        When the release path cannot be determined, the existing version
        labels are cleared instead of being updated.
        """
        image_name = self.get_full_release_path()
        if not image_name:
            logging.info('Could not get labstation version, it could be'
                         ' the labstation is running a customized image.')
            info = self.host_info_store.get()
            info.clear_version_labels(version_prefix=self.VERSION_PREFIX)
            self.host_info_store.commit(info)
            return
        afe_utils.add_provision_labels(self, self.VERSION_PREFIX, image_name)


    def _validate_uptime(self):
        """Check whether the labstation has been up long enough to reboot.

        @returns True if the uptime exceeds UP_TIME_THRESH_HOLD_HOURS
                 (compared in seconds), otherwise False.
        """
        return (float(self.check_uptime()) >
                self.UP_TIME_THRESH_HOLD_HOURS * 3600)


    def _is_servo_in_use(self):
        """Determine if any DUT is currently running a task that uses servo.

        Only lock files that have been touched within the last
        IN_USE_FILE_EXPIRE_MINS minutes count; older ones are treated as
        stale.

        @returns True if any DUTs is using servos, otherwise False.
        """
        cmd = 'find %s*%s -mmin -%s' % (self.TEMP_FILE_DIR,
                                        self.LOCK_FILE_POSTFIX,
                                        self.IN_USE_FILE_EXPIRE_MINS)
        result = self.run(cmd, ignore_status=True)
        return bool(result.stdout)


    def _cleanup_post_reboot(self):
        """Clean up all xxxx_reboot file after reboot."""
        # Best effort: the exit status is ignored, so a missing file is not
        # an error.
        cmd = 'rm %s*%s' % (self.TEMP_FILE_DIR, self.REBOOT_FILE_POSTFIX)
        self.run(cmd, ignore_status=True)

    def rpm_power_on_and_wait(self, _rpm_client=None):
        """Power on a labstation through RPM and wait for it to come up"""
        return self.change_rpm_state_and_wait("ON", _rpm_client=_rpm_client)

    def rpm_power_off_and_wait(self, _rpm_client=None):
        """Power off a labstation through RPM and wait for it to shut down"""
        return self.change_rpm_state_and_wait("OFF", _rpm_client=_rpm_client)

    def change_rpm_state_and_wait(self, state, _rpm_client=None):
        """Change the power state of a labstation and wait for it to settle.

        @param state: "ON" or "OFF" (upper case).
        @param _rpm_client: rpm_client module, to support testing

        @raises KeyError: If state is neither "ON" nor "OFF"; no power
                          change is attempted in that case.
        @raises error.AutoservError: If the labstation did not reach the
                                     requested state within the timeout.
        """
        _rpm_client = _rpm_client or rpm_client
        # Both lookups happen before set_power() so that an unsupported
        # state fails without touching the RPM.
        wait = {
            "ON":  self.wait_up,
            "OFF": self.wait_down,
        }[state]
        timeout = {
            "ON": self.BOOT_TIMEOUT,
            "OFF": self.WAIT_DOWN_REBOOT_TIMEOUT,
        }[state]
        _rpm_client.set_power(self, state)
        if not wait(timeout=timeout):
            msg = "%s didn't enter %s state in %s seconds" % (
                getattr(self, 'hostname', None),
                state,
                timeout,
            )
            # AutoservError derives from Exception, so callers that catch
            # Exception keep working.
            raise error.AutoservError(msg)
306