xref: /aosp_15_r20/external/autotest/server/hosts/labstation_host.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1*9c5db199SXin Li# Copyright (c) 2019 The Chromium OS Authors. All rights reserved.
2*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
3*9c5db199SXin Li# found in the LICENSE file.
4*9c5db199SXin Li
5*9c5db199SXin Li"""This file provides core logic for labstation verify/repair process."""
6*9c5db199SXin Li
7*9c5db199SXin Liimport logging
8*9c5db199SXin Li
9*9c5db199SXin Lifrom autotest_lib.client.common_lib import error
10*9c5db199SXin Lifrom autotest_lib.server import afe_utils
11*9c5db199SXin Lifrom autotest_lib.server.hosts import base_label
12*9c5db199SXin Lifrom autotest_lib.server.hosts import cros_label
13*9c5db199SXin Lifrom autotest_lib.server.hosts import labstation_repair
14*9c5db199SXin Lifrom autotest_lib.server.cros import provision
15*9c5db199SXin Lifrom autotest_lib.server.hosts import base_servohost
16*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import constants as ds_constants
17*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import tools
18*9c5db199SXin Lifrom autotest_lib.client.common_lib.cros import dev_server
19*9c5db199SXin Lifrom autotest_lib.server import utils as server_utils
20*9c5db199SXin Lifrom autotest_lib.site_utils.rpm_control_system import rpm_client
21*9c5db199SXin Li
class LabstationHost(base_servohost.BaseServoHost):
    """Labstation specific host class.

    Adds labstation verify/repair behaviour on top of BaseServoHost:
    deciding whether a reboot was requested, rebooting when it is safe,
    staging autotest packages on a devserver, keeping the cros-version
    label in sync and switching power through RPM.

    NOTE(review): helpers and constants used below but not defined here
    (UPDATE_STATE, TEMP_FILE_DIR, REBOOT_FILE_POSTFIX, LOCK_FILE_POSTFIX,
    BOOT_TIMEOUT, WAIT_DOWN_REBOOT_TIMEOUT, run, record, wait_up, ...) are
    presumably inherited from BaseServoHost - confirm against that class.
    """

    # Age, in minutes, beyond which an in_use lock file is treated as stale
    # and no longer counts as "servo in use".
    IN_USE_FILE_EXPIRE_MINS = 90

    # Minimum uptime, in hours, before a DUT-originated reboot request is
    # honoured. This keeps a broken DUT from repeatedly rebooting its
    # labstation.
    UP_TIME_THRESH_HOLD_HOURS = 6

    VERSION_PREFIX = provision.CROS_VERSION_PREFIX

    @staticmethod
    def check_host(host, timeout=10):
        """
        Check if the given host is a labstation host.

        The check greps /etc/lsb-release on the host for "labstation".

        @param host: An ssh host representing a device.
        @param timeout: The timeout for the run command.

        @return: True if the grep command exits with status 0. False
                 otherwise, including when running the command raises
                 AutoservRunError or AutoservSSHTimeout (both are caught
                 here and never propagate to the caller).
        """
        try:
            result = host.run(
                'grep -q labstation /etc/lsb-release',
                ignore_status=True, timeout=timeout)
        except (error.AutoservRunError, error.AutoservSSHTimeout):
            return False
        return result.exit_status == 0


    def _initialize(self, hostname, *args, **dargs):
        """Initialize the host, its repair strategy and label retriever.

        @param hostname: Hostname of the labstation.
        @param args: Extra positional arguments forwarded to the parent.
        @param dargs: Extra keyword arguments forwarded to the parent.
        """
        # Forward hostname positionally. Spelling it as a keyword in front
        # of *args makes any extra positional argument collide with it
        # ("got multiple values for argument 'hostname'"), assuming the
        # parent takes hostname as its first parameter.
        super(LabstationHost, self)._initialize(hostname, *args, **dargs)
        self._repair_strategy = (
            labstation_repair.create_labstation_repair_strategy())
        self.labels = base_label.LabelRetriever(cros_label.LABSTATION_LABELS)


    def is_reboot_requested(self):
        """Check if a reboot is requested for this labstation.

        The reboot can either be requested from the labstation itself (an
        update is pending a reboot) or from DUTs (reboot request files).
        Requests from DUTs are only processed if the uptime is longer than
        a threshold, because we want to prevent a broken servo from keeping
        its labstation in a reboot cycle.

        @returns True if a reboot is required, otherwise False
        """
        if self._check_update_status() == self.UPDATE_STATE.PENDING_REBOOT:
            logging.info('Labstation reboot requested from labstation for'
                         ' update image')
            return True

        if not self._validate_uptime():
            logging.info('Ignoring DUTs reboot request because %s was'
                         ' rebooted in last %d hours.',
                         self.hostname, self.UP_TIME_THRESH_HOLD_HOURS)
            return False

        # Each line of output is one reboot request file. ignore_status is
        # set so that a failing find simply yields empty stdout.
        cmd = 'find %s*%s' % (self.TEMP_FILE_DIR, self.REBOOT_FILE_POSTFIX)
        output = self.run(cmd, ignore_status=True).stdout
        if not output:
            return False

        reboot_request_files = output.strip().split('\n')
        logging.info('%s DUT(s) are currently requesting to'
                     ' reboot labstation.', len(reboot_request_files))
        return True


    def try_reboot(self):
        """Try to reboot the labstation if it's safe to do so.

        The reboot is skipped when a servo is in use or an update is
        running. Otherwise the labstation is rebooted (using the
        post-update reboot path when an update is pending a reboot), the
        cros-version label is refreshed and the reboot control files are
        cleaned up.
        """
        if self._is_servo_in_use():
            logging.info('Aborting reboot action because some DUT(s) are'
                         ' currently using servo(s).')
            return

        update_state = self._check_update_status()
        if update_state == self.UPDATE_STATE.RUNNING:
            logging.info('Aborting reboot action because an update process'
                         ' is running.')
            return
        if update_state == self.UPDATE_STATE.PENDING_REBOOT:
            self._post_update_reboot()
        else:
            self._servo_host_reboot()
        self.update_cros_version_label()
        logging.info('Cleaning up reboot control files.')
        self._cleanup_post_reboot()


    def get_labels(self):
        """Return the detected labels on the host."""
        return self.labels.get_labels(self)


    def get_os_type(self):
        """Return the OS type of this host, always 'labstation'."""
        return 'labstation'


    def verify_job_repo_url(self, tag=''):
        """
        Stage the autotest packages named by this host's job_repo_url.

        The job_repo_url attribute is read from the host info store. E.g.
        "http://lmn.cd.ab.xyx:8080/static/lumpy-release/R29-4279.0.0/
        autotest/packages" claims to have the autotest package for
        lumpy-release/R29-4279.0.0. The devserver and build name are parsed
        from that url and the devserver is asked to stage the
        'autotest_packages' artifact for the build. If the host has no
        job_repo_url, a warning is logged and nothing is staged.

        @param tag: The tag from the server job, in the format
                    <job_id>-<user>/<hostname>, or <hostless> for a server job.
                    Currently unused by this implementation.

        Errors raised while parsing the url or staging the artifacts are
        not handled here and propagate to the caller.
        """
        info = self.host_info_store.get()
        job_repo_url = info.attributes.get(ds_constants.JOB_REPO_URL, '')
        if not job_repo_url:
            logging.warning('No job repo url set on host %s', self.hostname)
            return

        logging.info('Verifying job repo url %s', job_repo_url)
        devserver_url, image_name = tools.get_devserver_build_from_package_url(
            job_repo_url)

        ds = dev_server.ImageServer(devserver_url)

        logging.info('Staging autotest artifacts for %s on devserver %s',
                     image_name, ds.url())

        ds.stage_artifacts(image_name, ['autotest_packages'])


    def host_version_prefix(self, image):
        """Return version label prefix.

        In case the CrOS provisioning version is something other than the
        standard CrOS version e.g. CrOS TH version, this function will
        find the prefix from provision.py.

        @param image: The image name to find its version prefix.
        @returns: A prefix string for the image type.
        """
        return provision.get_version_label_prefix(image)


    def stage_server_side_package(self, image=None):
        """Stage autotest server-side package on devserver.

        The build to stage is taken, in order of preference, from `image`,
        from the host's job_repo_url attribute, or from the build recorded
        in the host info.

        @param image: Full path of an OS image to install or a build name.

        @return: A url to the autotest server-side package.

        @raise: error.AutoservError if the build name cannot be parsed from
                `image`, or if no image is given and the host has neither
                a job_repo_url attribute nor a build.
        """
        # If enable_drone_in_restricted_subnet is False, do not set hostname
        # in devserver.resolve call, so a devserver in non-restricted subnet
        # is picked to stage autotest server package for drone to download.
        hostname = self.hostname
        if not server_utils.ENABLE_DRONE_IN_RESTRICTED_SUBNET:
            hostname = None
        if image:
            image_name = tools.get_build_from_image(image)
            if not image_name:
                raise error.AutoservError(
                    'Failed to parse build name from %s' % image)
            ds = dev_server.ImageServer.resolve(image_name, hostname)
        else:
            info = self.host_info_store.get()
            job_repo_url = info.attributes.get(ds_constants.JOB_REPO_URL, '')
            if job_repo_url:
                devserver_url, image_name = (
                    tools.get_devserver_build_from_package_url(job_repo_url))
                # If enable_drone_in_restricted_subnet is True, use the
                # existing devserver. Otherwise, resolve a new one in
                # non-restricted subnet.
                if server_utils.ENABLE_DRONE_IN_RESTRICTED_SUBNET:
                    ds = dev_server.ImageServer(devserver_url)
                else:
                    ds = dev_server.ImageServer.resolve(image_name)
            elif info.build is not None:
                ds = dev_server.ImageServer.resolve(info.build, hostname)
                image_name = info.build
            else:
                raise error.AutoservError(
                    'Failed to stage server-side package. The host has '
                    'no job_repo_url attribute or cros-version label.')

        ds.stage_artifacts(image_name, ['autotest_server_package'])
        return '%s/static/%s/%s' % (ds.url(), image_name,
                                    'autotest_server_package.tar.bz2')


    def repair(self):
        """Attempt to repair a labstation.

        Records an INFO entry naming the host, board and model, then runs
        the labstation repair strategy against this host.
        """
        message = 'Beginning repair for host %s board %s model %s'
        info = self.host_info_store.get()
        message %= (self.hostname, info.board, info.model)
        self.record('INFO', None, None, message)
        self._repair_strategy.repair(self)


    def update_cros_version_label(self):
        """Update cros-version label on labstation.

        When the release path cannot be determined, the existing version
        labels with VERSION_PREFIX are cleared and committed to the host
        info store instead.
        """
        image_name = self.get_full_release_path()
        if not image_name:
            logging.info('Could not get labstation version, it could be'
                         ' the labstation is running a customized image.')
            info = self.host_info_store.get()
            info.clear_version_labels(version_prefix=self.VERSION_PREFIX)
            self.host_info_store.commit(info)
            return
        afe_utils.add_provision_labels(self, self.VERSION_PREFIX, image_name)


    def _validate_uptime(self):
        """Check whether the labstation has been up longer than the threshold.

        The value from check_uptime() is compared against
        UP_TIME_THRESH_HOLD_HOURS converted to seconds (presumably
        check_uptime() reports seconds - confirm against its definition).

        @returns True if uptime exceeds the threshold, otherwise False.
        """
        return (float(self.check_uptime()) >
                self.UP_TIME_THRESH_HOLD_HOURS * 3600)


    def _is_servo_in_use(self):
        """Determine if there are any DUTs currently running a task that
        uses servo. Only lock files that have been modified within the
        last IN_USE_FILE_EXPIRE_MINS minutes count.

        @returns True if any DUTs is using servos, otherwise False.
        """
        cmd = 'find %s*%s -mmin -%s' % (self.TEMP_FILE_DIR,
                                        self.LOCK_FILE_POSTFIX,
                                        self.IN_USE_FILE_EXPIRE_MINS)
        result = self.run(cmd, ignore_status=True)
        # Any output at all means at least one recent lock file was found.
        return bool(result.stdout)


    def _cleanup_post_reboot(self):
        """Clean up all xxxx_reboot files after reboot."""
        cmd = 'rm %s*%s' % (self.TEMP_FILE_DIR, self.REBOOT_FILE_POSTFIX)
        # Best effort: a failing rm (e.g. nothing to remove) is ignored.
        self.run(cmd, ignore_status=True)

    def rpm_power_on_and_wait(self, _rpm_client=None):
        """Power on a labstation through RPM and wait for it to come up"""
        return self.change_rpm_state_and_wait("ON", _rpm_client=_rpm_client)

    def rpm_power_off_and_wait(self, _rpm_client=None):
        """Power off a labstation through RPM and wait for it to shut down"""
        return self.change_rpm_state_and_wait("OFF", _rpm_client=_rpm_client)

    def change_rpm_state_and_wait(self, state, _rpm_client=None):
        """Change the power state of a labstation and wait for the result.

        @param state: "ON" or "OFF" (upper case, exactly as listed).
        @param _rpm_client: rpm_client module, to support testing

        @raises KeyError: If state is neither "ON" nor "OFF".
        @raises Exception: If the labstation did not reach the requested
                           state within the timeout.
        """
        _rpm_client = _rpm_client or rpm_client
        # Pick the wait function and timeout matching the target state. An
        # unsupported state fails here, before any power change is issued.
        wait, timeout = {
            "ON": (self.wait_up, self.BOOT_TIMEOUT),
            "OFF": (self.wait_down, self.WAIT_DOWN_REBOOT_TIMEOUT),
        }[state]
        _rpm_client.set_power(self, state)
        if not wait(timeout=timeout):
            msg = "%s didn't enter %s state in %s seconds" % (
                getattr(self, 'hostname', None),
                state,
                timeout,
            )
            raise Exception(msg)
306