# Copyright (c) 2019 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""This file provides core logic for labstation verify/repair process."""

import logging

from autotest_lib.client.common_lib import error
from autotest_lib.server import afe_utils
from autotest_lib.server.hosts import base_label
from autotest_lib.server.hosts import cros_label
from autotest_lib.server.hosts import labstation_repair
from autotest_lib.server.cros import provision
from autotest_lib.server.hosts import base_servohost
from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
from autotest_lib.server.cros.dynamic_suite import tools
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server import utils as server_utils
from autotest_lib.site_utils.rpm_control_system import rpm_client


class LabstationHost(base_servohost.BaseServoHost):
    """Labstation specific host class"""

    # Threshold we decide to ignore a in_use file lock. In minutes
    IN_USE_FILE_EXPIRE_MINS = 90

    # Uptime threshold to perform a labstation reboot, this is to prevent a
    # broken DUT keep trying to reboot a labstation. In hours
    UP_TIME_THRESH_HOLD_HOURS = 6

    VERSION_PREFIX = provision.CROS_VERSION_PREFIX

    @staticmethod
    def check_host(host, timeout=10):
        """
        Check if the given host is a labstation host.

        The check greps /etc/lsb-release on the host for 'labstation'.
        AutoservRunError and AutoservSSHTimeout raised by the run command are
        caught here and reported as "not a labstation" rather than propagated.

        @param host: An ssh host representing a device.
        @param timeout: The timeout for the run command.

        @return: True if the host device is labstation. False if it is not,
                 or if the command failed or the ssh connection timed out.

        """
        try:
            result = host.run(
                    'grep -q labstation /etc/lsb-release',
                    ignore_status=True, timeout=timeout)
        except (error.AutoservRunError, error.AutoservSSHTimeout):
            return False
        return result.exit_status == 0


    def _initialize(self, hostname, *args, **dargs):
        """Initialize the host and attach the labstation repair strategy and
        label retriever.

        @param hostname: Hostname of the labstation; forwarded, together with
                         the remaining arguments, to the parent _initialize.
        """
        super(LabstationHost, self)._initialize(hostname=hostname,
                                                *args, **dargs)
        self._repair_strategy = (
            labstation_repair.create_labstation_repair_strategy())
        self.labels = base_label.LabelRetriever(cros_label.LABSTATION_LABELS)


    def is_reboot_requested(self):
        """Check if a reboot is requested for this labstation, the reboot can
        either be requested from labstation or DUTs. For request from DUTs we
        only process it if uptime longer than a threshold because we want
        to prevent a broken servo keep its labstation in reboot cycle.

        @returns True if a reboot is required, otherwise False
        """
        # A pending post-update reboot is honored regardless of uptime.
        if self._check_update_status() == self.UPDATE_STATE.PENDING_REBOOT:
            logging.info('Labstation reboot requested from labstation for'
                         ' update image')
            return True

        if not self._validate_uptime():
            logging.info('Ignoring DUTs reboot request because %s was'
                         ' rebooted in last %d hours.',
                         self.hostname, self.UP_TIME_THRESH_HOLD_HOURS)
            return False

        # Any file matching the reboot-request postfix counts as a request.
        # ignore_status=True because find exits non-zero when nothing matches.
        cmd = 'find %s*%s' % (self.TEMP_FILE_DIR, self.REBOOT_FILE_POSTFIX)
        output = self.run(cmd, ignore_status=True).stdout
        if output:
            reboot_request_files = output.strip().split('\n')
            logging.info('%s DUT(s) are currently requesting to'
                         ' reboot labstation.', len(reboot_request_files))
            return True
        else:
            return False


    def try_reboot(self):
        """Try to reboot the labstation if it's safe to do(no servo in use,
        and not processing updates), and cleanup reboot control file.

        When an update is pending a reboot, the post-update reboot path is
        used; otherwise a regular servo host reboot is performed. After the
        reboot the cros-version label is refreshed.
        """
        if self._is_servo_in_use():
            logging.info('Aborting reboot action because some DUT(s) are'
                         ' currently using servo(s).')
            return

        update_state = self._check_update_status()
        if update_state == self.UPDATE_STATE.RUNNING:
            logging.info('Aborting reboot action because an update process'
                         ' is running.')
            return
        if update_state == self.UPDATE_STATE.PENDING_REBOOT:
            self._post_update_reboot()
        else:
            self._servo_host_reboot()
        self.update_cros_version_label()
        logging.info('Cleaning up reboot control files.')
        self._cleanup_post_reboot()


    def get_labels(self):
        """Return the detected labels on the host."""
        return self.labels.get_labels(self)


    def get_os_type(self):
        """Return the OS type string for this host, always 'labstation'."""
        return 'labstation'


    def verify_job_repo_url(self, tag=''):
        """
        Make sure job_repo_url of this host is valid.

        Eg: The job_repo_url
        "http://lmn.cd.ab.xyx:8080/static/lumpy-release/R29-4279.0.0/autotest/packages"
        claims to have the autotest package for lumpy-release/R29-4279.0.0.
        The job_repo_url is read from the host info store attributes; if it
        is not set, a warning is logged and nothing else happens. Otherwise
        the autotest packages for the build are staged on the devserver
        embedded in the url.

        @param tag: The tag from the server job, in the format
                    <job_id>-<user>/<hostname>, or <hostless> for a server job.
                    Currently unused by this implementation.

        NOTE(review): any exception raised by the devserver while staging
        propagates to the caller; the exact types depend on dev_server and
        should be confirmed there.
        """
        info = self.host_info_store.get()
        job_repo_url = info.attributes.get(ds_constants.JOB_REPO_URL, '')
        if not job_repo_url:
            logging.warning('No job repo url set on host %s', self.hostname)
            return

        logging.info('Verifying job repo url %s', job_repo_url)
        devserver_url, image_name = tools.get_devserver_build_from_package_url(
            job_repo_url)

        ds = dev_server.ImageServer(devserver_url)

        logging.info('Staging autotest artifacts for %s on devserver %s',
                     image_name, ds.url())

        ds.stage_artifacts(image_name, ['autotest_packages'])


    def host_version_prefix(self, image):
        """Return version label prefix.

        In case the CrOS provisioning version is something other than the
        standard CrOS version e.g. CrOS TH version, this function will
        find the prefix from provision.py.

        @param image: The image name to find its version prefix.
        @returns: A prefix string for the image type.
        """
        return provision.get_version_label_prefix(image)


    def stage_server_side_package(self, image=None):
        """Stage autotest server-side package on devserver.

        The build is taken from `image` when given; otherwise from the host's
        job_repo_url attribute, and failing that from the build recorded in
        the host info.

        @param image: Full path of an OS image to install or a build name.

        @return: A url to the autotest server-side package.

        @raise: error.AutoservError if fail to locate the build to test with, or
                fail to stage server-side package.
        """
        # If enable_drone_in_restricted_subnet is False, do not set hostname
        # in devserver.resolve call, so a devserver in non-restricted subnet
        # is picked to stage autotest server package for drone to download.
        hostname = self.hostname
        if not server_utils.ENABLE_DRONE_IN_RESTRICTED_SUBNET:
            hostname = None
        if image:
            image_name = tools.get_build_from_image(image)
            if not image_name:
                raise error.AutoservError(
                    'Failed to parse build name from %s' % image)
            ds = dev_server.ImageServer.resolve(image_name, hostname)
        else:
            info = self.host_info_store.get()
            job_repo_url = info.attributes.get(ds_constants.JOB_REPO_URL, '')
            if job_repo_url:
                devserver_url, image_name = (
                    tools.get_devserver_build_from_package_url(job_repo_url))
                # If enable_drone_in_restricted_subnet is True, use the
                # existing devserver. Otherwise, resolve a new one in
                # non-restricted subnet.
                if server_utils.ENABLE_DRONE_IN_RESTRICTED_SUBNET:
                    ds = dev_server.ImageServer(devserver_url)
                else:
                    ds = dev_server.ImageServer.resolve(image_name)
            elif info.build is not None:
                ds = dev_server.ImageServer.resolve(info.build, hostname)
                image_name = info.build
            else:
                raise error.AutoservError(
                    'Failed to stage server-side package. The host has '
                    'no job_repo_url attribute or cros-version label.')

        ds.stage_artifacts(image_name, ['autotest_server_package'])
        return '%s/static/%s/%s' % (ds.url(), image_name,
                                    'autotest_server_package.tar.bz2')


    def repair(self):
        """Attempt to repair a labstation."""
        message = 'Beginning repair for host %s board %s model %s'
        info = self.host_info_store.get()
        message %= (self.hostname, info.board, info.model)
        self.record('INFO', None, None, message)
        self._repair_strategy.repair(self)


    def update_cros_version_label(self):
        """Update cros-version label on labstation

        When the release path cannot be determined, the existing version
        labels with this class's VERSION_PREFIX are cleared instead.
        """
        image_name = self.get_full_release_path()
        if not image_name:
            logging.info('Could not get labstation version, it could be'
                         ' the labstation is running a customized image.')
            info = self.host_info_store.get()
            info.clear_version_labels(version_prefix=self.VERSION_PREFIX)
            self.host_info_store.commit(info)
            return
        afe_utils.add_provision_labels(self, self.VERSION_PREFIX, image_name)


    def _validate_uptime(self):
        """Check whether the labstation has been up long enough to reboot.

        The value from check_uptime() is compared against the threshold
        converted from hours to seconds, i.e. it is treated as seconds.

        @returns True if uptime exceeds UP_TIME_THRESH_HOLD_HOURS, otherwise
                 False.
        """
        return (float(self.check_uptime()) >
                self.UP_TIME_THRESH_HOLD_HOURS * 3600)


    def _is_servo_in_use(self):
        """Determine if there are any DUTs currently running task that uses
        servo, only files that has been touched within pre-set threshold of
        minutes counts.

        @returns True if any DUTs is using servos, otherwise False.
        """
        # -mmin -N restricts matches to lock files modified within the last
        # N minutes, so stale locks are ignored.
        cmd = 'find %s*%s -mmin -%s' % (self.TEMP_FILE_DIR,
                                        self.LOCK_FILE_POSTFIX,
                                        self.IN_USE_FILE_EXPIRE_MINS)
        result = self.run(cmd, ignore_status=True)
        return bool(result.stdout)


    def _cleanup_post_reboot(self):
        """Clean up all xxxx_reboot file after reboot."""
        cmd = 'rm %s*%s' % (self.TEMP_FILE_DIR, self.REBOOT_FILE_POSTFIX)
        self.run(cmd, ignore_status=True)


    def rpm_power_on_and_wait(self, _rpm_client=None):
        """Power on a labstation through RPM and wait for it to come up"""
        return self.change_rpm_state_and_wait("ON", _rpm_client=_rpm_client)


    def rpm_power_off_and_wait(self, _rpm_client=None):
        """Power off a labstation through RPM and wait for it to shut down"""
        return self.change_rpm_state_and_wait("OFF", _rpm_client=_rpm_client)


    def change_rpm_state_and_wait(self, state, _rpm_client=None):
        """Change the state of a labstation

        @param state: "ON" or "OFF" (upper case); any other value raises
                      KeyError before the RPM is touched.
        @param _rpm_client: rpm_client module, to support testing

        @raises Exception: If the labstation did not reach the requested
                           state within the corresponding timeout.
        """
        _rpm_client = _rpm_client or rpm_client
        # One table keeps the wait function and its timeout paired per state.
        wait, timeout = {
                "ON": (self.wait_up, self.BOOT_TIMEOUT),
                "OFF": (self.wait_down, self.WAIT_DOWN_REBOOT_TIMEOUT),
        }[state]
        _rpm_client.set_power(self, state)
        if not wait(timeout=timeout):
            msg = "%s didn't enter %s state in %s seconds" % (
                    getattr(self, 'hostname', None),
                    state,
                    timeout,
            )
            raise Exception(msg)