xref: /aosp_15_r20/external/autotest/server/hosts/moblab_host.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
# Lint as: python2, python3
# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5*9c5db199SXin Li
6*9c5db199SXin Lifrom __future__ import absolute_import
7*9c5db199SXin Lifrom __future__ import division
8*9c5db199SXin Lifrom __future__ import print_function
9*9c5db199SXin Li
10*9c5db199SXin Liimport logging
11*9c5db199SXin Liimport os
12*9c5db199SXin Liimport re
13*9c5db199SXin Liimport time
14*9c5db199SXin Li
15*9c5db199SXin Liimport common
16*9c5db199SXin Lifrom autotest_lib.client.common_lib import error, global_config
17*9c5db199SXin Lifrom autotest_lib.client.common_lib.cros import retry
18*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
19*9c5db199SXin Lifrom autotest_lib.server.hosts import cros_host
20*9c5db199SXin Lifrom autotest_lib.server.hosts import cros_repair
21*9c5db199SXin Li
22*9c5db199SXin Lifrom autotest_lib.utils.frozen_chromite.lib import timeout_util
23*9c5db199SXin Liimport six
24*9c5db199SXin Li
# Root of the Autotest installation, read from the
# [SCHEDULER] drone_installation_directory setting of the global config.
# NOTE(review): presumably '/usr/local/autotest' on a stock moblab -- confirm.
AUTOTEST_INSTALL_DIR = global_config.global_config.get_config_value(
        'SCHEDULER', 'drone_installation_directory')

# Paths derived from the installation root.
SHADOW_CONFIG_PATH = '%s/shadow_config.ini' % AUTOTEST_INSTALL_DIR
ATEST_PATH = '%s/cli/atest' % AUTOTEST_INSTALL_DIR
31*9c5db199SXin Li
# Sample output of fping that we are matching against. Every line the fping
# command reports is in one of these two formats:
#   192.168.231.100 : xmt/rcv/%loss = 10/10/0%, min/avg/max = 0.68/0.88/1.13
#   192.168.231.102 : xmt/rcv/%loss = 10/0/100%
# We want to capture the IP address (group 'ip') from the first form and must
# not match the second form, which has a non-zero %loss.
#
# Both pieces are raw strings and the dots of the address are escaped, so the
# pattern contains no invalid string escapes and '.' cannot match an
# arbitrary character.
SUBNET_DUT_SEARCH_RE = (r'(?P<ip>192\.168\.231\.1[0-1][0-9]) : '
                        r'xmt/rcv/%loss = [0-9]+/[0-9]+/0%')
40*9c5db199SXin Li
# Home directory of the 'moblab' user and the credential files kept in it.
MOBLAB_HOME = '/home/moblab'
MOBLAB_BOTO_LOCATION = '%s/.boto' % MOBLAB_HOME
MOBLAB_LAUNCH_CONTROL_KEY_LOCATION = '%s/.launch_control_key' % MOBLAB_HOME
MOBLAB_SERVICE_ACCOUNT_LOCATION = '%s/.service_account.json' % MOBLAB_HOME

# Fallback autotest client directory returned by MoblabHost.get_autodir().
MOBLAB_AUTODIR = '/usr/local/autodir'
DHCPD_LEASE_FILE = '/var/lib/dhcp/dhcpd.leases'

# Upstart services checked by MoblabHost.verify_moblab_services(). The first
# entry is given the caller's full timeout, the rest a short one.
MOBLAB_SERVICES = [
        'moblab-scheduler-init',
        'moblab-database-init',
        'moblab-devserver-init',
        'moblab-gsoffloader-init',
        'moblab-gsoffloader_s-init',
]
# Processes that must be visible to pgrep on a healthy moblab.
MOBLAB_PROCESSES = ['apache2', 'dhcpd']

# Poll interval and overall budget (both in seconds) used while waiting for
# special tasks to finish.
DUT_VERIFY_SLEEP_SECS = 5
DUT_VERIFY_TIMEOUT = 15 * 60

# Default parent directory for MoblabHost.make_tmp_dir().
MOBLAB_TMP_DIR = '/mnt/moblab/tmp'
# Port on the moblab that the frontend RPC tunnel connects to.
MOBLAB_PORT = 80
57*9c5db199SXin Li
58*9c5db199SXin Li
class UpstartServiceNotRunning(error.AutoservError):
    """An expected upstart service was not in the expected state."""

    def __init__(self, service_name):
        """Build the error for one service.

        @param service_name: Name of the upstart service that was in the
                wrong state; it is interpolated into the error message.
        """
        message = (
                'Upstart service %s not in running state. Most likely this '
                'means moblab did not boot correctly, check the boot logs '
                'for detailed error messages as to see why this service was '
                'not started.' % service_name)
        super(UpstartServiceNotRunning, self).__init__(message)
73*9c5db199SXin Li
74*9c5db199SXin Li
75*9c5db199SXin Liclass MoblabHost(cros_host.CrosHost):
76*9c5db199SXin Li    """Moblab specific host class."""
77*9c5db199SXin Li
78*9c5db199SXin Li
79*9c5db199SXin Li    def _initialize_frontend_rpcs(self, timeout_min):
80*9c5db199SXin Li        """Initialize frontends for AFE and TKO for a moblab host.
81*9c5db199SXin Li
82*9c5db199SXin Li        We tunnel all communication to the frontends through an SSH tunnel as
83*9c5db199SXin Li        many testing environments block everything except SSH access to the
84*9c5db199SXin Li        moblab DUT.
85*9c5db199SXin Li
86*9c5db199SXin Li        @param timeout_min: The timeout minuties for AFE services.
87*9c5db199SXin Li        """
88*9c5db199SXin Li        web_address = self.rpc_server_tracker.tunnel_connect(MOBLAB_PORT)
89*9c5db199SXin Li        # Pass timeout_min to self.afe
90*9c5db199SXin Li        self.afe = frontend_wrappers.RetryingAFE(timeout_min=timeout_min,
91*9c5db199SXin Li                                                 user='moblab',
92*9c5db199SXin Li                                                 server=web_address)
93*9c5db199SXin Li        # Use default timeout_min of MoblabHost for self.tko
94*9c5db199SXin Li        self.tko = frontend_wrappers.RetryingTKO(timeout_min=self.timeout_min,
95*9c5db199SXin Li                                                 user='moblab',
96*9c5db199SXin Li                                                 server=web_address)
97*9c5db199SXin Li
98*9c5db199SXin Li
99*9c5db199SXin Li    def _initialize(self, *args, **dargs):
100*9c5db199SXin Li        super(MoblabHost, self)._initialize(*args, **dargs)
101*9c5db199SXin Li        # TODO(jrbarnette):  Our superclass already initialized
102*9c5db199SXin Li        # _repair_strategy, and now we're re-initializing it here.
103*9c5db199SXin Li        # That's awkward, if not actually wrong.
104*9c5db199SXin Li        self._repair_strategy = cros_repair.create_moblab_repair_strategy()
105*9c5db199SXin Li        self.timeout_min = dargs.get('rpc_timeout_min', 1)
106*9c5db199SXin Li        self._initialize_frontend_rpcs(self.timeout_min)
107*9c5db199SXin Li
108*9c5db199SXin Li
109*9c5db199SXin Li    @staticmethod
110*9c5db199SXin Li    def check_host(host, timeout=10):
111*9c5db199SXin Li        """
112*9c5db199SXin Li        Check if the given host is an moblab host.
113*9c5db199SXin Li
114*9c5db199SXin Li        @param host: An ssh host representing a device.
115*9c5db199SXin Li        @param timeout: The timeout for the run command.
116*9c5db199SXin Li
117*9c5db199SXin Li
118*9c5db199SXin Li        @return: True if the host device has adb.
119*9c5db199SXin Li
120*9c5db199SXin Li        @raises AutoservRunError: If the command failed.
121*9c5db199SXin Li        @raises AutoservSSHTimeout: Ssh connection has timed out.
122*9c5db199SXin Li        """
123*9c5db199SXin Li        return False
124*9c5db199SXin Li
125*9c5db199SXin Li
126*9c5db199SXin Li    def install_boto_file(self, boto_path=''):
127*9c5db199SXin Li        """Install a boto file on the Moblab device.
128*9c5db199SXin Li
129*9c5db199SXin Li        @param boto_path: Path to the boto file to install. If None, sends the
130*9c5db199SXin Li                          boto file in the current HOME directory.
131*9c5db199SXin Li
132*9c5db199SXin Li        @raises error.TestError if the boto file does not exist.
133*9c5db199SXin Li        """
134*9c5db199SXin Li        if not boto_path:
135*9c5db199SXin Li            boto_path = os.path.join(os.getenv('HOME'), '.boto')
136*9c5db199SXin Li        if not os.path.exists(boto_path):
137*9c5db199SXin Li            raise error.TestError('Boto File:%s does not exist.' % boto_path)
138*9c5db199SXin Li        self.send_file(boto_path, MOBLAB_BOTO_LOCATION)
139*9c5db199SXin Li        self.run('chown moblab:moblab %s' % MOBLAB_BOTO_LOCATION)
140*9c5db199SXin Li
141*9c5db199SXin Li
142*9c5db199SXin Li    def get_autodir(self):
143*9c5db199SXin Li        """Return the directory to install autotest for client side tests."""
144*9c5db199SXin Li        return self.autodir or MOBLAB_AUTODIR
145*9c5db199SXin Li
146*9c5db199SXin Li
147*9c5db199SXin Li    def run_as_moblab(self, command, **kwargs):
148*9c5db199SXin Li        """Moblab commands should be ran as the moblab user not root.
149*9c5db199SXin Li
150*9c5db199SXin Li        @param command: Command to run as user moblab.
151*9c5db199SXin Li        """
152*9c5db199SXin Li        command = "su - moblab -c '%s'" % command
153*9c5db199SXin Li        return self.run(command, **kwargs)
154*9c5db199SXin Li
155*9c5db199SXin Li
156*9c5db199SXin Li    def wait_afe_up(self, timeout_min=5):
157*9c5db199SXin Li        """Wait till the AFE is up and loaded.
158*9c5db199SXin Li
159*9c5db199SXin Li        Attempt to reach the Moblab's AFE and database through its RPC
160*9c5db199SXin Li        interface.
161*9c5db199SXin Li
162*9c5db199SXin Li        @param timeout_min: Minutes to wait for the AFE to respond. Default is
163*9c5db199SXin Li                            5 minutes.
164*9c5db199SXin Li
165*9c5db199SXin Li        @raises urllib2.HTTPError if AFE does not respond within the timeout.
166*9c5db199SXin Li        """
167*9c5db199SXin Li        # Use moblabhost's own AFE object with a longer timeout to wait for the
168*9c5db199SXin Li        # AFE to load. Also re-create the ssh tunnel for connections to moblab.
169*9c5db199SXin Li        # Set the timeout_min to be longer than self.timeout_min for rebooting.
170*9c5db199SXin Li        self._initialize_frontend_rpcs(timeout_min)
171*9c5db199SXin Li        # Verify the AFE can handle a simple request.
172*9c5db199SXin Li        self._check_afe()
173*9c5db199SXin Li        # Reset the timeout_min after rebooting checks for afe services.
174*9c5db199SXin Li        self.afe.set_timeout(self.timeout_min)
175*9c5db199SXin Li
176*9c5db199SXin Li
177*9c5db199SXin Li    def add_dut(self, hostname):
178*9c5db199SXin Li        """Add a DUT hostname to the AFE.
179*9c5db199SXin Li
180*9c5db199SXin Li        @param hostname: DUT hostname to add.
181*9c5db199SXin Li        """
182*9c5db199SXin Li        result = self.run_as_moblab('%s host create %s' % (ATEST_PATH,
183*9c5db199SXin Li                                                           hostname))
184*9c5db199SXin Li        logging.debug('atest host create output for host %s:\n%s',
185*9c5db199SXin Li                      hostname, result.stdout)
186*9c5db199SXin Li
187*9c5db199SXin Li
188*9c5db199SXin Li    def find_and_add_duts(self):
189*9c5db199SXin Li        """Discover DUTs on the testing subnet and add them to the AFE.
190*9c5db199SXin Li
191*9c5db199SXin Li        Pings the range of IP's a DUT might be assigned by moblab, then
192*9c5db199SXin Li        parses the output to discover connected DUTs, connected means
193*9c5db199SXin Li        they have 0% dropped pings.
194*9c5db199SXin Li        If they are not already in the AFE, adds them to AFE.
195*9c5db199SXin Li        """
196*9c5db199SXin Li        existing_hosts = [host.hostname for host in self.afe.get_hosts()]
197*9c5db199SXin Li        fping_result = self.run('fping -g 192.168.231.100 192.168.231.110 '
198*9c5db199SXin Li                                '-a -c 10 -p 30 -q', ignore_status=True)
199*9c5db199SXin Li        for line in fping_result.stderr.splitlines():
200*9c5db199SXin Li            match = re.match(SUBNET_DUT_SEARCH_RE, line)
201*9c5db199SXin Li            if match:
202*9c5db199SXin Li                dut_ip = match.group('ip')
203*9c5db199SXin Li                if dut_ip in existing_hosts:
204*9c5db199SXin Li                    break
205*9c5db199SXin Li                if self._check_dut_ssh(dut_ip):
206*9c5db199SXin Li                    self.add_dut(dut_ip)
207*9c5db199SXin Li                    existing_hosts.append(dut_ip)
208*9c5db199SXin Li
209*9c5db199SXin Li    def _check_dut_ssh(self, dut_ip):
210*9c5db199SXin Li        is_sshable = False
211*9c5db199SXin Li        count = 0
212*9c5db199SXin Li        while not is_sshable and count < 10:
213*9c5db199SXin Li            cmd = ('ssh  -o ConnectTimeout=30 -o ConnectionAttempts=30'
214*9c5db199SXin Li                   ' root@%s echo Testing' % dut_ip)
215*9c5db199SXin Li            result = self.run(cmd)
216*9c5db199SXin Li            is_sshable = 'Testing' in result.stdout
217*9c5db199SXin Li            logging.info(is_sshable)
218*9c5db199SXin Li            count += 1
219*9c5db199SXin Li        return is_sshable
220*9c5db199SXin Li
221*9c5db199SXin Li    def verify_software(self):
222*9c5db199SXin Li        """Create the autodir then do standard verify."""
223*9c5db199SXin Li        # In case cleanup or powerwash wiped the autodir, create an empty
224*9c5db199SXin Li        # directory.
225*9c5db199SXin Li        # Removing this mkdir command will result in the disk size check
226*9c5db199SXin Li        # not being performed.
227*9c5db199SXin Li        self.run('mkdir -p %s' % MOBLAB_AUTODIR)
228*9c5db199SXin Li        super(MoblabHost, self).verify_software()
229*9c5db199SXin Li
230*9c5db199SXin Li
231*9c5db199SXin Li    def _verify_upstart_service(self, service, timeout_m):
232*9c5db199SXin Li        """Verify that the given moblab service is running.
233*9c5db199SXin Li
234*9c5db199SXin Li        @param service: The upstart service to check for.
235*9c5db199SXin Li        @timeout_m: Timeout (in minuts) before giving up.
236*9c5db199SXin Li        @raises TimeoutException or UpstartServiceNotRunning if service isn't
237*9c5db199SXin Li                running.
238*9c5db199SXin Li        """
239*9c5db199SXin Li        @retry.retry(error.AutoservError, timeout_min=timeout_m, delay_sec=10)
240*9c5db199SXin Li        def _verify():
241*9c5db199SXin Li            if not self.upstart_status(service):
242*9c5db199SXin Li                raise UpstartServiceNotRunning(service)
243*9c5db199SXin Li        _verify()
244*9c5db199SXin Li
245*9c5db199SXin Li    def verify_moblab_services(self, timeout_m):
246*9c5db199SXin Li        """Verify the required Moblab services are up and running.
247*9c5db199SXin Li
248*9c5db199SXin Li        @param timeout_m: Timeout (in minutes) for how long to wait for services
249*9c5db199SXin Li                to start. Actual time taken may be slightly more than this.
250*9c5db199SXin Li        @raises AutoservError if any moblab service is not running.
251*9c5db199SXin Li        """
252*9c5db199SXin Li        if not MOBLAB_SERVICES:
253*9c5db199SXin Li            return
254*9c5db199SXin Li
255*9c5db199SXin Li        service = MOBLAB_SERVICES[0]
256*9c5db199SXin Li        try:
257*9c5db199SXin Li            # First service can take a long time to start, especially on first
258*9c5db199SXin Li            # boot where container setup can take 5-10 minutes, depending on the
259*9c5db199SXin Li            # device.
260*9c5db199SXin Li            self._verify_upstart_service(service, timeout_m)
261*9c5db199SXin Li        except error.TimeoutException:
262*9c5db199SXin Li            raise UpstartServiceNotRunning(service)
263*9c5db199SXin Li
264*9c5db199SXin Li        for service in MOBLAB_SERVICES[1:]:
265*9c5db199SXin Li            try:
266*9c5db199SXin Li                # Follow up services should come up quickly.
267*9c5db199SXin Li                self._verify_upstart_service(service, 0.5)
268*9c5db199SXin Li            except error.TimeoutException:
269*9c5db199SXin Li                raise UpstartServiceNotRunning(service)
270*9c5db199SXin Li
271*9c5db199SXin Li        for process in MOBLAB_PROCESSES:
272*9c5db199SXin Li            try:
273*9c5db199SXin Li                self.run('pgrep %s' % process)
274*9c5db199SXin Li            except error.AutoservRunError:
275*9c5db199SXin Li                raise error.AutoservError('Moblab process: %s is not running.'
276*9c5db199SXin Li                                          % process)
277*9c5db199SXin Li
278*9c5db199SXin Li
279*9c5db199SXin Li    def _check_afe(self):
280*9c5db199SXin Li        """Verify whether afe of moblab works before verifying its DUTs.
281*9c5db199SXin Li
282*9c5db199SXin Li        Verifying moblab sometimes happens after a successful provision, in
283*9c5db199SXin Li        which case moblab is restarted but tunnel of afe is not re-connected.
284*9c5db199SXin Li        This func is used to check whether afe is working now.
285*9c5db199SXin Li
286*9c5db199SXin Li        @return True if afe works.
287*9c5db199SXin Li        @raises error.AutoservError if AFE is down; other exceptions are passed
288*9c5db199SXin Li                through.
289*9c5db199SXin Li        """
290*9c5db199SXin Li        try:
291*9c5db199SXin Li            self.afe.get_hosts()
292*9c5db199SXin Li        except (error.TimeoutException, timeout_util.TimeoutError) as e:
293*9c5db199SXin Li            raise error.AutoservError('Moblab AFE is not responding: %s' %
294*9c5db199SXin Li                                      str(e))
295*9c5db199SXin Li        except Exception as e:
296*9c5db199SXin Li            logging.error('Unknown exception when checking moblab AFE: %s', e)
297*9c5db199SXin Li            raise
298*9c5db199SXin Li
299*9c5db199SXin Li        return True
300*9c5db199SXin Li
301*9c5db199SXin Li
302*9c5db199SXin Li    def verify_duts(self):
303*9c5db199SXin Li        """Verify the Moblab DUTs are up and running.
304*9c5db199SXin Li
305*9c5db199SXin Li        @raises AutoservError if no DUTs are in the Ready State.
306*9c5db199SXin Li        """
307*9c5db199SXin Li        hosts = self.afe.reverify_hosts()
308*9c5db199SXin Li        logging.debug('DUTs scheduled for reverification: %s', hosts)
309*9c5db199SXin Li
310*9c5db199SXin Li
311*9c5db199SXin Li    def verify_special_tasks_complete(self):
312*9c5db199SXin Li        """Wait till the special tasks on the moblab host are complete."""
313*9c5db199SXin Li        total_time = 0
314*9c5db199SXin Li        while (self.afe.get_special_tasks(is_complete=False) and
315*9c5db199SXin Li               total_time < DUT_VERIFY_TIMEOUT):
316*9c5db199SXin Li            total_time = total_time + DUT_VERIFY_SLEEP_SECS
317*9c5db199SXin Li            time.sleep(DUT_VERIFY_SLEEP_SECS)
318*9c5db199SXin Li        if not self.afe.get_hosts(status='Ready'):
319*9c5db199SXin Li            for host in self.afe.get_hosts():
320*9c5db199SXin Li                logging.error('DUT: %s Status: %s', host, host.status)
321*9c5db199SXin Li            raise error.AutoservError('Moblab has 0 Ready DUTs')
322*9c5db199SXin Li
323*9c5db199SXin Li
324*9c5db199SXin Li    def get_platform(self):
325*9c5db199SXin Li        """Determine the correct platform label for this host.
326*9c5db199SXin Li
327*9c5db199SXin Li        For Moblab devices '_moblab' is appended.
328*9c5db199SXin Li
329*9c5db199SXin Li        @returns a string representing this host's platform.
330*9c5db199SXin Li        """
331*9c5db199SXin Li        return super(MoblabHost, self).get_platform() + '_moblab'
332*9c5db199SXin Li
333*9c5db199SXin Li
334*9c5db199SXin Li    def make_tmp_dir(self, base=MOBLAB_TMP_DIR):
335*9c5db199SXin Li        """Creates a temporary directory.
336*9c5db199SXin Li
337*9c5db199SXin Li        @param base: The directory where it should be created.
338*9c5db199SXin Li
339*9c5db199SXin Li        @return Path to a newly created temporary directory.
340*9c5db199SXin Li        """
341*9c5db199SXin Li        self.run('mkdir -p %s' % base)
342*9c5db199SXin Li        return self.run('mktemp -d -p %s' % base).stdout.strip()
343*9c5db199SXin Li
344*9c5db199SXin Li
345*9c5db199SXin Li    def get_os_type(self):
346*9c5db199SXin Li        return 'moblab'
347